Posted to commits@systemml.apache.org by du...@apache.org on 2017/04/01 01:42:34 UTC

[1/7] incubator-systemml git commit: [SYSTEMML-1452] General code cleanup of SystemML-NN

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 2e48d951b -> ac8ee2bef


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/test/conv_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/conv_simple.dml b/scripts/staging/SystemML-NN/nn/test/conv_simple.dml
index fb9d02c..efd99c3 100644
--- a/scripts/staging/SystemML-NN/nn/test/conv_simple.dml
+++ b/scripts/staging/SystemML-NN/nn/test/conv_simple.dml
@@ -24,6 +24,7 @@
  *
  * This implementation is intended to be a simple, reference version.
  */
+
 forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
                    int C, int Hin, int Win, int Hf, int Wf,
                    int strideh, int stridew, int padh, int padw)
@@ -36,9 +37,9 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
    * This implementation is intended to be a simple, reference version.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
-   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
-   *  - b: Biases vector, of shape (F, 1).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
@@ -56,8 +57,8 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
    */
   N = nrow(X)
   F = nrow(W)
-  Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1)
-  Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
 
   # Create output volume
   out = matrix(0, rows=N, cols=F*Hout*Wout)
@@ -71,14 +72,14 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
       Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
       Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
       Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
-      Xn_padded[c, ] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
     }
     # Convolve image with filters
     parfor (f in 1:F, check=0) {  # all filters
       parfor (hout in 1:Hout, check=0) {  # all output rows
-        h0 = (hout-1) * strideh + 1
+        h0 = (hout-1)*strideh + 1
         parfor (wout in 1:Wout, check=0) {  # all output columns
-          w0 = (wout-1) * stridew + 1
+          w0 = (wout-1)*stridew + 1
           # Create a patch of the input example corresponding spatially to the filter sizes
           Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
           parfor (c in 1:C, check=0) {
@@ -106,12 +107,13 @@ backward = function(matrix[double] dout, int Hout, int Wout,
    * This implementation is intended to be a simple, reference version.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of shape (N, F*Hout*Wout).
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, F*Hout*Wout).
    *  - Hout: Output height.
    *  - Wout: Output width.
-   *  - X: Previous input data matrix, of shape (N, C*Hin*Win).
-   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
-   *  - b: Biases vector, of shape (F, 1).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
@@ -123,14 +125,14 @@ backward = function(matrix[double] dout, int Hout, int Wout,
    *  - padw: Padding for left and right sides.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, C*Hin*Win).
-   *  - dW: Gradient wrt W, of shape (F, C*Hf*Wf).
-   *  - db: Gradient wrt b, of shape (F, 1).
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+   *  - db: Gradient wrt `b`, of shape (F, 1).
    */
   N = nrow(X)
   F = nrow(W)
-  Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1)
-  Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
 
   # Create gradient volumes
   dX = matrix(0, rows=N, cols=C*Hin*Win)
@@ -146,7 +148,7 @@ backward = function(matrix[double] dout, int Hout, int Wout,
       Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
       Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
       Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
-      Xn_padded[c, ] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
     }
     dXn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
     for (f in 1:F) {  # all filters
@@ -191,10 +193,11 @@ init = function(int F, int C, int Hf, int Wf)
   /*
    * Initialize the parameters of this layer.
    *
-   * We use the heuristic by He et al. [http://arxiv.org/abs/1502.01852],
-   * which limits the magnification of inputs/gradients during
-   * forward/backward passes by scaling unit-Gaussian weights by a
-   * factor of sqrt(2/n), under the assumption of relu neurons.
+   * We use the heuristic by He et al., which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * unit-Gaussian weights by a factor of sqrt(2/n), under the
+   * assumption of relu neurons.
+   *  - http://arxiv.org/abs/1502.01852
    *
    * Inputs:
    *  - F: Number of filters.
@@ -203,8 +206,8 @@ init = function(int F, int C, int Hf, int Wf)
    *  - Wf: Filter width.
    *
    * Outputs:
-   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
-   *  - b: Biases vector, of shape (F, 1).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
    */
   W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
   b = matrix(0, rows=F, cols=1)
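
For reference (not part of the commit), the output-size arithmetic and the He-scaled initialization shown above work out as follows for a concrete case. This is a minimal standalone sketch using the same DML built-ins (as.integer, rand, sqrt) as the code above; the specific sizes are illustrative only.

  Hin = 5
  Win = 5
  Hf = 3
  Wf = 3
  strideh = 1
  stridew = 1
  padh = 1
  padw = 1
  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)  # (5 + 2 - 3)/1 + 1 = 5
  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)  # (5 + 2 - 3)/1 + 1 = 5
  print("Hout=" + Hout + ", Wout=" + Wout)  # 3x3 filter, stride 1, pad 1 => same-size output

  # He et al. initialization as in init(): unit-Gaussian weights scaled by sqrt(2/n),
  # where n = C*Hf*Wf is the fan-in of each filter.
  F = 2
  C = 3
  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
  b = matrix(0, rows=F, cols=1)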

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/test/grad_check.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/grad_check.dml b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
index 6b90d56..adc1c9a 100644
--- a/scripts/staging/SystemML-NN/nn/test/grad_check.dml
+++ b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
@@ -117,7 +117,7 @@ affine = function() {
       outph = affine::forward(X, W, b)
       lossph = l2_loss::forward(outph, y)
       X[i,j] = old  # reset
-      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
@@ -136,7 +136,7 @@ affine = function() {
       outph = affine::forward(X, W, b)
       lossph = l2_loss::forward(outph, y)
       W[i,j] = old  # reset
-      dW_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
@@ -155,7 +155,7 @@ affine = function() {
       outph = affine::forward(X, W, b)
       lossph = l2_loss::forward(outph, y)
       b[i,j] = old  # reset
-      db_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
@@ -214,7 +214,7 @@ batch_norm = function() {
             batch_norm::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
         lossph = l2_loss::forward(outph, y)
         X[i,j] = old  # reset
-        dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
         rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
@@ -235,7 +235,7 @@ batch_norm = function() {
             batch_norm::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
         lossph = l2_loss::forward(outph, y)
         gamma[i,j] = old  # reset
-        dgamma_num = (lossph - lossmh) / (2 * h) # numerical derivative
+        dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
         rel_error = check_rel_error(as.scalar(dgamma[i,j]), dgamma_num, lossph, lossmh)
@@ -256,7 +256,7 @@ batch_norm = function() {
             batch_norm::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
         lossph = l2_loss::forward(outph, y)
         beta[i,j] = old  # reset
-        dbeta_num = (lossph - lossmh) / (2 * h) # numerical derivative
+        dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
         rel_error = check_rel_error(as.scalar(dbeta[i,j]), dbeta_num, lossph, lossmh)
@@ -307,7 +307,7 @@ conv = function() {
       [outph, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
       lossph = l2_loss::forward(outph, y)
       X[i,j] = old  # reset
-      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
@@ -326,7 +326,7 @@ conv = function() {
       [outph, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
       lossph = l2_loss::forward(outph, y)
       W[i,j] = old  # reset
-      dW_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
@@ -345,7 +345,7 @@ conv = function() {
       [outph, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
       lossph = l2_loss::forward(outph, y)
       b[i,j] = old  # reset
-      db_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
@@ -355,7 +355,8 @@ conv = function() {
 
 conv_builtin = function() {
   /*
-   * Gradient check for the convolutional layer using built-in functions.
+   * Gradient check for the convolutional layer using built-in
+   * functions.
    */
   print("Grad checking the built-in convolutional layer with L2 loss.")
 
@@ -397,7 +398,7 @@ conv_builtin = function() {
                                                   pad, pad)
       lossph = l2_loss::forward(outph, y)
       X[i,j] = old  # reset
-      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
@@ -418,7 +419,7 @@ conv_builtin = function() {
                                                   pad, pad)
       lossph = l2_loss::forward(outph, y)
       W[i,j] = old  # reset
-      dW_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
@@ -439,7 +440,7 @@ conv_builtin = function() {
                                                   pad, pad)
       lossph = l2_loss::forward(outph, y)
       b[i,j] = old  # reset
-      db_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
@@ -491,7 +492,7 @@ conv_simple = function() {
                                                  pad, pad)
       lossph = l2_loss::forward(outph, y)
       X[i,j] = old  # reset
-      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
@@ -512,7 +513,7 @@ conv_simple = function() {
                                                  pad, pad)
       lossph = l2_loss::forward(outph, y)
       W[i,j] = old  # reset
-      dW_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
@@ -533,7 +534,7 @@ conv_simple = function() {
                                                  pad, pad)
       lossph = l2_loss::forward(outph, y)
       b[i,j] = old  # reset
-      db_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
@@ -569,7 +570,7 @@ cross_entropy_loss = function() {
       pred[i,j] = old + h
       lossph = cross_entropy_loss::forward(pred, y)
       pred[i,j] = old  # reset W[i,j]
-      dpred_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
@@ -609,7 +610,7 @@ dropout = function() {
       [outph, mask] = dropout::forward(X, p, seed)
       lossph = l2_loss::forward(outph, y)
       X[i,j] = old  # reset
-      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
@@ -643,7 +644,7 @@ l1_loss = function() {
       pred[i,j] = old + h
       lossph = l1_loss::forward(pred, y)
       pred[i,j] = old  # reset W[i,j]
-      dpred_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
@@ -677,7 +678,7 @@ l1_reg = function() {
       W[i,j] = old + h
       reg_lossph = l1_reg::forward(W, lambda)
       W[i,j] = old  # reset W[i,j]
-      dW_num = (reg_lossph - reg_lossmh) / (2 * h) # numerical derivative
+      dW_num = (reg_lossph-reg_lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, reg_lossph, reg_lossmh)
@@ -711,7 +712,7 @@ l2_loss = function() {
       pred[i,j] = old + h
       lossph = l2_loss::forward(pred, y)
       pred[i,j] = old  # reset W[i,j]
-      dpred_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
@@ -745,7 +746,7 @@ l2_reg = function() {
       W[i,j] = old + h
       reg_lossph = l2_reg::forward(W, lambda)
       W[i,j] = old  # reset W[i,j]
-      dW_num = (reg_lossph - reg_lossmh) / (2 * h) # numerical derivative
+      dW_num = (reg_lossph-reg_lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, reg_lossph, reg_lossmh)
@@ -779,7 +780,7 @@ log_loss = function() {
       pred[i,j] = old + h
       lossph = log_loss::forward(pred, y)
       pred[i,j] = old  # reset W[i,j]
-      dpred_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
@@ -831,7 +832,7 @@ lstm = function() {
       loss_cph = l2_loss::forward(cph, yc)
       lossph = loss_outph + loss_cph
       X[i,j] = old  # reset
-      dX_num = (lossph - lossmh) / (2 * h)  # numerical derivative
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
@@ -854,7 +855,7 @@ lstm = function() {
       loss_cph = l2_loss::forward(cph, yc)
       lossph = loss_outph + loss_cph
       W[i,j] = old  # reset
-      dW_num = (lossph - lossmh) / (2 * h)  # numerical derivative
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
@@ -877,7 +878,7 @@ lstm = function() {
       loss_cph = l2_loss::forward(cph, yc)
       lossph = loss_outph + loss_cph
       b[i,j] = old  # reset
-      db_num = (lossph - lossmh) / (2 * h)  # numerical derivative
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
@@ -900,7 +901,7 @@ lstm = function() {
       loss_cph = l2_loss::forward(cph, yc)
       lossph = loss_outph + loss_cph
       out0[i,j] = old  # reset
-      dout0_num = (lossph - lossmh) / (2 * h)  # numerical derivative
+      dout0_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
@@ -923,7 +924,7 @@ lstm = function() {
       loss_cph = l2_loss::forward(cph, yc)
       lossph = loss_outph + loss_cph
       c0[i,j] = old  # reset
-      dc0_num = (lossph - lossmh) / (2 * h)  # numerical derivative
+      dc0_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh)
@@ -949,8 +950,8 @@ max_pool = function() {
 
   for (pad in 0:1) {
     print(" - Grad checking w/ pad="+pad+".")
-    Hout = as.integer((Hin + 2 * pad - Hf) / stride + 1)
-    Wout = as.integer((Win + 2 * pad - Wf) / stride + 1)
+    Hout = as.integer((Hin + 2*pad - Hf)/stride + 1)
+    Wout = as.integer((Win + 2*pad - Wf)/stride + 1)
     y = rand(rows=N, cols=C*Hout*Wout)
 
     # Compute analytical gradients of loss wrt parameters
@@ -971,7 +972,7 @@ max_pool = function() {
         [outph, Hout, Wout] = max_pool::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
         lossph = l2_loss::forward(outph, y)
         X[i,j] = old  # reset
-        dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
         rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
@@ -1023,7 +1024,7 @@ max_pool_builtin = function() {
                                                         pad, pad)
         lossph = l2_loss::forward(outph, y)
         X[i,j] = old  # reset
-        dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
         rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
@@ -1050,8 +1051,8 @@ max_pool_simple = function() {
 
   for (pad in 0:1) {
     print(" - Grad checking w/ pad="+pad+".")
-    Hout = as.integer((Hin + 2 * pad - Hf) / stride + 1)
-    Wout = as.integer((Win + 2 * pad - Wf) / stride + 1)
+    Hout = as.integer((Hin + 2*pad - Hf)/stride + 1)
+    Wout = as.integer((Win + 2*pad - Wf)/stride + 1)
     y = rand(rows=N, cols=C*Hout*Wout)
 
     # Compute analytical gradients of loss wrt parameters
@@ -1075,7 +1076,7 @@ max_pool_simple = function() {
                                                        pad, pad)
         lossph = l2_loss::forward(outph, y)
         X[i,j] = old  # reset
-        dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
         rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
@@ -1121,7 +1122,7 @@ relu = function() {
       outph = relu::forward(X)
       lossph = l2_loss::forward(outph, y)
       X[i,j] = old  # reset
-      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
@@ -1165,7 +1166,7 @@ rnn = function() {
       [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
       lossph = l2_loss::forward(outph, y)
       X[i,j] = old  # reset
-      dX_num = (lossph - lossmh) / (2 * h)  # numerical derivative
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
@@ -1184,7 +1185,7 @@ rnn = function() {
       [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
       lossph = l2_loss::forward(outph, y)
       W[i,j] = old  # reset
-      dW_num = (lossph - lossmh) / (2 * h)  # numerical derivative
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
@@ -1203,7 +1204,7 @@ rnn = function() {
       [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
       lossph = l2_loss::forward(outph, y)
       b[i,j] = old  # reset
-      db_num = (lossph - lossmh) / (2 * h)  # numerical derivative
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
@@ -1222,7 +1223,7 @@ rnn = function() {
       [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
       lossph = l2_loss::forward(outph, y)
       out0[i,j] = old  # reset
-      dout0_num = (lossph - lossmh) / (2 * h)  # numerical derivative
+      dout0_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
@@ -1260,7 +1261,7 @@ sigmoid = function() {
       outph = sigmoid::forward(X)
       lossph = l2_loss::forward(outph, y)
       X[i,j] = old  # reset
-      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
@@ -1299,7 +1300,7 @@ softmax = function() {
       outph = softmax::forward(X)
       lossph = l2_loss::forward(outph, y)
       X[i,j] = old  # reset
-      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
@@ -1364,7 +1365,7 @@ spatial_batch_norm = function() {
                                         ema_mean, ema_var, mu, eps)
         lossph = l2_loss::forward(outph, y)
         X[i,j] = old  # reset
-        dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
         rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
@@ -1387,7 +1388,7 @@ spatial_batch_norm = function() {
                                         ema_mean, ema_var, mu, eps)
         lossph = l2_loss::forward(outph, y)
         gamma[i,j] = old  # reset
-        dgamma_num = (lossph - lossmh) / (2 * h) # numerical derivative
+        dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
         rel_error = check_rel_error(as.scalar(dgamma[i,j]), dgamma_num, lossph, lossmh)
@@ -1410,7 +1411,7 @@ spatial_batch_norm = function() {
                                         ema_mean, ema_var, mu, eps)
         lossph = l2_loss::forward(outph, y)
         beta[i,j] = old  # reset
-        dbeta_num = (lossph - lossmh) / (2 * h) # numerical derivative
+        dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
         rel_error = check_rel_error(as.scalar(dbeta[i,j]), dbeta_num, lossph, lossmh)
@@ -1421,7 +1422,8 @@ spatial_batch_norm = function() {
 
 tanh = function() {
   /*
-   * Gradient check for the hyperbolic tangent (tanh) nonlinearity layer.
+   * Gradient check for the hyperbolic tangent (tanh) nonlinearity
+   * layer.
    */
   print("Grad checking the tanh nonlinearity layer with L2 loss.")
 
@@ -1449,7 +1451,7 @@ tanh = function() {
       outph = tanh::forward(X)
       lossph = l2_loss::forward(outph, y)
       X[i,j] = old  # reset
-      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
@@ -1517,7 +1519,7 @@ two_layer_affine_l2_net = function() {
       X[i,j] = old_x + h
       [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
       X[i,j] = old_x  # reset X[i,j]
-      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
@@ -1534,7 +1536,7 @@ two_layer_affine_l2_net = function() {
       W1[i,j] = old_w + h
       [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
       W1[i,j] = old_w  # reset W[i,j]
-      dWij_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dWij_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dW1[i,j]), dWij_num, lossph, lossmh)
@@ -1551,7 +1553,7 @@ two_layer_affine_l2_net = function() {
       W2[i,j] = old_w + h
       [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
       W2[i,j] = old_w  # reset W[i,j]
-      dWij_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dWij_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(dW2[i,j]), dWij_num, lossph, lossmh)
@@ -1568,7 +1570,7 @@ two_layer_affine_l2_net = function() {
       b1[i,j] = old_b + h
       [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
       b1[i,j] = old_b  # reset b[1,j]
-      dbij_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dbij_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(db1[i,j]), dbij_num, lossph, lossmh)
@@ -1585,7 +1587,7 @@ two_layer_affine_l2_net = function() {
       b2[i,j] = old_b + h
       [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
       b2[i,j] = old_b  # reset b[1,j]
-      dbij_num = (lossph - lossmh) / (2 * h) # numerical derivative
+      dbij_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
       rel_error = check_rel_error(as.scalar(db2[i,j]), dbij_num, lossph, lossmh)
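
For reference (not part of the commit), every check above follows the same central-difference pattern: perturb a single entry by +/- h, re-evaluate the loss, and compare the numerical derivative (lossph-lossmh) / (2*h) against the analytical gradient. Below is a minimal standalone sketch that uses a trivial loss, loss(X) = sum(X^2)/2, whose analytical gradient is simply X, so no layer modules need to be sourced.

  N = 3
  D = 4
  X = rand(rows=N, cols=D)
  dX = X  # analytical gradient of sum(X^2)/2 wrt X

  h = 1e-5
  i = 2
  j = 3
  old = as.scalar(X[i,j])
  X[i,j] = old - h
  lossmh = sum(X^2)/2
  X[i,j] = old + h
  lossph = sum(X^2)/2
  X[i,j] = old  # reset
  dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

  dX_ana = as.scalar(dX[i,j])
  rel_error = abs(dX_ana-dX_num) / max(1e-8, abs(dX_ana)+abs(dX_num))
  print("rel_error=" + rel_error)  # should be far below the 1e-2 error threshold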

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/test/max_pool_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/max_pool_simple.dml b/scripts/staging/SystemML-NN/nn/test/max_pool_simple.dml
index 4394ffd..786b0a1 100644
--- a/scripts/staging/SystemML-NN/nn/test/max_pool_simple.dml
+++ b/scripts/staging/SystemML-NN/nn/test/max_pool_simple.dml
@@ -24,6 +24,7 @@
  *
  * This implementation is intended to be a simple, reference version.
  */
+
 forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
                    int strideh, int stridew, int padh, int padw)
     return (matrix[double] out, int Hout, int Wout) {
@@ -35,7 +36,7 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
    * This implementation is intended to be a simple, reference version.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
@@ -54,8 +55,8 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
    *  - Wout: Output width.
    */
   N = nrow(X)
-  Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1)
-  Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
 
   # Create output volume
   out = matrix(0, rows=N, cols=C*Hout*Wout)
@@ -99,10 +100,11 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
    * unrolled into a single vector.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of shape (N, C*Hout*Wout).
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, C*Hout*Wout).
    *  - Hout: Output height.
    *  - Wout: Output width.
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
@@ -116,7 +118,7 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
    *      A typical value is 0.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, C*Hin*Win).
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
    */
   N = nrow(X)
 
@@ -134,7 +136,7 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
       Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
       Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
       Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
-      Xn_padded[c, ] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
     }
     img = Xn_padded
 
@@ -162,7 +164,7 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
     parfor (c in 1:C, check=0) {
       dXn_padded_slice = matrix(dimg[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
       dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
-      dXn[c, ] = matrix(dXn_slice, rows=1, cols=Hin*Win)
+      dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
     }
     dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
   }
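
For reference (not part of the commit), the `Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice` pattern used in both conv_simple.dml and max_pool_simple.dml zero-pads a depth slice by left-indexing the original Hin x Win image into the interior of a larger zero matrix. A minimal single-channel sketch with illustrative sizes:

  Hin = 3
  Win = 3
  padh = 1
  padw = 1
  Xn_slice = matrix("1 2 3 4 5 6 7 8 9", rows=Hin, cols=Win)
  Xn_padded_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
  Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
  print(toString(Xn_padded_slice))  # 5x5 matrix with the 3x3 image centered in a border of zeros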

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/util.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/util.dml b/scripts/staging/SystemML-NN/nn/util.dml
index dd0ac19..6b86225 100644
--- a/scripts/staging/SystemML-NN/nn/util.dml
+++ b/scripts/staging/SystemML-NN/nn/util.dml
@@ -22,14 +22,15 @@
 /*
  * Utility functions.
  */
+
 all_equal = function(matrix[double] X1, matrix[double] X2)
     return(boolean equivalent) {
   /*
    * Determine if two matrices are equivalent.
    *
    * Inputs:
-   *  - X1: Input matrix, of shape (any, any).
-   *  - X2: Input matrix, of same shape as X1.
+   *  - X1: Inputs, of shape (any, any).
+   *  - X2: Inputs, of same shape as X1.
    *
    * Outputs:
    *  - equivalent: Whether or not the two matrices are equivalent.
@@ -42,12 +43,12 @@ check_all_equal = function(matrix[double] X1, matrix[double] X2)
   /*
    * Check if two matrices are equivalent, and report any issues.
    *
-   *  - Issues an "ERROR" statement if elements of the two matrices
-   *  are not equal.
+   * Issues an "ERROR" statement if elements of the two matrices are
+   * not equal.
    *
    * Inputs:
-   *  - X1: Input matrix, of shape (any, any).
-   *  - X2: Input matrix, of same shape as X1.
+   *  - X1: Inputs, of shape (any, any).
+   *  - X2: Inputs, of same shape as X1.
    *
    * Outputs:
    *  - equivalent: Whether or not the two matrices are equivalent.
@@ -61,7 +62,8 @@ check_all_equal = function(matrix[double] X1, matrix[double] X2)
   }
 }
 
-compute_rel_error = function(double x1, double x2) return (double rel_error) {
+compute_rel_error = function(double x1, double x2)
+    return (double rel_error) {
   /*
    * Relative error measure between two values.
    *
@@ -74,7 +76,7 @@ compute_rel_error = function(double x1, double x2) return (double rel_error) {
    * Outputs:
    *  - rel_error: Relative error measure between the two values.
    */
-  rel_error = abs(x1 - x2) / max(1e-8, abs(x1) + abs(x2))
+  rel_error = abs(x1-x2) / max(1e-8, abs(x1)+abs(x2))
 }
 
 check_rel_error = function(double x1, double x2, double thresh_error, double thresh_warn)
@@ -83,10 +85,12 @@ check_rel_error = function(double x1, double x2, double thresh_error, double thr
    * Check and report any issues with the relative error measure between
    * two values.
    *
-   *  - Issues an "ERROR" statement for relative errors > thresh_error,
-   *  indicating that the implementation is likely incorrect.
-   *  - Issues a "WARNING" statement for relative errors < thresh_error
-   *  but > thresh_warn, indicating that the implementation may be incorrect.
+   * Issues an "ERROR" statement for relative errors > thresh_error,
+   * indicating that the implementation is likely incorrect.
+   *
+   * Issues a "WARNING" statement for relative errors < thresh_error
+   * but > thresh_warn, indicating that the implementation may be
+   * incorrect.
    *
    * Inputs:
    *  - x1: First value.
@@ -117,7 +121,7 @@ channel_sums = function(matrix[double] X, int C, int Hin, int Win)
    * Computes a channel-wise summation over a 4D input.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
@@ -152,16 +156,16 @@ im2col = function(matrix[double] img, int Hin, int Win, int Hf, int Wf, int stri
    *      out into columns, of shape (C*Hf*Wf, Hout*Wout).
    */
   C = nrow(img)
-  Hout = as.integer((Hin - Hf) / strideh + 1)
-  Wout = as.integer((Win - Wf) / stridew + 1)
+  Hout = as.integer((Hin-Hf)/strideh + 1)
+  Wout = as.integer((Win-Wf)/stridew + 1)
 
   # Note: We start with `img_cols` transposed to allow for row-major
   # left-indexing inside the loop, which is more performant.
   img_cols = matrix(0, rows=Hout*Wout, cols=C*Hf*Wf)  # zeros
   parfor (hout in 1:Hout, check=0) {  # all output rows
-    hin = (hout-1) * strideh + 1
+    hin = (hout-1)*strideh + 1
     parfor (wout in 1:Wout, check=0) {  # all output columns
-      win = (wout-1) * stridew + 1
+      win = (wout-1)*stridew + 1
       # Extract a local patch of the input image corresponding spatially to the filter sizes.
       img_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
       parfor (c in 1:C) {  # all channels
@@ -207,14 +211,14 @@ col2im = function(matrix[double] img_cols, int C, int Hin, int Win, int Hf, int
    * Outputs:
    *  - img: Input image, of shape (C, Hin*Win).
    */
-  Hout = as.integer((Hin - Hf) / strideh + 1)
-  Wout = as.integer((Win - Wf) / stridew + 1)
+  Hout = as.integer((Hin-Hf)/strideh + 1)
+  Wout = as.integer((Win-Wf)/stridew + 1)
 
   img = matrix(0, rows=C, cols=Hin*Win)  # zeros
   for (hout in 1:Hout) {  # all output rows
-    hin = (hout-1) * strideh + 1
+    hin = (hout-1)*strideh + 1
     for (wout in 1:Wout) {  # all output columns
-      win = (wout-1) * stridew + 1
+      win = (wout-1)*stridew + 1
       # Extract a local patch of the input image corresponding spatially to the filter sizes.
       img_patch = matrix(img_cols[,(hout-1)*Wout + wout], rows=C, cols=Hf*Wf)  # zeros
       parfor (c in 1:C) {  # all channels
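
For reference (not part of the commit), compute_rel_error above uses a symmetric relative error with a 1e-8 floor on the denominator so that comparing two values that are both at or near zero does not divide by zero. A quick standalone illustration:

  x1 = 1.0001
  x2 = 1.0000
  rel_error = abs(x1-x2) / max(1e-8, abs(x1)+abs(x2))
  print("rel_error=" + rel_error)  # ~5e-5, well under the 1e-2 error threshold used in the checks

  x1 = 0.0
  x2 = 0.0
  rel_error = abs(x1-x2) / max(1e-8, abs(x1)+abs(x2))
  print("rel_error=" + rel_error)  # 0, denominator floored at 1e-8 instead of dividing 0 by 0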


[4/7] incubator-systemml git commit: [SYSTEMML-1413] Extract test-only utilities from `nn/util.dml`

Posted by du...@apache.org.
[SYSTEMML-1413] Extract test-only utilities from `nn/util.dml`

This commit extracts utility functions only used for testing from
`nn/util.dml` to a new `nn/test/util.dml`.

Closes #447.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/5c59e03b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/5c59e03b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/5c59e03b

Branch: refs/heads/master
Commit: 5c59e03b4caca3a519ec871475d2081bff16fd3a
Parents: 7744924
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Fri Mar 31 18:39:04 2017 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Fri Mar 31 18:39:04 2017 -0700

----------------------------------------------------------------------
 .../staging/SystemML-NN/nn/test/grad_check.dml  | 138 +++++++----------
 scripts/staging/SystemML-NN/nn/test/test.dml    |  47 +++---
 scripts/staging/SystemML-NN/nn/test/util.dml    | 155 +++++++++++++++++++
 scripts/staging/SystemML-NN/nn/util.dml         |  92 -----------
 4 files changed, 232 insertions(+), 200 deletions(-)
----------------------------------------------------------------------
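
For reference (not part of the commit), the caller-side effect of this refactor is visible in the diffs below: test scripts now source the extracted helpers from the new module, and the gradient-check helper is renamed. A hypothetical minimal caller sketch:

  source("nn/test/util.dml") as test_util

  # gradient checks use the renamed helper:
  #   rel_error = test_util::check_rel_grad_error(dw_analytical, dw_numerical, lossph, lossmh)
  # the other test-only helpers move over unchanged:
  #   equivalent = test_util::all_equal(X1, X2)
  #   tmp = test_util::check_all_equal(out, target)
  #   rel_error = test_util::check_rel_error(x1, x2, 1e-3, 1e-4)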


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5c59e03b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/grad_check.dml b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
index adc1c9a..ba9a317 100644
--- a/scripts/staging/SystemML-NN/nn/test/grad_check.dml
+++ b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
@@ -44,45 +44,7 @@ source("nn/layers/spatial_batch_norm.dml") as spatial_batch_norm
 source("nn/layers/tanh.dml") as tanh
 source("nn/test/conv_simple.dml") as conv_simple
 source("nn/test/max_pool_simple.dml") as max_pool_simple
-source("nn/util.dml") as util
-
-check_rel_error = function(double dw_a, double dw_n, double lossph, double lossmh)
-    return (double rel_error) {
-  /*
-   * Check and report any issues with the relative error measure between
-   * the analytical and numerical partial derivatives.
-   *
-   *  - Issues an "ERROR" statement for relative errors > 1e-2,
-   *  indicating that the gradient is likely incorrect.
-   *  - Issues a "WARNING" statement for relative errors < 1e-2
-   *  but > 1e-4, indicating that the may be incorrect.
-   *
-   * Inputs:
-   *  - dw_a: Analytical partial derivative wrt w.
-   *  - dw_n: Numerical partial derivative wrt w.
-   *  - lossph: Loss evaluated with w set to w+h.
-   *  - lossmh: Loss evaluated with w set to w-h.
-   *
-   * Outputs:
-   *  - rel_error: Relative error measure between the two derivatives.
-   */
-  # Compute relative error
-  rel_error = util::compute_rel_error(dw_a, dw_n)
-
-  # Evaluate relative error
-  thresh_error = 1e-2
-  thresh_warn = 1e-4
-  if (rel_error > thresh_error) {
-    print("ERROR: Relative error " + rel_error + " > " + thresh_error + " with " + dw_a +
-          " analytical vs " + dw_n + " numerical, with lossph " + lossph +
-          " and lossmh " + lossmh)
-  }
-  else if (rel_error > thresh_warn & rel_error <= thresh_error) {
-    print("WARNING: Relative error " + rel_error + " > " + thresh_warn + " & <= " + thresh_error +
-          " with " + dw_a + " analytical vs " + dw_n + " numerical, with lossph " + lossph +
-          " and lossmh " + lossmh)
-  }
-}
+source("nn/test/util.dml") as test_util
 
 affine = function() {
   /*
@@ -120,7 +82,7 @@ affine = function() {
       dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
     }
   }
 
@@ -139,7 +101,7 @@ affine = function() {
       dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
     }
   }
 
@@ -158,7 +120,7 @@ affine = function() {
       db_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
     }
   }
 }
@@ -217,7 +179,7 @@ batch_norm = function() {
         dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
-        rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
       }
     }
 
@@ -238,7 +200,8 @@ batch_norm = function() {
         dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
-        rel_error = check_rel_error(as.scalar(dgamma[i,j]), dgamma_num, lossph, lossmh)
+        rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+                                                    lossph, lossmh)
       }
     }
 
@@ -259,7 +222,8 @@ batch_norm = function() {
         dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
-        rel_error = check_rel_error(as.scalar(dbeta[i,j]), dbeta_num, lossph, lossmh)
+        rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+                                                    lossph, lossmh)
       }
     }
   }
@@ -310,7 +274,7 @@ conv = function() {
       dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
     }
   }
 
@@ -329,7 +293,7 @@ conv = function() {
       dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
     }
   }
 
@@ -348,7 +312,7 @@ conv = function() {
       db_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
     }
   }
 }
@@ -401,7 +365,7 @@ conv_builtin = function() {
       dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
     }
   }
 
@@ -422,7 +386,7 @@ conv_builtin = function() {
       dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
     }
   }
 
@@ -443,7 +407,7 @@ conv_builtin = function() {
       db_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
     }
   }
 }
@@ -495,7 +459,7 @@ conv_simple = function() {
       dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
     }
   }
 
@@ -516,7 +480,7 @@ conv_simple = function() {
       dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
     }
   }
 
@@ -537,7 +501,7 @@ conv_simple = function() {
       db_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
     }
   }
 }
@@ -573,7 +537,7 @@ cross_entropy_loss = function() {
       dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
     }
   }
 }
@@ -613,7 +577,7 @@ dropout = function() {
       dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
     }
   }
 }
@@ -647,7 +611,7 @@ l1_loss = function() {
       dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
     }
   }
 }
@@ -681,7 +645,8 @@ l1_reg = function() {
       dW_num = (reg_lossph-reg_lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, reg_lossph, reg_lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
+                                                  reg_lossph, reg_lossmh)
     }
   }
 }
@@ -715,7 +680,7 @@ l2_loss = function() {
       dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
     }
   }
 }
@@ -749,7 +714,8 @@ l2_reg = function() {
       dW_num = (reg_lossph-reg_lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, reg_lossph, reg_lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
+                                                  reg_lossph, reg_lossmh)
     }
   }
 }
@@ -783,7 +749,7 @@ log_loss = function() {
       dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
     }
   }
 }
@@ -835,7 +801,7 @@ lstm = function() {
       dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
     }
   }
 
@@ -858,7 +824,7 @@ lstm = function() {
       dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
     }
   }
 
@@ -881,7 +847,7 @@ lstm = function() {
       db_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
     }
   }
 
@@ -904,7 +870,7 @@ lstm = function() {
       dout0_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
     }
   }
 
@@ -927,7 +893,7 @@ lstm = function() {
       dc0_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh)
     }
   }
 }
@@ -975,7 +941,7 @@ max_pool = function() {
         dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
-        rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
       }
     }
   }
@@ -1027,7 +993,7 @@ max_pool_builtin = function() {
         dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
-        rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
       }
     }
   }
@@ -1079,7 +1045,7 @@ max_pool_simple = function() {
         dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
-        rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
       }
     }
   }
@@ -1125,7 +1091,7 @@ relu = function() {
       dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
     }
   }
 }
@@ -1169,7 +1135,7 @@ rnn = function() {
       dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
     }
   }
 
@@ -1188,7 +1154,7 @@ rnn = function() {
       dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
     }
   }
 
@@ -1207,7 +1173,7 @@ rnn = function() {
       db_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
     }
   }
 
@@ -1226,7 +1192,7 @@ rnn = function() {
       dout0_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
     }
   }
 }
@@ -1264,7 +1230,7 @@ sigmoid = function() {
       dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
     }
   }
 }
@@ -1303,7 +1269,7 @@ softmax = function() {
       dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
     }
   }
 }
@@ -1368,7 +1334,7 @@ spatial_batch_norm = function() {
         dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
-        rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
       }
     }
 
@@ -1391,7 +1357,8 @@ spatial_batch_norm = function() {
         dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
-        rel_error = check_rel_error(as.scalar(dgamma[i,j]), dgamma_num, lossph, lossmh)
+        rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+                                                    lossph, lossmh)
       }
     }
 
@@ -1414,7 +1381,8 @@ spatial_batch_norm = function() {
         dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
         # Check error
-        rel_error = check_rel_error(as.scalar(dbeta[i,j]), dbeta_num, lossph, lossmh)
+        rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+                                                    lossph, lossmh)
       }
     }
   }
@@ -1454,7 +1422,7 @@ tanh = function() {
       dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
     }
   }
 }
@@ -1522,7 +1490,7 @@ two_layer_affine_l2_net = function() {
       dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
     }
   }
 
@@ -1539,7 +1507,7 @@ two_layer_affine_l2_net = function() {
       dWij_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dW1[i,j]), dWij_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW1[i,j]), dWij_num, lossph, lossmh)
     }
   }
 
@@ -1556,7 +1524,7 @@ two_layer_affine_l2_net = function() {
       dWij_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(dW2[i,j]), dWij_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW2[i,j]), dWij_num, lossph, lossmh)
     }
   }
 
@@ -1573,7 +1541,7 @@ two_layer_affine_l2_net = function() {
       dbij_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(db1[i,j]), dbij_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(db1[i,j]), dbij_num, lossph, lossmh)
     }
   }
 
@@ -1590,7 +1558,7 @@ two_layer_affine_l2_net = function() {
       dbij_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
       # Check error
-      rel_error = check_rel_error(as.scalar(db2[i,j]), dbij_num, lossph, lossmh)
+      rel_error = test_util::check_rel_grad_error(as.scalar(db2[i,j]), dbij_num, lossph, lossmh)
     }
   }
 }
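
For reference (not part of the commit), the local check_rel_error helper removed at the top of this diff is relocated (per the commit message) to nn/test/util.dml, where it is called as test_util::check_rel_grad_error. Its thresholded reporting, as shown in the removed body above, reduces to the following pattern (standalone sketch with an illustrative rel_error value):

  rel_error = 3e-3  # example value between the warning (1e-4) and error (1e-2) thresholds
  thresh_error = 1e-2
  thresh_warn = 1e-4
  if (rel_error > thresh_error) {
    print("ERROR: Relative error " + rel_error + " > " + thresh_error)
  }
  else if (rel_error > thresh_warn & rel_error <= thresh_error) {
    print("WARNING: Relative error " + rel_error + " > " + thresh_warn + " & <= " + thresh_error)
  }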

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5c59e03b/scripts/staging/SystemML-NN/nn/test/test.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/test.dml b/scripts/staging/SystemML-NN/nn/test/test.dml
index b25fae2..8fb0d04 100644
--- a/scripts/staging/SystemML-NN/nn/test/test.dml
+++ b/scripts/staging/SystemML-NN/nn/test/test.dml
@@ -32,6 +32,7 @@ source("nn/layers/spatial_batch_norm.dml") as spatial_batch_norm
 source("nn/layers/tanh.dml") as tanh
 source("nn/test/conv_simple.dml") as conv_simple
 source("nn/test/max_pool_simple.dml") as max_pool_simple
+source("nn/test/util.dml") as test_util
 source("nn/util.dml") as util
 
 batch_norm = function() {
@@ -62,7 +63,7 @@ batch_norm = function() {
                     1.34160733  1.34160721  1.34160733  1.34160733", rows=1, cols=N*D)
   out = matrix(out, rows=1, cols=N*D)
   for (i in 1:length(out)) {
-    rel_error = util::check_rel_error(as.scalar(out[1,i]),
+    rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
                                       as.scalar(target[1,i]), 1e-3, 1e-4)
   }
 }
@@ -100,9 +101,9 @@ conv = function() {
   out_simple = matrix(out_simple, rows=1, cols=N*F*Hout*Wout)
   out_builtin = matrix(out_builtin, rows=1, cols=N*F*Hout*Wout)
   for (i in 1:length(out)) {
-    rel_error = util::check_rel_error(as.scalar(out[1,i]),
+    rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
                                       as.scalar(out_simple[1,i]), 1e-10, 1e-12)
-    rel_error = util::check_rel_error(as.scalar(out[1,i]),
+    rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
                                       as.scalar(out_builtin[1,i]), 1e-10, 1e-12)
   }
 }
@@ -160,7 +161,7 @@ im2col = function() {
   x_pad2 = util::col2im(x_cols, C, Hin+2*pad, Win+2*pad, Hf, Wf, stride, stride, "none")
 
   # Equivalency check
-  equivalent = util::all_equal(x_pad, x_pad2)
+  equivalent = test_util::all_equal(x_pad, x_pad2)
   if (!equivalent) {
     print("ERROR: im2col and then col2im does not yield the original image.")
   }
@@ -199,7 +200,7 @@ padding = function() {
   x1 = util::unpad_image(x_pad, Hin, Win, pad, pad)
 
   # Equivalency check
-  equivalent = util::all_equal(x, x1)
+  equivalent = test_util::all_equal(x, x1)
   if (!equivalent) {
     print("ERROR: Padding and then unpadding does not yield the original image.")
   }
@@ -238,9 +239,9 @@ max_pool = function() {
       out_simple = matrix(out_simple, rows=1, cols=N*C*Hout*Wout)
       out_builtin = matrix(out_builtin, rows=1, cols=N*C*Hout*Wout)
       for (i in 1:length(out)) {
-        rel_error = util::check_rel_error(as.scalar(out[1,i]),
+        rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
                                           as.scalar(out_simple[1,i]), 1e-10, 1e-12)
-        rel_error = util::check_rel_error(as.scalar(out[1,i]),
+        rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
                                           as.scalar(out_builtin[1,i]), 1e-10, 1e-12)
       }
 
@@ -257,9 +258,9 @@ max_pool = function() {
       dX_simple = matrix(dX_simple, rows=1, cols=N*C*Hin*Win)
       dX_builtin = matrix(dX_builtin, rows=1, cols=N*C*Hin*Win)
       for (i in 1:length(dX)) {
-        rel_error = util::check_rel_error(as.scalar(dX[1,i]),
+        rel_error = test_util::check_rel_error(as.scalar(dX[1,i]),
                                           as.scalar(dX_simple[1,i]), 1e-10, 1e-12)
-        rel_error = util::check_rel_error(as.scalar(dX[1,i]),
+        rel_error = test_util::check_rel_error(as.scalar(dX[1,i]),
                                           as.scalar(dX_builtin[1,i]), 1e-10, 1e-12)
       }
     }
@@ -302,9 +303,9 @@ max_pool = function() {
   #  8  16
   target = matrix("6 8 14 16 6 14 8 16", rows=1, cols=C*Hout*Wout)
   target = rbind(target, target)  # n=2
-  tmp = util::check_all_equal(out, target)
-  tmp = util::check_all_equal(out_simple, target)
-  tmp = util::check_all_equal(out_builtin, target)
+  tmp = test_util::check_all_equal(out, target)
+  tmp = test_util::check_all_equal(out_simple, target)
+  tmp = test_util::check_all_equal(out_builtin, target)
 
   print(" - Testing for correct behavior against known answer w/ pad=1.")
   # generate data
@@ -342,9 +343,9 @@ max_pool = function() {
   #  4 12 16
   target = matrix("1 3 4 9 11 12 13 15 16 1 9 13 3 11 15 4 12 16", rows=1, cols=C*Hout*Wout)
   target = rbind(target, target)  # n=2
-  tmp = util::check_all_equal(out, target)
-  tmp = util::check_all_equal(out_simple, target)
-  tmp = util::check_all_equal(out_builtin, target)
+  tmp = test_util::check_all_equal(out, target)
+  tmp = test_util::check_all_equal(out_simple, target)
+  tmp = test_util::check_all_equal(out_builtin, target)
 
   print(" - Testing for correct behavior against known answer w/ all negative matrix w/ pad=0.")
   # generate data
@@ -377,9 +378,9 @@ max_pool = function() {
   #  -3 -11
   target = matrix("-1 -3 -9 -11 -1 -9 -3 -11", rows=1, cols=C*Hout*Wout)
   target = rbind(target, target)  # n=2
-  tmp = util::check_all_equal(out, target)
-  tmp = util::check_all_equal(out_simple, target)
-  tmp = util::check_all_equal(out_builtin, target)
+  tmp = test_util::check_all_equal(out, target)
+  tmp = test_util::check_all_equal(out_simple, target)
+  tmp = test_util::check_all_equal(out_builtin, target)
 
 
   print(" - Testing for correct behavior against known answer w/ all negative matrix w/ pad=1.")
@@ -418,9 +419,9 @@ max_pool = function() {
   #  0  0  0
   target = matrix("-1 -2 -4 -5 -6 -8 -13 -14 -16 -1 -5 -13 -2 -6 -14 -4 -8 -16", rows=1, cols=C*Hout*Wout)
   target = rbind(target, target)  # n=2
-  tmp = util::check_all_equal(out, target)
-  tmp = util::check_all_equal(out_simple, target)
-  tmp = util::check_all_equal(out_builtin, target)
+  tmp = test_util::check_all_equal(out, target)
+  tmp = test_util::check_all_equal(out_simple, target)
+  tmp = test_util::check_all_equal(out_builtin, target)
 }
 
 spatial_batch_norm = function() {
@@ -509,7 +510,7 @@ spatial_batch_norm = function() {
                                                                                 cols=N*C*Hin*Win)
   out = matrix(out, rows=1, cols=N*C*Hin*Win)
   for (i in 1:length(out)) {
-    rel_error = util::check_rel_error(as.scalar(out[1,i]),
+    rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
                                       as.scalar(target[1,i]), 1e-3, 1e-4)
   }
 }
@@ -531,7 +532,7 @@ tanh = function() {
   # Equivalency check
   for (i in 1:nrow(out)) {
     for (j in 1:ncol(out)) {
-      rel_error = util::check_rel_error(as.scalar(out[i,j]), as.scalar(out_ref[i,j]), 1e-10, 1e-12)
+      rel_error = test_util::check_rel_error(as.scalar(out[i,j]), as.scalar(out_ref[i,j]), 1e-10, 1e-12)
     }
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5c59e03b/scripts/staging/SystemML-NN/nn/test/util.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/util.dml b/scripts/staging/SystemML-NN/nn/test/util.dml
new file mode 100644
index 0000000..128e4db
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/test/util.dml
@@ -0,0 +1,155 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Test utility functions.
+ */
+
+all_equal = function(matrix[double] X1, matrix[double] X2)
+    return(boolean equivalent) {
+  /*
+   * Determine if two matrices are equivalent.
+   *
+   * Inputs:
+   *  - X1: Inputs, of shape (any, any).
+   *  - X2: Inputs, of same shape as X1.
+   *
+   * Outputs:
+   *  - equivalent: Whether or not the two matrices are equivalent.
+   */
+  equivalent = as.logical(prod(X1 == X2))
+}
+
+check_all_equal = function(matrix[double] X1, matrix[double] X2)
+    return(boolean equivalent) {
+  /*
+   * Check if two matrices are equivalent, and report any issues.
+   *
+   * Issues an "ERROR" statement if elements of the two matrices are
+   * not equal.
+   *
+   * Inputs:
+   *  - X1: Inputs, of shape (any, any).
+   *  - X2: Inputs, of same shape as X1.
+   *
+   * Outputs:
+   *  - equivalent: Whether or not the two matrices are equivalent.
+   */
+  # Determine if matrices are equivalent
+  equivalent = all_equal(X1, X2)
+
+  # Evaluate relative error
+  if (!equivalent) {
+    print("ERROR: The two matrices are not equivalent.")
+  }
+}
+
+compute_rel_error = function(double x1, double x2)
+    return (double rel_error) {
+  /*
+   * Relative error measure between two values.
+   *
+   * Uses smoothing to avoid divide-by-zero errors.
+   *
+   * Inputs:
+   *  - x1: First value.
+   *  - x2: Second value.
+   *
+   * Outputs:
+   *  - rel_error: Relative error measure between the two values.
+   */
+  rel_error = abs(x1-x2) / max(1e-8, abs(x1)+abs(x2))
+}
+
+check_rel_error = function(double x1, double x2, double thresh_error, double thresh_warn)
+    return (double rel_error) {
+  /*
+   * Check and report any issues with the relative error measure between
+   * two values.
+   *
+   * Issues an "ERROR" statement for relative errors > thresh_error,
+   * indicating that the implementation is likely incorrect.
+   *
+   * Issues a "WARNING" statement for relative errors < thresh_error
+   * but > thresh_warn, indicating that the implementation may be
+   * incorrect.
+   *
+   * Inputs:
+   *  - x1: First value.
+   *  - x2: Second value.
+   *  - thresh_error: Error threshold.
+   *  - thresh_warn: Warning threshold.
+   *
+   * Outputs:
+   *  - rel_error: Relative error measure between the two values.
+   */
+  # Compute relative error
+  rel_error = compute_rel_error(x1, x2)
+
+  # Evaluate relative error
+  if (rel_error > thresh_error) {
+    print("ERROR: Relative error " + rel_error + " > " + thresh_error + " with " + x1 +
+          " vs " + x2 + ".")
+  }
+  else if (rel_error > thresh_warn & rel_error <= thresh_error) {
+    print("WARNING: Relative error " + rel_error + " > " + thresh_warn + " & <= " + thresh_error +
+          " with " + x1 + " vs " + x2 + ".")
+  }
+}
+
+check_rel_grad_error = function(double dw_a, double dw_n, double lossph, double lossmh)
+    return (double rel_error) {
+  /*
+   * Check and report any issues with the relative error measure between
+   * the analytical and numerical partial derivatives.
+   *
+   *  - Issues an "ERROR" statement for relative errors > 1e-2,
+   *  indicating that the gradient is likely incorrect.
+   *  - Issues a "WARNING" statement for relative errors < 1e-2
+   *  but > 1e-4, indicating that the gradient may be incorrect.
+   *
+   * Inputs:
+   *  - dw_a: Analytical partial derivative wrt w.
+   *  - dw_n: Numerical partial derivative wrt w.
+   *  - lossph: Loss evaluated with w set to w+h.
+   *  - lossmh: Loss evaluated with w set to w-h.
+   *
+   * Outputs:
+   *  - rel_error: Relative error measure between the two derivatives.
+   */
+  # Compute relative error
+  rel_error = compute_rel_error(dw_a, dw_n)
+
+  # Evaluate relative error
+  thresh_error = 1e-2
+  thresh_warn = 1e-4
+  if (rel_error > thresh_error) {
+    print("ERROR: Relative error " + rel_error + " > " + thresh_error + " with " + dw_a +
+          " analytical vs " + dw_n + " numerical, with lossph " + lossph +
+          " and lossmh " + lossmh)
+  }
+  else if (rel_error > thresh_warn & rel_error <= thresh_error) {
+    print("WARNING: Relative error " + rel_error + " > " + thresh_warn + " & <= " + thresh_error +
+          " with " + dw_a + " analytical vs " + dw_n + " numerical, with lossph " + lossph +
+          " and lossmh " + lossmh)
+  }
+}
+
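
For orientation, a few lines showing how these new helpers are meant to be called from a test script; the matrices and tolerances below are invented for illustration:

  source("nn/test/util.dml") as test_util

  A = matrix("1 2 3 4", rows=2, cols=2)
  B = matrix("1 2 3 4", rows=2, cols=2)

  # Exact equivalence check; prints an ERROR if any cell differs
  equivalent = test_util::check_all_equal(A, B)

  # Smoothed relative error between two scalars, with error/warning thresholds
  rel_error = test_util::check_rel_error(1.0001, 1.0, 1e-3, 1e-4)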

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5c59e03b/scripts/staging/SystemML-NN/nn/util.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/util.dml b/scripts/staging/SystemML-NN/nn/util.dml
index 6b86225..405d208 100644
--- a/scripts/staging/SystemML-NN/nn/util.dml
+++ b/scripts/staging/SystemML-NN/nn/util.dml
@@ -23,98 +23,6 @@
  * Utility functions.
  */
 
-all_equal = function(matrix[double] X1, matrix[double] X2)
-    return(boolean equivalent) {
-  /*
-   * Determine if two matrices are equivalent.
-   *
-   * Inputs:
-   *  - X1: Inputs, of shape (any, any).
-   *  - X2: Inputs, of same shape as X1.
-   *
-   * Outputs:
-   *  - equivalent: Whether or not the two matrices are equivalent.
-   */
-  equivalent = as.logical(prod(X1 == X2))
-}
-
-check_all_equal = function(matrix[double] X1, matrix[double] X2)
-    return(boolean equivalent) {
-  /*
-   * Check if two matrices are equivalent, and report any issues.
-   *
-   * Issues an "ERROR" statement if elements of the two matrices are
-   * not equal.
-   *
-   * Inputs:
-   *  - X1: Inputs, of shape (any, any).
-   *  - X2: Inputs, of same shape as X1.
-   *
-   * Outputs:
-   *  - equivalent: Whether or not the two matrices are equivalent.
-   */
-  # Determine if matrices are equivalent
-  equivalent = all_equal(X1, X2)
-
-  # Evaluate relative error
-  if (!equivalent) {
-    print("ERROR: The two matrices are not equivalent.")
-  }
-}
-
-compute_rel_error = function(double x1, double x2)
-    return (double rel_error) {
-  /*
-   * Relative error measure between two values.
-   *
-   * Uses smoothing to avoid divide-by-zero errors.
-   *
-   * Inputs:
-   *  - x1: First value.
-   *  - x2: Second value.
-   *
-   * Outputs:
-   *  - rel_error: Relative error measure between the two values.
-   */
-  rel_error = abs(x1-x2) / max(1e-8, abs(x1)+abs(x2))
-}
-
-check_rel_error = function(double x1, double x2, double thresh_error, double thresh_warn)
-    return (double rel_error) {
-  /*
-   * Check and report any issues with the relative error measure between
-   * two values.
-   *
-   * Issues an "ERROR" statement for relative errors > thresh_error,
-   * indicating that the implementation is likely incorrect.
-   *
-   * Issues a "WARNING" statement for relative errors < thresh_error
-   * but > thresh_warn, indicating that the implementation may be
-   * incorrect.
-   *
-   * Inputs:
-   *  - x1: First value.
-   *  - x2: Second value.
-   *  - thresh_error: Error threshold.
-   *  - thresh_warn: Warning threshold.
-   *
-   * Outputs:
-   *  - rel_error: Relative error measure between the two values.
-   */
-  # Compute relative error
-  rel_error = compute_rel_error(x1, x2)
-
-  # Evaluate relative error
-  if (rel_error > thresh_error) {
-    print("ERROR: Relative error " + rel_error + " > " + thresh_error + " with " + x1 +
-          " vs " + x2 + ".")
-  }
-  else if (rel_error > thresh_warn & rel_error <= thresh_error) {
-    print("WARNING: Relative error " + rel_error + " > " + thresh_warn + " & <= " + thresh_error +
-          " with " + x1 + " vs " + x2 + ".")
-  }
-}
-
 channel_sums = function(matrix[double] X, int C, int Hin, int Win)
     return (matrix[double] out) {
   /*


[2/7] incubator-systemml git commit: [SYSTEMML-1452] General code cleanup of SystemML-NN

Posted by du...@apache.org.
[SYSTEMML-1452] General code cleanup of SystemML-NN

This commit performs a general code & documentation cleanup across the
library.

Closes #447.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/16b1cbd7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/16b1cbd7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/16b1cbd7

Branch: refs/heads/master
Commit: 16b1cbd72601afbed0b19c1d4125a898fd324b1c
Parents: 2e48d95
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Fri Mar 31 18:38:15 2017 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Fri Mar 31 18:38:16 2017 -0700

----------------------------------------------------------------------
 projects/breast_cancer/hyperparam_tuning.dml    |   8 +-
 projects/breast_cancer/softmax_clf.dml          |  16 +--
 .../staging/SystemML-NN/nn/layers/affine.dml    |  36 ++++---
 .../SystemML-NN/nn/layers/batch_norm.dml        |  17 +--
 scripts/staging/SystemML-NN/nn/layers/conv.dml  |  50 ++++-----
 .../SystemML-NN/nn/layers/conv_builtin.dml      |  63 ++++++-----
 .../nn/layers/cross_entropy_loss.dml            |  29 +++--
 .../staging/SystemML-NN/nn/layers/dropout.dml   |  23 ++--
 .../staging/SystemML-NN/nn/layers/l1_loss.dml   |  29 +++--
 .../staging/SystemML-NN/nn/layers/l1_reg.dml    |  15 +--
 .../staging/SystemML-NN/nn/layers/l2_loss.dml   |  29 +++--
 .../staging/SystemML-NN/nn/layers/l2_reg.dml    |  15 +--
 .../staging/SystemML-NN/nn/layers/log_loss.dml  |  40 ++++---
 scripts/staging/SystemML-NN/nn/layers/lstm.dml  |  65 ++++++------
 .../staging/SystemML-NN/nn/layers/max_pool.dml  |  15 +--
 .../SystemML-NN/nn/layers/max_pool_builtin.dml  |  14 +--
 scripts/staging/SystemML-NN/nn/layers/relu.dml  |  22 ++--
 scripts/staging/SystemML-NN/nn/layers/rnn.dml   |  43 ++++----
 .../staging/SystemML-NN/nn/layers/sigmoid.dml   |  30 ++++--
 .../staging/SystemML-NN/nn/layers/softmax.dml   |  29 ++---
 .../nn/layers/spatial_batch_norm.dml            |  12 +--
 scripts/staging/SystemML-NN/nn/layers/tanh.dml  |  28 ++---
 .../staging/SystemML-NN/nn/optim/adagrad.dml    |  22 ++--
 scripts/staging/SystemML-NN/nn/optim/adam.dml   |  38 +++----
 .../staging/SystemML-NN/nn/optim/rmsprop.dml    |  24 +++--
 scripts/staging/SystemML-NN/nn/optim/sgd.dml    |  12 ++-
 .../SystemML-NN/nn/optim/sgd_momentum.dml       |  24 +++--
 .../SystemML-NN/nn/optim/sgd_nesterov.dml       |  23 ++--
 .../staging/SystemML-NN/nn/test/conv_simple.dml |  51 ++++-----
 .../staging/SystemML-NN/nn/test/grad_check.dml  | 106 ++++++++++---------
 .../SystemML-NN/nn/test/max_pool_simple.dml     |  18 ++--
 scripts/staging/SystemML-NN/nn/util.dml         |  46 ++++----
 32 files changed, 549 insertions(+), 443 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/projects/breast_cancer/hyperparam_tuning.dml
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/hyperparam_tuning.dml b/projects/breast_cancer/hyperparam_tuning.dml
index 464c659..4f054c3 100644
--- a/projects/breast_cancer/hyperparam_tuning.dml
+++ b/projects/breast_cancer/hyperparam_tuning.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -66,7 +66,9 @@ parfor(j in 1:10000) {
   log_interval = 10
 
   # Train
-  [Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2] = clf::train(X, Y, X_val, Y_val, C, Hin, Win, lr, mu, decay, lambda, batch_size, epochs, log_interval, dir)
+  [Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2] =
+      clf::train(X, Y, X_val, Y_val, C, Hin, Win, lr, mu, decay, lambda, batch_size, epochs,
+                 log_interval, dir)
 
   # Eval
   #probs = clf::predict(X, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/projects/breast_cancer/softmax_clf.dml
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/softmax_clf.dml b/projects/breast_cancer/softmax_clf.dml
index e106a36..35fd545 100644
--- a/projects/breast_cancer/softmax_clf.dml
+++ b/projects/breast_cancer/softmax_clf.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -79,7 +79,7 @@ train = function(matrix[double] X, matrix[double] Y,
   accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
   # Output results
   print("Start: Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
-  
+
   # Optimize
   print("Starting optimization")
   iters = ceil(N / batch_size)
@@ -152,7 +152,7 @@ predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
    */
   N = nrow(X)  # num examples
   K = ncol(W)  # num classes
-  
+
   # Compute forward pass
   ## affine & softmax:
   out = affine::forward(X, W, b)
@@ -185,7 +185,7 @@ eval = function(matrix[double] probs, matrix[double] Y)
 generate_dummy_data = function()
     return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) {
   /*
-   * Generate a dummy dataset similar to the MNIST dataset.
+   * Generate a dummy dataset similar to the breast cancer dataset.
    *
    * Outputs:
    *  - X: Input data matrix, of shape (N, D).
@@ -196,9 +196,9 @@ generate_dummy_data = function()
    */
   # Generate dummy input data
   N = 1024  # num examples
-  C = 1  # num input channels
-  Hin = 28  # input height
-  Win = 28  # input width
+  C = 3  # num input channels
+  Hin = 256  # input height
+  Win = 256  # input width
   T = 10  # num targets
   X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
   classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/affine.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/affine.dml b/scripts/staging/SystemML-NN/nn/layers/affine.dml
index 6a4c210..f9f8559 100644
--- a/scripts/staging/SystemML-NN/nn/layers/affine.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/affine.dml
@@ -22,6 +22,7 @@
 /*
  * Fully-connected (affine) layer.
  */
+
 forward = function(matrix[double] X, matrix[double] W, matrix[double] b)
     return (matrix[double] out) {
   /*
@@ -29,9 +30,9 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b)
    * M neurons.  The input data has N examples, each with D features.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, D).
-   *  - W: Weights (parameters) matrix, of shape (D, M).
-   *  - b: Biases vector, of shape (1, M).
+   *  - X: Inputs, of shape (N, D).
+   *  - W: Weights, of shape (D, M).
+   *  - b: Biases, of shape (1, M).
    *
    * Outputs:
    *  - out: Outputs, of shape (N, M).
@@ -47,15 +48,15 @@ backward = function(matrix[double] dout, matrix[double] X,
    * with M neurons.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of shape (N, M).
-   *  - X: Previous input data matrix, of shape (N, D).
-   *  - W: Weights (parameters) matrix, of shape (D, M).
-   *  - b: Biases vector, of shape (1, M).
+   *  - dout: Gradient wrt `out` from upstream, of shape (N, M).
+   *  - X: Inputs, of shape (N, D).
+   *  - W: Weights, of shape (D, M).
+   *  - b: Biases, of shape (1, M).
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, D).
-   *  - dW: Gradient wrt W, of shape (D, M).
-   *  - db: Gradient wrt b, of shape (1, M).
+   *  - dX: Gradient wrt `X`, of shape (N, D).
+   *  - dW: Gradient wrt `W`, of shape (D, M).
+   *  - db: Gradient wrt `b`, of shape (1, M).
    */
   dX = dout %*% t(W)
   dW = t(X) %*% dout
@@ -70,18 +71,19 @@ init = function(int D, int M)
    * Note: This is just a convenience function, and parameters
    * may be initialized manually if needed.
    *
-   * We use the heuristic by He et al. [http://arxiv.org/abs/1502.01852],
-   * which limits the magnification of inputs/gradients during
-   * forward/backward passes by scaling unit-Gaussian weights by a
-   * factor of sqrt(2/n), under the assumption of relu neurons.
+   * We use the heuristic by He et al., which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * unit-Gaussian weights by a factor of sqrt(2/n), under the
+   * assumption of relu neurons.
+   *  - http://arxiv.org/abs/1502.01852
    *
    * Inputs:
-   *  - D: Dimensionality of the input features.
+   *  - D: Dimensionality of the input features (number of features).
    *  - M: Number of neurons in this layer.
    *
    * Outputs:
-   *  - W: Weight matrix, of shape (D, M).
-   *  - b: Biases vector, of shape (1, M).
+   *  - W: Weights, of shape (D, M).
+   *  - b: Biases, of shape (1, M).
    */
   W = rand(rows=D, cols=M, pdf="normal") * sqrt(2.0/D)
   b = matrix(0, rows=1, cols=M)
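
A quick sketch of the He initialization described above, as it would be used from a script; the layer sizes are arbitrary and only meant to show the sqrt(2/D) scaling and the forward call:

  source("nn/layers/affine.dml") as affine

  N = 4    # examples
  D = 16   # input features
  M = 8    # neurons
  X = rand(rows=N, cols=D, pdf="normal")

  [W, b] = affine::init(D, M)     # W ~ Normal(0, 1) * sqrt(2/D), b = zeros
  out = affine::forward(X, W, b)  # out has shape (N, M)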

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
index d332e8c..82240f7 100644
--- a/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
@@ -22,6 +22,7 @@
 /*
  * Batch normalization layer.
  */
+
 forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
                    string mode, matrix[double] ema_mean, matrix[double] ema_var,
                    double mu, double epsilon)
@@ -36,7 +37,7 @@ forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
    * introduces learnable parameters (gamma, beta) to control the
    * amount of normalization.
    *
-   *    y = ((x-mean) / sqrt(var+eps)) * gamma + beta
+   *   `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
    *
    * This implementation maintains exponential moving averages of the
    * mean and variance during training for use during testing.
@@ -47,7 +48,7 @@ forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
    *    - https://arxiv.org/abs/1502.03167
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, D).
+   *  - X: Inputs, of shape (N, D).
    *  - gamma: Scale parameters, of shape (1, D).
    *  - beta: Shift parameters, of shape (1, D).
    *  - mode: 'train' or 'test' to indicate if the model is currently
@@ -118,7 +119,7 @@ backward = function(matrix[double] dout, matrix[double] out,
    * Computes the backward pass for a batch normalization layer.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of shape (N, D).
+   *  - dout: Gradient wrt `out` from upstream, of shape (N, D).
    *  - out: Outputs from the forward pass, of shape (N, D).
    *  - ema_mean_upd: Updated exponential moving average of the mean
    *      from the forward pass, of shape (1, D).
@@ -133,7 +134,7 @@ backward = function(matrix[double] dout, matrix[double] out,
    *  - cache_norm: Cache of the normalized inputs from the forward
    *      pass, of shape (N, D).  Note: This is used for performance
    *      during training.
-   *  - X: Input data matrix to the forward pass, of shape (N, D).
+   *  - X: Inputs, of shape (N, D).
    *  - gamma: Scale parameters, of shape (1, D).
    *  - beta: Shift parameters, of shape (1, D).
    *  - mode: 'train' or 'test' to indicate if the model is currently
@@ -151,9 +152,9 @@ backward = function(matrix[double] dout, matrix[double] out,
    *      Typical values are in the range of [1e-5, 1e-3].
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, D).
-   *  - dgamma: Gradient wrt W, of shape (1, D).
-   *  - dbeta: Gradient wrt b, of shape (1, D).
+   *  - dX: Gradient wrt `X`, of shape (N, D).
+   *  - dgamma: Gradient wrt `gamma`, of shape (1, D).
+   *  - dbeta: Gradient wrt `beta`, of shape (1, D).
    *
    */
   N = nrow(X)
@@ -190,7 +191,7 @@ init = function(int D)
    * may be initialized manually if needed.
    *
    * Inputs:
-   *  - D: Dimensionality of the input features.
+   *  - D: Dimensionality of the input features (number of features).
    *
    * Outputs:
    *  - gamma: Scale parameters, of shape (1, D).
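
To make the `y = ((x-mean) / sqrt(var+eps)) * gamma + beta` equation above concrete, a rough training-mode sketch that computes the column statistics directly; it ignores the exponential moving averages, and the sizes are illustrative:

  N = 4
  D = 3
  eps = 1e-5
  X = rand(rows=N, cols=D, pdf="normal")
  gamma = matrix(1, rows=1, cols=D)  # scale
  beta = matrix(0, rows=1, cols=D)   # shift

  mean = colMeans(X)                 # shape (1, D)
  var = colSums((X-mean)^2) / N      # biased variance, shape (1, D)
  norm = (X-mean) / sqrt(var+eps)    # normalized inputs, broadcast over rows
  y = norm*gamma + beta              # shape (N, D)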

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/conv.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv.dml b/scripts/staging/SystemML-NN/nn/layers/conv.dml
index cc60a46..435b3cf 100644
--- a/scripts/staging/SystemML-NN/nn/layers/conv.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/conv.dml
@@ -39,9 +39,9 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
    * output maps.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
-   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
-   *  - b: Biases vector, of shape (F, 1).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
@@ -50,14 +50,14 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
    *  - strideh: Stride over height.
    *  - stridew: Stride over width.
    *  - padh: Padding for top and bottom sides.
-   *      For same output height as input, set padh = (Hf - 1) / 2,
-   *      assuming strideh = 1.
-   *      More generally, padh = (Hin*(strideh-1) + Hf - strideh) / 2
+   *      For same output height as input, set `padh = (Hf - 1) / 2`,
+   *      assuming `strideh = 1`.
+   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
    *      preserves the spatial dimensions of the input.
    *  - padw: Padding for left and right sides.
-   *      For same output width as input, set padw = (Wf - 1) / 2,
-   *      assuming stridew = 1.
-   *      More generally, padw = (Win*(stridew-1) + Wf - stridew) / 2
+   *      For same output width as input, set `padw = (Wf - 1) / 2`,
+   *      assuming `stridew = 1`.
+   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
    *      preserves the spatial dimensions of the input.
    *
    * Outputs:
@@ -67,8 +67,8 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
    */
   N = nrow(X)
   F = nrow(W)
-  Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1)
-  Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
 
   # Create output volume
   out = matrix(0, rows=N, cols=F*Hout*Wout)
@@ -101,12 +101,13 @@ backward = function(matrix[double] dout, int Hout, int Wout,
    * This implementation uses `im2col` and `col2im` internally.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of shape (N, F*Hout*Wout).
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, F*Hout*Wout).
    *  - Hout: Output height.
    *  - Wout: Output width.
-   *  - X: Previous input data matrix, of shape (N, C*Hin*Win).
-   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
-   *  - b: Biases vector, of shape (F, 1).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
@@ -118,9 +119,9 @@ backward = function(matrix[double] dout, int Hout, int Wout,
    *  - padw: Padding for left and right sides.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, C*Hin*Win).
-   *  - dW: Gradient wrt W, of shape (F, C*Hf*Wf).
-   *  - db: Gradient wrt b, of shape (F, 1).
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+   *  - db: Gradient wrt `b`, of shape (F, 1).
    */
   N = nrow(X)
   F = nrow(W)
@@ -171,10 +172,11 @@ init = function(int F, int C, int Hf, int Wf)
    * Note: This is just a convenience function, and parameters
    * may be initialized manually if needed.
    *
-   * We use the heuristic by He et al. [http://arxiv.org/abs/1502.01852],
-   * which limits the magnification of inputs/gradients during
-   * forward/backward passes by scaling unit-Gaussian weights by a
-   * factor of sqrt(2/n), under the assumption of relu neurons.
+   * We use the heuristic by He et al., which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * unit-Gaussian weights by a factor of sqrt(2/n), under the
+   * assumption of relu neurons.
+   *  - http://arxiv.org/abs/1502.01852
    *
    * Inputs:
    *  - F: Number of filters.
@@ -183,8 +185,8 @@ init = function(int F, int C, int Hf, int Wf)
    *  - Wf: Filter width.
    *
    * Outputs:
-   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
-   *  - b: Biases vector, of shape (F, 1).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
    */
   W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
   b = matrix(0, rows=F, cols=1)
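
As a quick sanity check on the output-size and 'same'-padding formulas in the docs above, with illustrative sizes:

  Hin = 32      # input height
  Hf = 3        # filter height
  strideh = 1

  padh = (Hf-1) / 2                                   # = 1, 'same' padding for stride 1
  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)  # = 32, height preserved
  print("padh = " + padh + ", Hout = " + Hout)

  strideh = 2
  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)  # = 16 with stride 2
  print("Hout with stride 2 = " + Hout)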

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
index 44df74a..c2b809e 100644
--- a/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
@@ -22,6 +22,7 @@
 /*
  * 2D Convolutional layer.
  */
+
 forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
                    int C, int Hin, int Win, int Hf, int Wf,
                    int strideh, int stridew, int padh, int padw)
@@ -32,10 +33,10 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
    * volume unrolled into a single vector.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
-   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
-   *  - b: Biases vector, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of depth).
    *  - Hin: Input height.
    *  - Win: Input width.
    *  - Hf: Filter height.
@@ -43,14 +44,14 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
    *  - strideh: Stride over height.
    *  - stridew: Stride over width.
    *  - padh: Padding for top and bottom sides.
-   *      For same output height as input, set padh = (Hf - 1) / 2,
-   *      assuming strideh = 1.
-   *      More generally, padh = (Hin*(strideh-1) + Hf - strideh) / 2
+   *      For same output height as input, set `padh = (Hf - 1) / 2`,
+   *      assuming `strideh = 1`.
+   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
    *      preserves the spatial dimensions of the input.
    *  - padw: Padding for left and right sides.
-   *      For same output width as input, set padw = (Wf - 1) / 2,
-   *      assuming stridew = 1.
-   *      More generally, padw = (Win*(stridew-1) + Wf - stridew) / 2
+   *      For same output width as input, set `padw = (Wf - 1) / 2`,
+   *      assuming `stridew = 1`.
+   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
    *      preserves the spatial dimensions of the input.
    *
    * Outputs:
@@ -60,8 +61,8 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
    */
   N = nrow(X)
   F = nrow(W)
-  Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1)
-  Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
 
   # Convolution - built-in implementation
   out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
@@ -81,13 +82,14 @@ backward = function(matrix[double] dout, int Hout, int Wout,
    * with F filters.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of shape (N, F*Hout*Wout).
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, F*Hout*Wout).
    *  - Hout: Output height.
    *  - Wout: Output width.
-   *  - X: Previous input data matrix, of shape (N, C*Hin*Win).
-   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
-   *  - b: Biases vector, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of depth).
    *  - Hin: Input height.
    *  - Win: Input width.
    *  - Hf: Filter height.
@@ -95,12 +97,20 @@ backward = function(matrix[double] dout, int Hout, int Wout,
    *  - strideh: Stride over height.
    *  - stridew: Stride over width.
    *  - padh: Padding for top and bottom sides.
+   *      For same output height as input, set `padh = (Hf - 1) / 2`,
+   *      assuming `strideh = 1`.
+   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+   *      preserves the spatial dimensions of the input.
    *  - padw: Padding for left and right sides.
+   *      For same output width as input, set `padw = (Wf - 1) / 2`,
+   *      assuming `stridew = 1`.
+   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+   *      preserves the spatial dimensions of the input.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, C*Hin*Win).
-   *  - dW: Gradient wrt W, of shape (F, C*Hf*Wf).
-   *  - db: Gradient wrt b, of shape (F, 1).
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+   *  - db: Gradient wrt `b`, of shape (F, 1).
    */
   N = nrow(X)
   F = nrow(W)
@@ -123,10 +133,11 @@ init = function(int F, int C, int Hf, int Wf)
    * Note: This is just a convenience function, and parameters
    * may be initialized manually if needed.
    *
-   * We use the heuristic by He et al. [http://arxiv.org/abs/1502.01852],
-   * which limits the magnification of inputs/gradients during
-   * forward/backward passes by scaling unit-Gaussian weights by a
-   * factor of sqrt(2/n), under the assumption of relu neurons.
+   * We use the heuristic by He et al., which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * unit-Gaussian weights by a factor of sqrt(2/n), under the
+   * assumption of relu neurons.
+   *  - http://arxiv.org/abs/1502.01852
    *
    * Inputs:
    *  - F: Number of filters.
@@ -135,8 +146,8 @@ init = function(int F, int C, int Hf, int Wf)
    *  - Wf: Filter width.
    *
    * Outputs:
-   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
-   *  - b: Biases vector, of shape (F, 1).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
    */
   W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
   b = matrix(0, rows=F, cols=1)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml b/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
index f9cd507..55552e1 100644
--- a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
@@ -21,11 +21,8 @@
 
 /*
  * Cross-entropy loss function.
- *
- * L_i = -y_i^T * log(pred_i), where y_i and pred_i are K-dimensional
- *  vectors of class probs.
- * L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples.
  */
+
 forward = function(matrix[double] pred, matrix[double] y)
     return (double loss) {
   /*
@@ -33,16 +30,26 @@ forward = function(matrix[double] pred, matrix[double] y)
    * inputs consist of N examples, each with K dimensions corresponding
    * to normalized probabilities of K classes.
    *
+   *   ```
+   *   L_i = -y_i^T * log(pred_i)
+   *   L = (1/N) sum(L_i) for i=1 to N
+   *   ```
+   *
+   * In these equations, `L` is the total loss, `L_i` is the loss for
+   * example `i`, `y_i` is the K-dimensional vector of target class
+   * probabilities, `pred_i` is a K-dimensional vector of predicted
+   * class probabilities, and `N` is the number of examples.
+   *
    * This can be interpreted as the negative log-likelihood assuming
    * a Bernoulli distribution generalized to K dimensions, or a
-   * Multinomial with 1 observation.
+   * Multinomial with one observation.
    *
    * Inputs:
-   *  - pred: Prediction matrix, of shape (N, K).
-   *  - y: Target matrix, of shape (N, K).
+   *  - pred: Predictions, of shape (N, K).
+   *  - y: Targets, of shape (N, K).
    *
    * Outputs:
-   *  - loss: Scalar loss, of shape (1).
+   *  - loss: Average loss.
    */
   N = nrow(y)
   eps = 1e-10  # numerical stability to avoid log(0)
@@ -58,11 +65,11 @@ backward = function(matrix[double] pred, matrix[double] y)
    * to normalized probabilities of K classes.
    *
    * Inputs:
-   *  - pred: Prediction matrix, of shape (N, K).
-   *  - y: Target matrix, of shape (N, K).
+   *  - pred: Predictions, of shape (N, K).
+   *  - y: Targets, of shape (N, K).
    *
    * Outputs:
-   *  - dpred: Gradient wrt pred, of shape (N, K).
+   *  - dpred: Gradient wrt `pred`, of shape (N, K).
    */
   N = nrow(y)
   eps = 1e-10  # numerical stability to avoid divide-by-zero
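
A worked instance of the equations above on a tiny, invented two-class batch:

  source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss

  # N=2 examples, K=2 classes; each row of pred is a normalized distribution
  pred = matrix("0.9 0.1 0.2 0.8", rows=2, cols=2)
  y = matrix("1 0 0 1", rows=2, cols=2)

  # L = (1/2) * (-log(0.9) - log(0.8)) ~= 0.164
  loss = cross_entropy_loss::forward(pred, y)
  dpred = cross_entropy_loss::backward(pred, y)
  print("cross-entropy loss: " + loss)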

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/dropout.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/dropout.dml b/scripts/staging/SystemML-NN/nn/layers/dropout.dml
index 2b1bd1d..b348642 100644
--- a/scripts/staging/SystemML-NN/nn/layers/dropout.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/dropout.dml
@@ -22,6 +22,7 @@
 /*
  * Dropout layer.
  */
+
 forward = function(matrix[double] X, double p, int seed)
     return (matrix[double] out, matrix[double] mask) {
   /*
@@ -32,14 +33,13 @@ forward = function(matrix[double] X, double p, int seed)
    * the outputs of neurons) at test time.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (any, any).
+   *  - X: Inputs, of shape (any, any).
    *  - p: Probability of keeping a neuron output.
-   *  - seed: [Optional: -1] Random number generator seed.  Setting this
-   *      allows for deterministic evaluation.  Set to -1 for a random
-   *      seed.
+   *  - seed: [Optional: -1] Random number generator seed to allow for
+   *      deterministic evaluation.  Set to -1 for a random seed.
    *
    * Outputs:
-   *  - out: Ouptuts, of same shape as X.
+   *  - out: Outputs, of same shape as `X`.
    *  - mask: Dropout mask used to compute the output.
    */
   # Normally, we might use something like
@@ -48,8 +48,7 @@ forward = function(matrix[double] X, double p, int seed)
   # the `rand` function that allows use to create a mask directly.
   if (seed == -1) {
     mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p)
-  }
-  else {
+  } else {
     mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p, seed=seed)
   }
   out = X * mask / p
@@ -64,13 +63,13 @@ backward = function(matrix[double] dout, matrix[double] X, double p, matrix[doub
    * maintain the expected values at test time.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of same shape as X.
-   *  - X: Previous input data matrix, of shape (any, any).
-   *  - p: Previous probability of keeping a neuron output.
-   *  - mask: Previous dropout mask used to compute the output.
+   *  - dout: Gradient wrt `out`, of same shape as `X`.
+   *  - X: Inputs, of shape (any, any).
+   *  - p: Probability of keeping a neuron output.
+   *  - mask: Dropout mask used to compute the output.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of same shape as X.
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
    */
   dX = mask / p * dout
 }
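
A brief sketch of why the forward and backward passes above divide by `p` (inverted dropout): scaling the surviving activations by 1/p keeps their expected value unchanged, so the layer can simply be omitted at test time. The sizes, keep probability, and seed are illustrative.

  source("nn/layers/dropout.dml") as dropout

  X = rand(rows=1000, cols=100, min=1, max=1)  # all ones, for a clear expectation
  p = 0.5                                      # keep probability
  [out, mask] = dropout::forward(X, p, 42)     # fixed seed for reproducibility

  # About half the entries are zeroed, but each survivor is scaled by 1/p,
  # so the mean stays close to 1.
  print("mean(out) = " + mean(out))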

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
index 7d6c821..24b15e2 100644
--- a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
@@ -21,28 +21,35 @@
 
 /*
  * L1 loss function.
- *
- * L_i = sum_j(abs((pred_i)_j - (y_i)_j)) for all j.
- * L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples.
  */
+
 forward = function(matrix[double] pred, matrix[double] y)
     return (double loss) {
   /*
    * Computes the forward pass for an L1 loss function.  The inputs
    * consist of N examples, each with M dimensions to predict.
    *
+   *   ```
+   *   L_i = sum_j(abs((pred_i)_j - (y_i)_j)) for all j.
+   *   L = (1/N) sum(L_i) for i=1 to N
+   *   ```
+   *
+   * In these equations, `L` is the total loss, `L_i` is the loss for
+   * example `i`, `y_i` is the vector of targets, `pred_i` is the
+   * vector of predictions, and `N` is the number of examples.
+   *
    * This can be interpreted as the negative log-likelihood assuming
    * a Laplace distribution.
    *
    * Inputs:
-   *  - pred: Prediction matrix, of shape (N, M).
-   *  - y: Target matrix, of shape (N, M).
+   *  - pred: Predictions, of shape (N, M).
+   *  - y: Targets, of shape (N, M).
    *
    * Outputs:
-   *  - loss: Scalar loss, of shape (1).
+   *  - loss: Average loss.
    */
   N = nrow(y)
-  losses = rowSums(abs(pred - y))
+  losses = rowSums(abs(pred-y))
   loss = sum(losses) / N
 }
 
@@ -53,13 +60,13 @@ backward = function(matrix[double] pred, matrix[double] y)
    * consist of N examples, each with M dimensions to predict.
    *
    * Inputs:
-   *  - pred: Prediction matrix, of shape (N, M).
-   *  - y: Target matrix, of shape (N, M).
+   *  - pred: Predictions, of shape (N, M).
+   *  - y: Targets, of shape (N, M).
    *
    * Outputs:
-   *  - dpred: Gradient wrt pred, of shape (N, M).
+   *  - dpred: Gradient wrt `pred`, of shape (N, M).
    */
   N = nrow(y)
-  dpred = sign(pred - y) / N
+  dpred = sign(pred-y) / N
 }
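
A tiny numerical instance of the L1 equations above (values invented); the gradient is just the sign of the residual, scaled by 1/N:

  source("nn/layers/l1_loss.dml") as l1_loss

  pred = matrix("1.0 2.0 3.0", rows=1, cols=3)  # N=1 example, M=3
  y = matrix("1.5 2.0 2.0", rows=1, cols=3)

  loss = l1_loss::forward(pred, y)    # |-0.5| + |0| + |1| = 1.5
  dpred = l1_loss::backward(pred, y)  # sign(pred-y)/N = [-1, 0, 1]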
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
index b2175ab..f643274 100644
--- a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
@@ -22,31 +22,34 @@
 /*
  * L1 regularizataion.
  */
-forward = function(matrix[double] X, double lambda) return (double reg_loss) {
+
+forward = function(matrix[double] X, double lambda)
+    return (double reg_loss) {
   /*
    * Computes the forward pass for an L1 regularization function.
    *
    * Inputs:
-   *  - X: Parameters, of shape (any, any).
+   *  - X: Inputs, of shape (any, any).
    *  - lambda: Regularization strength.
    *      A typical value is 0.01.
    *
    * Outputs:
-   *  - reg_loss: Scalar L1 regularization loss, of shape (1).
+   *  - reg_loss: Total regularization loss.
    */
   reg_loss = lambda * sum(abs(X))
 }
 
-backward = function(matrix[double] X, double lambda) return (matrix[double] dX) {
+backward = function(matrix[double] X, double lambda)
+    return (matrix[double] dX) {
   /*
    * Computes the backward pass for an L1 regularization function.
    *
    * Inputs:
-   *  - X: Parameters, of shape (any, any).
+   *  - X: Inputs, of shape (any, any).
    *  - lambda: Regularization strength.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of same shape as X.
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
    */
   dX = lambda * sign(X)
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
index 9f27cc2..df8bc1c 100644
--- a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
@@ -21,28 +21,35 @@
 
 /*
  * L2 loss function.
- *
- * L_i = (1/2) 2norm(pred_i - y_i)^2
- * L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples.
  */
+
 forward = function(matrix[double] pred, matrix[double] y)
     return (double loss) {
   /*
    * Computes the forward pass for an L2 loss function.  The inputs
    * consist of N examples, each with M dimensions to predict.
    *
+   *   ```
+   *   L_i = (1/2) norm(pred_i - y_i)^2
+   *   L = (1/N) sum(L_i) for i=1 to N
+   *   ```
+   *
+   * In these equations, `L` is the total loss, `L_i` is the loss for
+   * example `i`, `y_i` is the vector of targets, `pred_i` is the
+   * vector of predictions, and `N` is the number of examples.
+   *
    * This can be interpreted as the negative log-likelihood assuming
    * a Gaussian distribution.
    *
    * Inputs:
-   *  - pred: Prediction matrix, of shape (N, M).
-   *  - y: Target matrix, of shape (N, M).
+   *  - pred: Predictions, of shape (N, M).
+   *  - y: Targets, of shape (N, M).
    *
    * Outputs:
-   *  - loss: Scalar loss, of shape (1).
+   *  - loss: Average loss.
    */
   N = nrow(y)
-  losses = 0.5 * rowSums((pred - y)^2)
+  losses = 0.5 * rowSums((pred-y)^2)
   loss = sum(losses) / N
 }
 
@@ -53,13 +60,13 @@ backward = function(matrix[double] pred, matrix[double] y)
    * consist of N examples, each with M dimensions to predict.
    *
    * Inputs:
-   *  - pred: Prediction matrix, of shape (N, M).
-   *  - y: Target matrix, of shape (N, M).
+   *  - pred: Predictions, of shape (N, M).
+   *  - y: Targets, of shape (N, M).
    *
    * Outputs:
-   *  - dpred: Gradient wrt pred, of shape (N, M).
+   *  - dpred: Gradient wrt `pred`, of shape (N, M).
    */
   N = nrow(y)
-  dpred = (pred - y) / N
+  dpred = (pred-y) / N
 }
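
The analogous instance for the L2 equations above; the 1/2 factor in the loss is what makes the gradient come out to simply (pred-y)/N:

  source("nn/layers/l2_loss.dml") as l2_loss

  pred = matrix("1.0 2.0 3.0", rows=1, cols=3)  # N=1 example, M=3
  y = matrix("1.5 2.0 2.0", rows=1, cols=3)

  loss = l2_loss::forward(pred, y)    # 0.5 * (0.25 + 0 + 1) = 0.625
  dpred = l2_loss::backward(pred, y)  # (pred-y)/N = [-0.5, 0, 1]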
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
index 44f2a54..5074c06 100644
--- a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
@@ -22,31 +22,34 @@
 /*
  * L2 regularizataion.
  */
-forward = function(matrix[double] X, double lambda) return (double reg_loss) {
+
+forward = function(matrix[double] X, double lambda)
+    return (double reg_loss) {
   /*
    * Computes the forward pass for an L2 regularization function.
    *
    * Inputs:
-   *  - X: Parameters, of shape (any, any).
+   *  - X: Inputs, of shape (any, any).
    *  - lambda: Regularization strength.
    *      A typical value is 0.01.
    *
    * Outputs:
-   *  - reg_loss: Scalar l2 regularization loss, of shape (1).
+   *  - reg_loss: Total regularization loss.
    */
   reg_loss = 0.5 * lambda * sum(X^2)
 }
 
-backward = function(matrix[double] X, double lambda) return (matrix[double] dX) {
+backward = function(matrix[double] X, double lambda)
+    return (matrix[double] dX) {
   /*
    * Computes the backward pass for an L2 regularization function.
    *
    * Inputs:
-   *  - X: Parameters, of shape (any, any).
+   *  - X: Inputs, of shape (any, any).
    *  - lambda: Regularization strength.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of same shape as X.
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
    */
   dX = lambda * X
 }
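
To connect the gradient above to a parameter update, a hedged sketch of how the regularization gradient would typically be folded into a plain SGD step ("weight decay"); the learning rate, lambda, and shapes are arbitrary, and `dW_data` stands in for a data-loss gradient:

  source("nn/layers/l2_reg.dml") as l2_reg

  lambda = 0.01
  lr = 0.1
  W = rand(rows=4, cols=3, pdf="normal")
  dW_data = rand(rows=4, cols=3, pdf="normal")  # stand-in for a data-loss gradient

  reg_loss = l2_reg::forward(W, lambda)  # 0.5 * lambda * sum(W^2)
  dW_reg = l2_reg::backward(W, lambda)   # lambda * W

  W = W - lr * (dW_data + dW_reg)  # combined update shrinks W toward zero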

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml b/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
index ad5e561..7dd85d3 100644
--- a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
@@ -21,30 +21,37 @@
 
 /*
  * Log loss function.
- *
- * L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i), where y_i is a
- *  binary target, and pred_i is a probability of y=1.
- * L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples.
  */
+
 forward = function(matrix[double] pred, matrix[double] y)
     return (double loss) {
   /*
    * Computes the forward pass for a log loss function.
    *
+   *   ```
+   *   L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i)
+   *   L = (1/N) sum(L_i) for i=1 to N
+   *   ```
+   *
+   * In these equations, `L` is the total loss, `L_i` is the loss for
+   * example `i`, `y_i` is the binary target, `pred_i` is the probability
+   * of the true class (i.e. `y=1`), and `N` is the number of examples.
+   *
    * This can be interpreted as the negative log-likelihood assuming
    * a Bernoulli distribution.
    *
    * Inputs:
-   *  - pred: Prediction matrix, of shape (N, 1).  Predictions should
-   *      be probabilities that y=1.
-   *  - y: Target matrix, of shape (N, 1).  Targets should be binary
-   *      in the set {0,1}.
+   *  - pred: Predictions, of shape (N, 1).
+   *      Predictions should be probabilities of the true
+   *      class (i.e. probability of `y=1`).
+   *  - y: Targets, of shape (N, 1).
+   *      Targets should be binary in the set {0, 1}.
    *
    * Outputs:
-   *  - loss: Scalar loss, of shape (1).
+   *  - loss: Average loss.
    */
   N = nrow(y)
-  losses = -y * log(pred) - (1-y) * log(1-pred)
+  losses = -y*log(pred) - (1-y)*log(1-pred)
   loss = sum(losses) / N
 }
 
@@ -54,15 +61,16 @@ backward = function(matrix[double] pred, matrix[double] y)
    * Computes the backward pass for a log loss function.
    *
    * Inputs:
-   *  - pred: Prediction matrix, of shape (N, 1).  Predictions should
-   *      be probabilities that y=1.
-   *  - y: Target matrix, of shape (N, 1).  Targets should be binary
-   *      in the set {0,1}.
+   *  - pred: Predictions, of shape (N, 1).
+   *      Predictions should be probabilities of the true
+   *      class (i.e. probability of `y=1`).
+   *  - y: Targets, of shape (N, 1).
+   *      Targets should be binary in the set {0, 1}.
    *
    * Outputs:
-   *  - dpred: Gradient wrt pred, of shape (N, 1).
+   *  - dpred: Gradient wrt `pred`, of shape (N, 1).
    */
   N = nrow(y)
-  dpred = (1/N) * (pred-y) / (pred * (1-pred))
+  dpred = (1/N) * (pred-y) / (pred*(1-pred))
 }
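
A small numerical instance of the log-loss equations above, with invented binary targets and predicted probabilities of the positive class:

  source("nn/layers/log_loss.dml") as log_loss

  pred = matrix("0.8 0.3", rows=2, cols=1)  # P(y=1) for N=2 examples
  y = matrix("1 0", rows=2, cols=1)         # binary targets in {0, 1}

  # L = (1/2) * (-log(0.8) - log(1-0.3)) ~= 0.29
  loss = log_loss::forward(pred, y)
  dpred = log_loss::backward(pred, y)       # (1/N) * (pred-y) / (pred*(1-pred))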
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/lstm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/lstm.dml b/scripts/staging/SystemML-NN/nn/layers/lstm.dml
index 0dd9f4c..44f2ef2 100644
--- a/scripts/staging/SystemML-NN/nn/layers/lstm.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/lstm.dml
@@ -44,16 +44,16 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T,
    *    - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, T*D).
-   *  - W: Weights (parameters) matrix, of shape (D+M, 4M).
-   *  - b: Biases vector, of shape (1, 4M).
+   *  - X: Inputs, of shape (N, T*D).
+   *  - W: Weights, of shape (D+M, 4M).
+   *  - b: Biases, of shape (1, 4M).
    *  - T: Length of example sequences (number of timesteps).
-   *  - D: Dimensionality of the input features.
+   *  - D: Dimensionality of the input features (number of features).
    *  - return_sequences: Whether to return `out` at all timesteps,
    *      or just for the final timestep.
-   *  - out0: Output matrix at previous timestep, of shape (N, M).
+   *  - out0: Outputs from previous timestep, of shape (N, M).
    *      Note: This is *optional* and could just be an empty matrix.
-   *  - c0: Initial cell state matrix, of shape (N, M).
+   *  - c0: Initial cell state, of shape (N, M).
    *      Note: This is *optional* and could just be an empty matrix.
    *
    * Outputs:
@@ -123,23 +123,27 @@ backward = function(matrix[double] dout, matrix[double] dc,
    * Computes the backward pass for an LSTM layer with M neurons.
    *
    * Inputs:
-   *  - dout: Gradient on output from upstream.  If `given_sequences`
-   *      is True, contains gradients on outputs for all timesteps,
-   *      of shape (N, T*M).  Else, contains gradient on output for
-   *      the final timestep, of shape (N, M).
-   *  - dc: Gradient on final (current) cell state from later in time,
-   *      of shape (N, M).
-   *  - X: Input data matrix, of shape (N, T*D).
-   *  - W: Weights (parameters) matrix, of shape (D+M, 4M).
-   *  - b: Biases vector, of shape (1, 4M).
+   *  - dout: Gradient wrt `out`.  If `given_sequences` is `True`,
+   *      contains gradients on outputs for all timesteps, of
+   *      shape (N, T*M). Else, contains the gradient on the output
+   *      for the final timestep, of shape (N, M).
+   *  - dc: Gradient wrt `c` (from later in time), of shape (N, M).
+   *      This would come from later in time if the cell state was used
+   *      downstream as the initial cell state for another LSTM layer.
+   *      Typically, this would be used when a sequence was cut at
+   *      timestep `T` and then continued in the next batch.  If `c`
+   *      was not used downstream, then `dc` would be an empty matrix.
+   *  - X: Inputs, of shape (N, T*D).
+   *  - W: Weights, of shape (D+M, 4M).
+   *  - b: Biases, of shape (1, 4M).
    *  - T: Length of example sequences (number of timesteps).
    *  - D: Dimensionality of the input features.
    *  - given_sequences: Whether `dout` is for all timesteps,
    *      or just for the final timestep.  This is based on whether
    *      `return_sequences` was true in the forward pass.
-   *  - out0: Output matrix at previous timestep, of shape (N, M).
+   *  - out0: Outputs from previous timestep, of shape (N, M).
    *      Note: This is *optional* and could just be an empty matrix.
-   *  - c0: Initial cell state matrix, of shape (N, M).
+   *  - c0: Initial cell state, of shape (N, M).
    *      Note: This is *optional* and could just be an empty matrix.
    *  - cache_out: Cache of outputs, of shape (T, N*M).
    *      Note: This is used for performance during training.
@@ -149,11 +153,11 @@ backward = function(matrix[double] dout, matrix[double] dc,
    *      Note: This is used for performance during training.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, T*D).
-   *  - dW: Gradient wrt W, of shape (D+M, 4M).
-   *  - db: Gradient wrt b, of shape (1, 4M).
-   *  - dout0: Gradient wrt out0, of shape (N, M).
-   *  - dc0: Gradient wrt c0, of shape (N, M).
+   *  - dX: Gradient wrt `X`, of shape (N, T*D).
+   *  - dW: Gradient wrt `W`, of shape (D+M, 4M).
+   *  - db: Gradient wrt `b`, of shape (1, 4M).
+   *  - dout0: Gradient wrt `out0`, of shape (N, M).
+   *  - dc0: Gradient wrt `c0`, of shape (N, M).
    */
   N = nrow(X)
   M = as.integer(ncol(W)/4)
@@ -190,7 +194,7 @@ backward = function(matrix[double] dout, matrix[double] dc,
     g = ifog[,3*M+1:4*M]  # g gate, shape (N, M)
 
     tmp = tanh::backward(dout_t, ct)
-    dct = dct + o * tmp  # shape (N, M)
+    dct = dct + o*tmp  # shape (N, M)
     tmp = tanh::forward(ct)
     do = tmp * dout_t  # output gate, shape (N, M)
     df = c_prev * dct  # forget gate, shape (N, M)
@@ -201,7 +205,7 @@ backward = function(matrix[double] dout, matrix[double] dc,
     di_raw = i * (1-i) * di
     df_raw = f * (1-f) * df
     do_raw = o * (1-o) * do
-    dg_raw = (1 - g^2) * dg
+    dg_raw = (1-g^2) * dg
     difog_raw = cbind(di_raw, cbind(df_raw, cbind(do_raw, dg_raw)))  # shape (N, 4M)
 
     dW = dW + t(input) %*% difog_raw  # shape (D+M, 4M)
@@ -217,7 +221,7 @@ backward = function(matrix[double] dout, matrix[double] dc,
       dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev  # shape (N, M)
       dct = dc_prev  # shape (N, M)
     }
-    t = t-1
+    t = t - 1
   }
 }
 
@@ -232,17 +236,18 @@ init = function(int N, int D, int M)
    * We use the Glorot uniform heuristic which limits the magnification
    * of inputs/gradients during forward/backward passes by scaling
    * uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
+   *  - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
    *
    * Inputs:
    *  - N: Number of examples in batch.
-   *  - D: Dimensionality of the input features.
+   *  - D: Dimensionality of the input features (number of features).
    *  - M: Number of neurons in this layer.
    *
    * Outputs:
-   *  - W: Weights (parameters) matrix, of shape (D+M, 4M).
-   *  - b: Biases vector, of shape (1, 4M).
-   *  - out0: Dummy output matrix at previous timestep, of shape (N, M).
-   *  - c0: Initial empty cell state matrix, of shape (N, M).
+   *  - W: Weights, of shape (D+M, 4M).
+   *  - b: Biases, of shape (1, 4M).
+   *  - out0: Empty previous timestep output matrix, of shape (N, M).
+   *  - c0: Empty initial cell state matrix, of shape (N, M).
    */
   fan_in = D+M
   fan_out = 4*M
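
A sketch of the Glorot uniform scaling rule described above, with assumed toy sizes; the remainder of the `init` body is not shown in this hunk, so the `rand` call is an illustration of the bound sqrt(6/(fan_in + fan_out)), not the committed code:

  D = 16                                     # assumed input feature size
  M = 32                                     # assumed number of neurons
  fan_in = D + M
  fan_out = 4*M
  scale = sqrt(6.0/(fan_in+fan_out))
  W = rand(rows=fan_in, cols=fan_out, min=-scale, max=scale, pdf="uniform")
  b = matrix(0, rows=1, cols=fan_out)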

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
index 22e1747..a12877f 100644
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
@@ -38,7 +38,7 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
    * the output maps.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
@@ -57,8 +57,8 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
    *  - Wout: Output width.
    */
   N = nrow(X)
-  Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1)
-  Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
   pad_value = -1/0  # in max pooling we pad with -infinity
 
   # Create output volume
@@ -96,7 +96,8 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
    * unrolled into a single vector.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of shape (N, C*Hout*Wout).
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, C*Hout*Wout).
    *  - Hout: Output height.
    *  - Wout: Output width.
    *  - X: Input data matrix, of shape (N, C*Hin*Win).
@@ -113,7 +114,7 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
    *      A typical value is 0.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, C*Hin*Win).
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
    */
   N = nrow(X)
   pad_value = -1/0  # in max pooling we pad with -infinity
@@ -134,9 +135,9 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
       img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
       dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
       for (hout in 1:Hout, check=0) {  # all output rows
-        hin = (hout-1) * strideh + 1
+        hin = (hout-1)*strideh + 1
         for (wout in 1:Wout) {  # all output columns
-          win = (wout-1) * stridew + 1
+          win = (wout-1)*stridew + 1
           img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
           max_val_ind = img_slice_patch == max(img_slice_patch)  # max value indicator matrix
           # gradient passes through only for the max value(s) in this patch
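
A worked instance of the output-size formula used above, with assumed toy dimensions; 2x2 pooling with stride 2 and no padding on a 28x28 map halves each spatial dimension:

  Hin = 28
  Win = 28
  Hf = 2
  Wf = 2
  strideh = 2
  stridew = 2
  padh = 0
  padw = 0
  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)  # 14
  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)  # 14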

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
index ae2b4a1..f1cb863 100644
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
@@ -22,6 +22,7 @@
 /*
  * Max pooling layer.
  */
+
 forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
                    int strideh, int stridew, int padh, int padw)
     return (matrix[double] out, int Hout, int Wout) {
@@ -36,7 +37,7 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
    * the output maps.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
@@ -55,8 +56,8 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
    *  - Wout: Output width.
    */
   N = nrow(X)
-  Hout = as.integer((Hin - Hf) / strideh + 1)
-  Wout = as.integer((Win - Wf) / stridew + 1)
+  Hout = as.integer((Hin-Hf)/strideh + 1)
+  Wout = as.integer((Win-Wf)/stridew + 1)
 
   # Max pooling - built-in implementation
   out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
@@ -73,10 +74,11 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
    * unrolled into a single vector.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of shape (N, C*Hout*Wout).
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, C*Hout*Wout).
    *  - Hout: Output height.
    *  - Wout: Output width.
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
@@ -90,7 +92,7 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
    *      A typical value is 0.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, C*Hin*Win).
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
    */
   N = nrow(X)
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/relu.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/relu.dml b/scripts/staging/SystemML-NN/nn/layers/relu.dml
index a5c5230..6a4c15c 100644
--- a/scripts/staging/SystemML-NN/nn/layers/relu.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/relu.dml
@@ -22,33 +22,37 @@
 /*
  * Rectified Linear Unit (ReLU) nonlinearity layer.
  */
-forward = function(matrix[double] X) return (matrix[double] out) {
+
+forward = function(matrix[double] X)
+    return (matrix[double] out) {
   /*
    * Computes the forward pass for a ReLU nonlinearity layer.
    *
-   * Performs an element-wise evaluation of f(input) = max(0, input).
+   * Performs an element-wise evaluation of `f(input) = max(0, input)`.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (any, any).
+   *  - X: Inputs, of shape (any, any).
    *
    * Outputs:
-   *  - out: Ouptuts, of same shape as X.
+   *  - out: Outputs, of same shape as `X`.
    */
-  out = max(0.0, X)
+  out = max(X, 0)
 }
 
-backward = function(matrix[double] dout, matrix[double] X) return (matrix[double] dX) {
+backward = function(matrix[double] dout, matrix[double] X)
+    return (matrix[double] dX) {
   /*
    * Computes the backward pass for a ReLU nonlinearity layer.
    *
-   * Essentially performs a pass-through of the upstream gradient for cells > 0.
+   * Essentially performs a pass-through of the upstream gradient
+   * for cells > 0.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of same shape as X.
+   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
    *  - X: Previous input data matrix, of shape (any, any).
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of same shape as X.
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
    */
    dX = (X > 0) * dout
 }
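
A small sketch of the pass-through behaviour described above, with assumed toy inputs and the package's module path:

  source("nn/layers/relu.dml") as relu
  X = matrix("-1 2 -3 4", rows=2, cols=2)
  out = relu::forward(X)              # negative entries clamped to 0
  dout = matrix(1, rows=2, cols=2)    # upstream gradient of ones
  dX = relu::backward(dout, X)        # 1 where X > 0, else 0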

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/rnn.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/rnn.dml b/scripts/staging/SystemML-NN/nn/layers/rnn.dml
index cd3eefe..cdceab8 100644
--- a/scripts/staging/SystemML-NN/nn/layers/rnn.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/rnn.dml
@@ -35,14 +35,14 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T,
    * in as an additional input at the current timestep.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, T*D).
-   *  - W: Weights (parameters) matrix, of shape (D+M, M).
-   *  - b: Biases vector, of shape (1, M).
+   *  - X: Inputs, of shape (N, T*D).
+   *  - W: Weights, of shape (D+M, M).
+   *  - b: Biases, of shape (1, M).
    *  - T: Length of example sequences (number of timesteps).
-   *  - D: Dimensionality of the input features.
+   *  - D: Dimensionality of the input features (number of features).
    *  - return_sequences: Whether to return `out` at all timesteps,
    *      or just for the final timestep.
-   *  - out0: Output matrix at previous timestep, of shape (N, M).
+   *  - out0: Output matrix from previous timestep, of shape (N, M).
    *      Note: This is *optional* and could just be an empty matrix.
    *
    * Outputs:
@@ -88,28 +88,28 @@ backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, mat
    * Computes the backward pass for a simple RNN layer with M neurons.
    *
    * Inputs:
-   *  - dout: Gradient on output from upstream.  If `given_sequences`
+   *  - dout: Gradient wrt `out` from upstream.  If `given_sequences`
    *      is True, contains gradients on outputs for all timesteps,
    *      of shape (N, T*M).  Else, contains gradient on output for
    *      the final timestep, of shape (N, M).
-   *  - X: Input data matrix, of shape (N, T*D).
-   *  - W: Weights (parameters) matrix, of shape (D+M, M).
-   *  - b: Biases vector, of shape (1, M).
+   *  - X: Inputs, of shape (N, T*D).
+   *  - W: Weights, of shape (D+M, M).
+   *  - b: Biases, of shape (1, M).
    *  - T: Length of example sequences (number of timesteps).
-   *  - D: Dimensionality of the input features.
+   *  - D: Dimensionality of the input features (number of features).
    *  - given_sequences: Whether `dout` is for all timesteps,
    *      or just for the final timestep.  This is based on whether
    *      `return_sequences` was true in the forward pass.
-   *  - out0: Output matrix at previous timestep, of shape (N, M).
+   *  - out0: Output matrix from previous timestep, of shape (N, M).
    *      Note: This is *optional* and could just be an empty matrix.
    *  - cache_out: Cache of outputs, of shape (T, N*M).
    *      Note: This is used for performance during training.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, T*D).
-   *  - dW: Gradient wrt W, of shape (D+M, 4M).
-   *  - db: Gradient wrt b, of shape (1, 4M).
-   *  - dout0: Gradient wrt out0, of shape (N, M).
+   *  - dX: Gradient wrt `X`, of shape (N, T*D).
+   *  - dW: Gradient wrt `W`, of shape (D+M, M).
+   *  - db: Gradient wrt `b`, of shape (1, M).
+   *  - dout0: Gradient wrt `out0`, of shape (N, M).
    */
   N = nrow(X)
   M = ncol(W)
@@ -134,7 +134,7 @@ backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, mat
       out_prev = matrix(cache_out[t-1,], rows=N, cols=M)  # shape (N, M)
     }
     input = cbind(X_t, out_prev)  # shape (N, D+M)
-    dout_t_raw = (1 - out_t^2) * dout_t  # into tanh, shape (N, M)
+    dout_t_raw = (1-out_t^2) * dout_t  # into tanh, shape (N, M)
     dW = dW + t(input) %*% dout_t_raw  # shape (D+M, M)
     db = db + colSums(dout_t_raw)  # shape (1, M)
     dinput = dout_t_raw %*% t(W)  # shape (N, D+M)
@@ -146,7 +146,7 @@ backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, mat
     else {
       dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev  # shape (N, M)
     }
-    t = t-1
+    t = t - 1
   }
 }
 
@@ -161,16 +161,17 @@ init = function(int N, int D, int M)
    * We use the Glorot uniform heuristic which limits the magnification
    * of inputs/gradients during forward/backward passes by scaling
    * uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
+   *  - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
    *
    * Inputs:
    *  - N: Number of examples in batch.
-   *  - D: Dimensionality of the input features.
+   *  - D: Dimensionality of the input features (number of features).
    *  - M: Number of neurons in this layer.
    *
    * Outputs:
-   *  - W: Weights (parameters) matrix, of shape (D+M, M).
-   *  - b: Biases vector, of shape (1, M).
-   *  - out0: Dummy output matrix at previous timestep, of shape (N, M).
+   *  - W: Weights, of shape (D+M, M).
+   *  - b: Biases, of shape (1, M).
+   *  - out0: Empty previous timestep output matrix, of shape (N, M).
    */
   fan_in = D+M
   fan_out = M

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml b/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
index a7066f2..185befb 100644
--- a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
@@ -22,33 +22,41 @@
 /*
  * Sigmoid nonlinearity layer.
  */
-forward = function(matrix[double] X) return (matrix[double] out) {
+
+forward = function(matrix[double] X)
+    return (matrix[double] out) {
   /*
    * Computes the forward pass for a sigmoid nonlinearity layer.
    *
-   * sigmoid(x) = 1 / (1 + e^-x)
+   *   `sigmoid(x) = 1 / (1 + e^-x)`
+   *
+   * If `X` contains a single feature column, the output of a sigmoid
+   * layer can be interpreted as the predicted probability of the
+   * positive class (i.e. `y=1`) when paired with a log loss function
+   * in a binary classification problem.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (any, any).
+   *  - X: Inputs, of shape (any, any).
    *
    * Outputs:
-   *  - out: Ouptuts, of same shape as X.
+   *  - out: Outputs, of same shape as `X`.
    */
-  out = 1 / (1 + exp(-X))
+  out = 1 / (1+exp(-X))
 }
 
-backward = function(matrix[double] dout, matrix[double] X) return (matrix[double] dX) {
+backward = function(matrix[double] dout, matrix[double] X)
+    return (matrix[double] dX) {
   /*
    * Computes the backward pass for a sigmoid nonlinearity layer.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of same shape as X.
-   *  - X: Previous input data matrix, of shape (any, any).
+   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+   *  - X: Inputs, of shape (any, any).
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of same shape as X.
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
    */
-  out = 1 / (1 + exp(-X))
-  dX = out * (1 - out) * dout
+  out = 1 / (1+exp(-X))
+  dX = out * (1-out) * dout
 }
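
A sketch of the sigmoid + log loss pairing mentioned in the forward docstring, with assumed toy scores and targets:

  source("nn/layers/sigmoid.dml") as sigmoid
  source("nn/layers/log_loss.dml") as log_loss
  scores = matrix("2.0 -1.0 0.5", rows=3, cols=1)  # unnormalized scores
  y = matrix("1 0 1", rows=3, cols=1)              # binary targets
  pred = sigmoid::forward(scores)                  # probabilities that y=1
  loss = log_loss::forward(pred, y)
  dpred = log_loss::backward(pred, y)
  dscores = sigmoid::backward(dpred, scores)       # gradient wrt scores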
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/softmax.dml b/scripts/staging/SystemML-NN/nn/layers/softmax.dml
index 854e8a8..1751838 100644
--- a/scripts/staging/SystemML-NN/nn/layers/softmax.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/softmax.dml
@@ -22,7 +22,9 @@
 /*
  * Softmax classifier layer.
  */
-forward = function(matrix[double] scores) return (matrix[double] probs) {
+
+forward = function(matrix[double] scores)
+    return (matrix[double] probs) {
   /*
    * Computes the forward pass for a softmax classifier.  The inputs
    * are interpreted as unnormalized, log-probabilities for each of
@@ -32,10 +34,10 @@ forward = function(matrix[double] scores) return (matrix[double] probs) {
    * This can be interpreted as a generalization of the sigmoid
    * function to multiple classes.
    *
-   * probs_ij = e^scores_ij / sum(e^scores_i)
+   *   `probs_ij = e^scores_ij / sum(e^scores_i)`
    *
    * Inputs:
-   *  - scores: Input data matrix, of shape (N, D).
+   *  - scores: Inputs, of shape (N, D).
    *
    * Outputs:
    *  - probs: Outputs, of shape (N, D).
@@ -56,20 +58,23 @@ backward = function(matrix[double] dprobs, matrix[double] scores)
   /*
    * Computes the backward pass for a softmax classifier.
    *
-   * Note that dscores_ij has multiple sources:
+   * Note that dscores_ij has multiple source branches:
    *
-   * dprobs_ij/dscores_ij = probs_ij * (1 - probs_ij)
-   * dprobs_ik/dscores_ij = -probs_ik * probs_ij, for all k != j
+   *   ```
+   *   dprobs_ij/dscores_ij = probs_ij * (1 - probs_ij)
+   *   dprobs_ik/dscores_ij = -probs_ik * probs_ij, for all k != j
    *
-   * dloss/dscores_ij = dloss/dprobs_ij * dprobs_ij/dscores_ij +
-   *                    sum_{k!=j}(dloss/dprobs_ik * dprobs_ik/dscores_ij)
+   *   dloss/dscores_ij =
+   *      (dloss/dprobs_ij * dprobs_ij/dscores_ij)
+   *      + sum_{k!=j}(dloss/dprobs_ik * dprobs_ik/dscores_ij)
+   *   ```
    *
    * Inputs:
-   *  - dprobs: Derivatives from upstream, of shape (N, D).
-   *  - scores: Previous input data matrix, of shape (N, D).
+   *  - dprobs: Gradient wrt `probs` from upstream, of shape (N, D).
+   *  - scores: Inputs, of shape (N, D).
    *
    * Outputs:
-   *  - dscores: Gradient wrt scores, of shape (N, D).
+   *  - dscores: Gradient wrt `scores`, of shape (N, D).
    */
   scores = scores - rowMaxs(scores)  # numerical stability
   unnorm_probs = exp(scores)  # unnormalized probabilities
@@ -77,6 +82,6 @@ backward = function(matrix[double] dprobs, matrix[double] scores)
   # After some cancellation:
   # dscores = dprobs*probs - probs*rowSums(dprobs*probs)
   dtemp = dprobs * probs
-  dscores = dtemp - probs * rowSums(dtemp)
+  dscores = dtemp - probs*rowSums(dtemp)
 }
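
A sketch of the forward pass on assumed toy scores; each row of `probs` should sum to 1:

  source("nn/layers/softmax.dml") as softmax
  scores = matrix("1 2 3 3 2 1", rows=2, cols=3)
  probs = softmax::forward(scores)
  print(toString(rowSums(probs)))   # each row sums to ~1.0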
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml b/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
index 53ca989..0185a2c 100644
--- a/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
@@ -39,7 +39,7 @@ forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
    * introduces learnable parameters (gamma, beta) to control the
    * amount of normalization.
    *
-   *    y = ((x-mean) / sqrt(var+eps)) * gamma + beta
+   *   `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
    *
    * This implementation maintains exponential moving averages of the
    * mean and variance during training for use during testing.
@@ -50,7 +50,7 @@ forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
    *    - https://arxiv.org/abs/1502.03167
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
    *  - gamma: Scale parameters, of shape (C, 1).
    *  - beta: Shift parameters, of shape (C, 1).
    *  - C: Number of input channels (dimensionality of input depth).
@@ -134,7 +134,7 @@ backward = function(matrix[double] dout, matrix[double] out,
    * Computes the backward pass for a spatial batch normalization layer.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of shape (N, C*Hin*Win).
+   *  - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
    *  - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
    *  - ema_mean_upd: Updated exponential moving average of the mean
    *      from the forward pass, of shape (C, 1).
@@ -171,9 +171,9 @@ backward = function(matrix[double] dout, matrix[double] out,
    *      Typical values are in the range of [1e-5, 1e-3].
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, C*Hin*Win).
-   *  - dgamma: Gradient wrt W, of shape (C, 1).
-   *  - dbeta: Gradient wrt b, of shape (C, 1).
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dgamma: Gradient wrt `gamma`, of shape (C, 1).
+   *  - dbeta: Gradient wrt `beta`, of shape (C, 1).
    *
    */
   N = nrow(X)
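
A sketch of the normalization formula quoted above, applied to one channel with assumed toy values; the biased variance is computed explicitly for illustration:

  x = matrix("1 2 3 4", rows=4, cols=1)   # activations of a single channel
  gamma_c = 1.0
  beta_c = 0.0
  eps = 1e-5
  mu = mean(x)
  sigma2 = sum((x-mu)^2) / nrow(x)        # biased variance, as in batch norm
  x_hat = (x - mu) / sqrt(sigma2 + eps)
  y = gamma_c*x_hat + beta_c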

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/tanh.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/tanh.dml b/scripts/staging/SystemML-NN/nn/layers/tanh.dml
index 9308a7c..589a574 100644
--- a/scripts/staging/SystemML-NN/nn/layers/tanh.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/tanh.dml
@@ -24,38 +24,42 @@
  */
 source("nn/layers/sigmoid.dml") as sigmoid
 
-forward = function(matrix[double] X) return (matrix[double] out) {
+forward = function(matrix[double] X)
+    return (matrix[double] out) {
   /*
    * Computes the forward pass for a tanh nonlinearity layer.
    *
-   * tanh(x) = (e^x - e^-x) / (e^x + e^-x)
-   *         = 2 * sigmoid(2x) - 1
+   *   ```
+   *   tanh(x) = (e^x - e^-x) / (e^x + e^-x)
+   *           = 2 * sigmoid(2x) - 1
+   *   ```
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (any, any).
+   *  - X: Inputs, of shape (any, any).
    *
    * Outputs:
-   *  - out: Ouptuts, of same shape as X.
+   *  - out: Outputs, of same shape as `X`.
    */
   # out = (exp(X) - exp(-X)) / (exp(X) + exp(-X))
   # Simplification of the above formulation to use the sigmoid function:
   sigma2X = sigmoid::forward(2*X)
-  out = 2 * sigma2X - 1
+  out = 2*sigma2X - 1
 }
 
-backward = function(matrix[double] dout, matrix[double] X) return (matrix[double] dX) {
+backward = function(matrix[double] dout, matrix[double] X)
+    return (matrix[double] dX) {
   /*
    * Computes the backward pass for a tanh nonlinearity layer.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of same shape as X.
-   *  - X: Previous input data matrix, of shape (any, any).
+   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+   *  - X: Inputs, of shape (any, any).
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of same shape as X.
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
    */
   sigma2X = sigmoid::forward(2*X)
-  out = 2 * sigma2X - 1
-  dX = (1 - out^2) * dout
+  out = 2*sigma2X - 1
+  dX = (1-out^2) * dout
 }
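
A sketch checking the identity used by this layer, tanh(x) = 2*sigmoid(2x) - 1, on assumed toy values:

  source("nn/layers/sigmoid.dml") as sigmoid
  X = matrix("-1 0 1", rows=1, cols=3)
  sigma2X = sigmoid::forward(2*X)
  out_identity = 2*sigma2X - 1
  out_direct = (exp(X) - exp(-X)) / (exp(X) + exp(-X))
  print(toString(out_identity - out_direct))  # ~0 everywhere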
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml b/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
index 688109b..20b26c4 100644
--- a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
@@ -22,6 +22,7 @@
 /*
  * Adagrad optimizer.
  */
+
 update = function(matrix[double] X, matrix[double] dX, double lr, double epsilon,
                   matrix[double] cache)
     return (matrix[double] X, matrix[double] cache) {
@@ -39,24 +40,25 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double epsilon
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient of X wrt to a loss function being optimized, of
-   *      same shape as X.
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
    *  - lr: Learning rate.
    *  - epsilon: Smoothing term to avoid divide by zero errors.
    *      Typical values are in the range of [1e-8, 1e-4].
    *  - cache: State that maintains per-parameter sum of squared
-   *      gradients, of same shape as X.
+   *      gradients, of same shape as `X`.
    *
    * Outputs:
-   *  - X: Updated parameters X, of same shape as input X.
-   *  - v: Updated velocity of the parameters X, of same shape as
-   *      input v.
+   *  - X: Updated parameters `X`, of same shape as input `X`.
+   *  - cache: State that maintains per-parameter sum of squared
+   *      gradients, of same shape as `X`.
    */
   cache = cache + dX^2
-  X = X - lr * dX / (sqrt(cache) + epsilon)
+  X = X - (lr * dX / (sqrt(cache)+epsilon))
 }
 
-init = function(matrix[double] X) return (matrix[double] cache) {
+init = function(matrix[double] X)
+    return (matrix[double] cache) {
   /*
    * Initialize the state for this optimizer.
    *
@@ -65,10 +67,10 @@ init = function(matrix[double] X) return (matrix[double] cache) {
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   * 
+   *
    * Outputs:
    *  - cache: State that maintains per-parameter sum of squared
-   *      gradients, of same shape as X.
+   *      gradients, of same shape as `X`.
    */
   cache = matrix(0, rows=nrow(X), cols=ncol(X))
 }
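
A sketch of a single Adagrad step with assumed toy parameters and gradients:

  source("nn/optim/adagrad.dml") as adagrad
  X = matrix("1.0 -2.0", rows=1, cols=2)
  dX = matrix("0.5 0.5", rows=1, cols=2)
  cache = adagrad::init(X)
  [X, cache] = adagrad::update(X, dX, 0.01, 1e-8, cache)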

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/adam.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/adam.dml b/scripts/staging/SystemML-NN/nn/optim/adam.dml
index a25f74d..0607fa5 100644
--- a/scripts/staging/SystemML-NN/nn/optim/adam.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/adam.dml
@@ -22,6 +22,7 @@
 /*
  * Adam optimizer.
  */
+
 update = function(matrix[double] X, matrix[double] dX, double lr, double beta1, double beta2,
                   double epsilon, int t, matrix[double] m, matrix[double] v)
     return (matrix[double] X, matrix[double] m, matrix[double] v) {
@@ -34,8 +35,8 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double beta1,
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient of X wrt to a loss function being optimized, of
-   *      same shape as X.
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
    *  - lr: Learning rate.  Recommended value is 0.001.
    *  - beta1: Exponential decay rate for the 1st moment estimates.
    *      Recommended value is 0.9.
@@ -46,32 +47,33 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double beta1,
    *  - t: Timestep, starting at 0.
    *  - m: State containing the 1st moment (mean) estimate by
    *      maintaining exponential moving averages of the gradients, of
-   *      same shape as X.
+   *      same shape as `X`.
    *  - v: State containing the 2nd raw moment (uncentered variance)
    *      estimate by maintaining exponential moving averages of the
-   *      squared gradients, of same shape as X.
+   *      squared gradients, of same shape as `X`.
    *
    * Outputs:
-   *  - X: Updated parameters X, of same shape as input X.
+   *  - X: Updated parameters `X`, of same shape as input `X`.
    *  - m: Updated state containing the 1st moment (mean) estimate by
    *      maintaining exponential moving averages of the gradients, of
-   *      same shape as X.
+   *      same shape as `X`.
    *  - v: Updated state containing the 2nd raw moment (uncentered
    *      variance) estimate by maintaining exponential moving averages
-   *      of the squared gradients, of same shape as X.
+   *      of the squared gradients, of same shape as `X`.
    */
   t = t + 1
-  m = beta1 * m + (1 - beta1) * dX  # update biased 1st moment estimate
-  v = beta2 * v + (1 - beta2) * dX^2  # update biased 2nd raw moment estimate
-  #m = m / (1 - beta1^t)  # compute bias-corrected 1st moment estimate
-  #v = v / (1 - beta2^t)  # compute bias-corrected 2nd raw moment estimate
-  #X = X - lr * m / (sqrt(v) + epsilon)  # param update
+  m = beta1*m + (1-beta1)*dX  # update biased 1st moment estimate
+  v = beta2*v + (1-beta2)*dX^2  # update biased 2nd raw moment estimate
+  # m = m / (1-beta1^t)  # compute bias-corrected 1st moment estimate
+  # v = v / (1-beta2^t)  # compute bias-corrected 2nd raw moment estimate
+  # X = X - (lr * m / (sqrt(v)+epsilon))  # param update
   # Simplified for computational efficiency:
-  lr = lr * sqrt(1 - beta2^t) / (1 - beta1^t)
-  X = X - lr * m / (sqrt(v) + epsilon)
+  lr = lr * sqrt(1-beta2^t) / (1-beta1^t)
+  X = X - (lr * m / (sqrt(v)+epsilon))
 }
 
-init = function(matrix[double] X) return (matrix[double] m, matrix[double] v) {
+init = function(matrix[double] X)
+    return (matrix[double] m, matrix[double] v) {
   /*
    * Initialize the state for this optimizer.
    *
@@ -80,14 +82,14 @@ init = function(matrix[double] X) return (matrix[double] m, matrix[double] v) {
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   * 
+   *
    * Outputs:
    *  - m: Initial state containing the 1st moment (mean) estimate by
    *      maintaining exponential moving averages of the gradients, of
-   *      same shape as X.
+   *      same shape as `X`.
    *  - v: Initial state containing the 2nd raw moment (uncentered
    *      variance) estimate by maintaining exponential moving averages
-   *      of the squared gradients, of same shape as X.
+   *      of the squared gradients, of same shape as `X`.
    */
   m = matrix(0, rows=nrow(X), cols=ncol(X))
   v = matrix(0, rows=nrow(X), cols=ncol(X))
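
A sketch of a single Adam step using the recommended hyperparameter values from the docstring, with assumed toy parameters and gradients:

  source("nn/optim/adam.dml") as adam
  X = matrix("1.0 -2.0", rows=1, cols=2)
  dX = matrix("0.1 -0.3", rows=1, cols=2)
  [m, v] = adam::init(X)
  [X, m, v] = adam::update(X, dX, 0.001, 0.9, 0.999, 1e-8, 0, m, v)  # t starts at 0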

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml b/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
index e256000..80c75a0 100644
--- a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
@@ -22,6 +22,7 @@
 /*
  * RMSprop optimizer.
  */
+
 update = function(matrix[double] X, matrix[double] dX, double lr, double decay_rate,
                   double epsilon, matrix[double] cache)
     return (matrix[double] X, matrix[double] cache) {
@@ -39,26 +40,27 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double decay_r
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient of X wrt to a loss function being optimized, of
-   *      same shape as X.
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
    *  - lr: Learning rate.
    *  - decay_rate: Term controlling the rate of the moving average.
    *      Typical values are in the range of [0.9, 0.999].
    *  - epsilon: Smoothing term to avoid divide by zero errors.
    *      Typical values are in the range of [1e-8, 1e-4].
    *  - cache: State that maintains the moving average of the squared
-   *      gradients, of same shape as X.
+   *      gradients, of same shape as `X`.
    *
    * Outputs:
-   *  - X: Updated parameters X, of same shape as input X.
-   *  - v: Updated velocity of the parameters X, of same shape as
-   *      input v.
+   *  - X: Updated parameters `X`, of same shape as input `X`.
+   *  - cache: Updated state that maintains the moving average of the
+   *      squared gradients, of same shape as `X`.
    */
-  cache = decay_rate * cache + (1 - decay_rate) * dX^2
-  X = X - lr * dX / (sqrt(cache) + epsilon)
+  cache = decay_rate*cache + (1-decay_rate)*dX^2
+  X = X - (lr * dX / (sqrt(cache)+epsilon))
 }
 
-init = function(matrix[double] X) return (matrix[double] cache) {
+init = function(matrix[double] X)
+    return (matrix[double] cache) {
   /*
    * Initialize the state for this optimizer.
    *
@@ -67,10 +69,10 @@ init = function(matrix[double] X) return (matrix[double] cache) {
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   * 
+   *
    * Outputs:
    *  - cache: State that maintains the moving average of the squared
-   *      gradients, of same shape as X.
+   *      gradients, of same shape as `X`.
    */
   cache = matrix(0, rows=nrow(X), cols=ncol(X))
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/sgd.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd.dml b/scripts/staging/SystemML-NN/nn/optim/sgd.dml
index 554569a..a3fc744 100644
--- a/scripts/staging/SystemML-NN/nn/optim/sgd.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/sgd.dml
@@ -22,19 +22,21 @@
 /*
  * Stochastic Gradient Descent (SGD) optimizer.
  */
-update = function(matrix[double] X, matrix[double] dX, double lr) return (matrix[double] X) {
+
+update = function(matrix[double] X, matrix[double] dX, double lr)
+    return (matrix[double] X) {
   /*
    * Performs a vanilla SGD update.
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient of X wrt to a loss function being optimized, of
-   *      same shape as X.
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
    *  - lr: Learning rate.
    *
    * Outputs:
-   *  - X: Updated parameters X, of same shape as input X.
+   *  - X: Updated parameters `X`, of same shape as input `X`.
    */
-  X = X - lr * dX
+  X = X - lr*dX
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
index c2a441b..2cb9890 100644
--- a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
@@ -22,6 +22,7 @@
 /*
  * Stochastic Gradient Descent with momentum (SGD-momentum) optimizer.
  */
+
 update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v)
     return (matrix[double] X, matrix[double] v) {
   /*
@@ -33,25 +34,26 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double mu, mat
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient of X wrt to a loss function being optimized, of
-   *      same shape as X.
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
    *  - lr: Learning rate.
    *  - mu: Momentum value.
    *      Typical values are in the range of [0.5, 0.99], usually
    *      started at the lower end and annealed towards the higher end.
-   *  - v: State maintaining the velocity of the parameters X, of same
-   *      shape as X.
+   *  - v: State maintaining the velocity of the parameters `X`, of same
+   *      shape as `X`.
    *
    * Outputs:
-   *  - X: Updated parameters X, of same shape as input X.
-   *  - v: Updated velocity of the parameters X, of same shape as
-   *      input v.
+   *  - X: Updated parameters `X`, of same shape as input `X`.
+   *  - v: Updated velocity of the parameters `X`, of same shape as
+   *      input `X`.
    */
-  v = mu * v - lr * dX  # update velocity
+  v = mu*v - lr*dX  # update velocity
   X = X + v  # update position
 }
 
-init = function(matrix[double] X) return (matrix[double] v) {
+init = function(matrix[double] X)
+    return (matrix[double] v) {
   /*
    * Initialize the state for this optimizer.
    *
@@ -60,9 +62,9 @@ init = function(matrix[double] X) return (matrix[double] v) {
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   * 
+   *
    * Outputs:
-   *  - v: Initial velocity of the parameters X.
+   *  - v: Initial velocity of the parameters `X`.
    */
   v = matrix(0, rows=nrow(X), cols=ncol(X))
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
index 56c6ab0..fee6585 100644
--- a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
@@ -22,6 +22,7 @@
 /*
  * Stochastic Gradient Descent with Nesterov momentum (SGD-Nesterov) optimizer.
  */
+
 update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v)
     return (matrix[double] X, matrix[double] v) {
   /*
@@ -36,19 +37,20 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double mu, mat
    * store the parameters in their position after momentum.
    *
    * Reference:
-   *  - Advances in optimizing Recurrent Networks, Bengio et al., section 3.5.
+   *  - Advances in optimizing Recurrent Networks, Bengio et al.,
+   *    section 3.5.
    *    - http://arxiv.org/abs/1212.0901
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient of X wrt to a loss function being optimized, of
-   *      same shape as X.
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
    *  - lr: Learning rate.
    *  - mu: Momentum value.
    *      Typical values are in the range of [0.5, 0.99], usually
    *      started at the lower end and annealed towards the higher end.
-   *  - v: State maintaining the velocity of the parameters X, of same
-   *      shape as X.
+   *  - v: State maintaining the velocity of the parameters `X`, of same
+   *      shape as `X`.
    *
    * Outputs:
    *  - X: Updated parameters X, of same shape as input X.
@@ -56,11 +58,12 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double mu, mat
    *      input v.
    */
   v_prev = v
-  v = mu * v - lr * dX  # update velocity
-  X = X - mu * v_prev + (1 + mu) * v  # update position, including momentum
+  v = mu*v - lr*dX  # update velocity
+  X = X - mu*v_prev + (1+mu)*v  # update position, including momentum
 }
 
-init = function(matrix[double] X) return (matrix[double] v) {
+init = function(matrix[double] X)
+    return (matrix[double] v) {
   /*
    * Initialize the state for this optimizer.
    *
@@ -69,9 +72,9 @@ init = function(matrix[double] X) return (matrix[double] v) {
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   * 
+   *
    * Outputs:
-   *  - v: Initial velocity of the parameters X.
+   *  - v: Initial velocity of the parameters `X`.
    */
   v = matrix(0, rows=nrow(X), cols=ncol(X))
 }
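
A sketch of one SGD-Nesterov step with assumed toy values (momentum 0.9, learning rate 0.01):

  source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
  X = matrix("1.0 -2.0", rows=1, cols=2)
  dX = matrix("0.1 -0.3", rows=1, cols=2)
  v = sgd_nesterov::init(X)
  [X, v] = sgd_nesterov::update(X, dX, 0.01, 0.9, v)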


[7/7] incubator-systemml git commit: [MINOR] Comments and whitespace fixes.

Posted by du...@apache.org.
[MINOR] Comments and whitespace fixes.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/ac8ee2be
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/ac8ee2be
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/ac8ee2be

Branch: refs/heads/master
Commit: ac8ee2befb651ae89c481b63b4a8aa842585f7e4
Parents: 07039ca
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Fri Mar 31 18:39:19 2017 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Fri Mar 31 18:39:19 2017 -0700

----------------------------------------------------------------------
 .../staging/SystemML-NN/examples/mnist_lenet-predict.dml |  4 ++--
 .../staging/SystemML-NN/examples/mnist_lenet-train.dml   |  4 ++--
 scripts/staging/SystemML-NN/examples/mnist_lenet.dml     |  4 ++--
 .../SystemML-NN/examples/mnist_softmax-predict.dml       |  4 ++--
 .../staging/SystemML-NN/examples/mnist_softmax-train.dml |  4 ++--
 scripts/staging/SystemML-NN/examples/mnist_softmax.dml   |  4 ++--
 scripts/staging/SystemML-NN/nn/layers/affine.dml         | 11 ++++++-----
 scripts/staging/SystemML-NN/nn/layers/batch_norm.dml     |  6 +++---
 scripts/staging/SystemML-NN/nn/layers/conv2d.dml         |  4 ++--
 scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml |  4 ++--
 .../staging/SystemML-NN/nn/layers/cross_entropy_loss.dml |  6 +++---
 scripts/staging/SystemML-NN/nn/layers/dropout.dml        |  4 ++--
 scripts/staging/SystemML-NN/nn/layers/l1_loss.dml        |  4 ++--
 scripts/staging/SystemML-NN/nn/layers/l1_reg.dml         |  6 +++---
 scripts/staging/SystemML-NN/nn/layers/l2_loss.dml        |  4 ++--
 scripts/staging/SystemML-NN/nn/layers/l2_reg.dml         |  6 +++---
 scripts/staging/SystemML-NN/nn/layers/log_loss.dml       |  4 ++--
 scripts/staging/SystemML-NN/nn/layers/lstm.dml           |  4 ++--
 scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml     |  4 ++--
 .../staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml |  4 ++--
 scripts/staging/SystemML-NN/nn/layers/relu.dml           |  4 ++--
 scripts/staging/SystemML-NN/nn/layers/rnn.dml            |  4 ++--
 scripts/staging/SystemML-NN/nn/layers/sigmoid.dml        |  4 ++--
 scripts/staging/SystemML-NN/nn/layers/softmax.dml        |  4 ++--
 .../staging/SystemML-NN/nn/layers/spatial_batch_norm.dml |  6 +++---
 scripts/staging/SystemML-NN/nn/layers/tanh.dml           |  4 ++--
 scripts/staging/SystemML-NN/nn/optim/adagrad.dml         |  4 ++--
 scripts/staging/SystemML-NN/nn/optim/adam.dml            |  4 ++--
 scripts/staging/SystemML-NN/nn/optim/rmsprop.dml         |  4 ++--
 scripts/staging/SystemML-NN/nn/optim/sgd.dml             |  4 ++--
 scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml    |  4 ++--
 scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml    |  4 ++--
 scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml    |  4 ++--
 scripts/staging/SystemML-NN/nn/test/grad_check.dml       |  4 ++--
 .../staging/SystemML-NN/nn/test/max_pool2d_simple.dml    |  4 ++--
 scripts/staging/SystemML-NN/nn/test/run_tests.dml        |  4 ++--
 scripts/staging/SystemML-NN/nn/test/test.dml             |  4 ++--
 scripts/staging/SystemML-NN/nn/test/util.dml             |  4 ++--
 scripts/staging/SystemML-NN/nn/util.dml                  |  4 ++--
 39 files changed, 87 insertions(+), 86 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/examples/mnist_lenet-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_lenet-predict.dml b/scripts/staging/SystemML-NN/examples/mnist_lenet-predict.dml
index 775926c..51bb6f5 100644
--- a/scripts/staging/SystemML-NN/examples/mnist_lenet-predict.dml
+++ b/scripts/staging/SystemML-NN/examples/mnist_lenet-predict.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/examples/mnist_lenet-train.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_lenet-train.dml b/scripts/staging/SystemML-NN/examples/mnist_lenet-train.dml
index c23029f..03c3467 100644
--- a/scripts/staging/SystemML-NN/examples/mnist_lenet-train.dml
+++ b/scripts/staging/SystemML-NN/examples/mnist_lenet-train.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_lenet.dml b/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
index e2895b8..a261b41 100644
--- a/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
+++ b/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/examples/mnist_softmax-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_softmax-predict.dml b/scripts/staging/SystemML-NN/examples/mnist_softmax-predict.dml
index 52f31fd..353efd1 100644
--- a/scripts/staging/SystemML-NN/examples/mnist_softmax-predict.dml
+++ b/scripts/staging/SystemML-NN/examples/mnist_softmax-predict.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/examples/mnist_softmax-train.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_softmax-train.dml b/scripts/staging/SystemML-NN/examples/mnist_softmax-train.dml
index dff192e..fe3a9b2 100644
--- a/scripts/staging/SystemML-NN/examples/mnist_softmax-train.dml
+++ b/scripts/staging/SystemML-NN/examples/mnist_softmax-train.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/examples/mnist_softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_softmax.dml b/scripts/staging/SystemML-NN/examples/mnist_softmax.dml
index ee0d3cb..dc712f6 100644
--- a/scripts/staging/SystemML-NN/examples/mnist_softmax.dml
+++ b/scripts/staging/SystemML-NN/examples/mnist_softmax.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/affine.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/affine.dml b/scripts/staging/SystemML-NN/nn/layers/affine.dml
index f9f8559..c9a740b 100644
--- a/scripts/staging/SystemML-NN/nn/layers/affine.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/affine.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -20,14 +20,15 @@
 #-------------------------------------------------------------
 
 /*
- * Fully-connected (affine) layer.
+ * Affine (fully-connected) layer.
  */
 
 forward = function(matrix[double] X, matrix[double] W, matrix[double] b)
     return (matrix[double] out) {
   /*
-   * Computes the forward pass for a fully-connected (affine) layer with
-   * M neurons.  The input data has N examples, each with D features.
+   * Computes the forward pass for an affine (fully-connected) layer
+   * with M neurons.  The input data has N examples, each with D
+   * features.
    *
    * Inputs:
    *  - X: Inputs, of shape (N, D).

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
index 82240f7..caad100 100644
--- a/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -20,7 +20,7 @@
 #-------------------------------------------------------------
 
 /*
- * Batch normalization layer.
+ * Batch Normalization layer.
  */
 
 forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv2d.dml b/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
index 435b3cf..7aeec16 100644
--- a/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
index 29021cf..e7771ba 100644
--- a/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml b/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
index 55552e1..63db502 100644
--- a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -20,7 +20,7 @@
 #-------------------------------------------------------------
 
 /*
- * Cross-entropy loss function.
+ * Cross-Entropy loss function.
  */
 
 forward = function(matrix[double] pred, matrix[double] y)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/dropout.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/dropout.dml b/scripts/staging/SystemML-NN/nn/layers/dropout.dml
index b348642..a36878b 100644
--- a/scripts/staging/SystemML-NN/nn/layers/dropout.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/dropout.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
index 24b15e2..b74566d 100644
--- a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
index f643274..2b81c0b 100644
--- a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -20,7 +20,7 @@
 #-------------------------------------------------------------
 
 /*
- * L1 regularizataion.
+ * L1 regularization.
  */
 
 forward = function(matrix[double] X, double lambda)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
index df8bc1c..0482f25 100644
--- a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
index 5074c06..7255efe 100644
--- a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -20,7 +20,7 @@
 #-------------------------------------------------------------
 
 /*
- * L2 regularizataion.
+ * L2 regularization.
  */
 
 forward = function(matrix[double] X, double lambda)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml b/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
index 7dd85d3..15914f7 100644
--- a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/lstm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/lstm.dml b/scripts/staging/SystemML-NN/nn/layers/lstm.dml
index 44f2ef2..a75add4 100644
--- a/scripts/staging/SystemML-NN/nn/layers/lstm.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/lstm.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
index 229b7b9..ef1499a 100644
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
index be4e195..65ba71f 100644
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/relu.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/relu.dml b/scripts/staging/SystemML-NN/nn/layers/relu.dml
index 6a4c15c..93a6e90 100644
--- a/scripts/staging/SystemML-NN/nn/layers/relu.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/relu.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/rnn.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/rnn.dml b/scripts/staging/SystemML-NN/nn/layers/rnn.dml
index cdceab8..3c6faae 100644
--- a/scripts/staging/SystemML-NN/nn/layers/rnn.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/rnn.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml b/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
index 185befb..2d85adc 100644
--- a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/softmax.dml b/scripts/staging/SystemML-NN/nn/layers/softmax.dml
index 1751838..68a7bc7 100644
--- a/scripts/staging/SystemML-NN/nn/layers/softmax.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/softmax.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml b/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
index 0185a2c..6e57b05 100644
--- a/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -20,7 +20,7 @@
 #-------------------------------------------------------------
 
 /*
- * Spatial batch normalization layer.
+ * Spatial Batch Normalization layer.
  */
 source("nn/util.dml") as util
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/layers/tanh.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/tanh.dml b/scripts/staging/SystemML-NN/nn/layers/tanh.dml
index 589a574..d849d70 100644
--- a/scripts/staging/SystemML-NN/nn/layers/tanh.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/tanh.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml b/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
index 20b26c4..85b1c41 100644
--- a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/optim/adam.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/adam.dml b/scripts/staging/SystemML-NN/nn/optim/adam.dml
index 0607fa5..4b6fa2a 100644
--- a/scripts/staging/SystemML-NN/nn/optim/adam.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/adam.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml b/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
index 80c75a0..1feccaf 100644
--- a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/optim/sgd.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd.dml b/scripts/staging/SystemML-NN/nn/optim/sgd.dml
index a3fc744..3ba7eba 100644
--- a/scripts/staging/SystemML-NN/nn/optim/sgd.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/sgd.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
index 2cb9890..85922da 100644
--- a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
index fee6585..3b62c6e 100644
--- a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml b/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
index efd99c3..05f0f7d 100644
--- a/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
+++ b/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/test/grad_check.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/grad_check.dml b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
index 27f4420..1b42b67 100644
--- a/scripts/staging/SystemML-NN/nn/test/grad_check.dml
+++ b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml b/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml
index 47dab3a..dee1a48 100644
--- a/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml
+++ b/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/test/run_tests.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/run_tests.dml b/scripts/staging/SystemML-NN/nn/test/run_tests.dml
index 644662c..dc53cb9 100644
--- a/scripts/staging/SystemML-NN/nn/test/run_tests.dml
+++ b/scripts/staging/SystemML-NN/nn/test/run_tests.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/test/test.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/test.dml b/scripts/staging/SystemML-NN/nn/test/test.dml
index 64fc519..958c2c5 100644
--- a/scripts/staging/SystemML-NN/nn/test/test.dml
+++ b/scripts/staging/SystemML-NN/nn/test/test.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/test/util.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/util.dml b/scripts/staging/SystemML-NN/nn/test/util.dml
index 128e4db..e32a885 100644
--- a/scripts/staging/SystemML-NN/nn/test/util.dml
+++ b/scripts/staging/SystemML-NN/nn/test/util.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ac8ee2be/scripts/staging/SystemML-NN/nn/util.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/util.dml b/scripts/staging/SystemML-NN/nn/util.dml
index 405d208..62a90f2 100644
--- a/scripts/staging/SystemML-NN/nn/util.dml
+++ b/scripts/staging/SystemML-NN/nn/util.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY



[6/7] incubator-systemml git commit: [SYSTEMML-1453] Update Conv & Max Pooling layer names to include "2D"

Posted by du...@apache.org.
[SYSTEMML-1453] Update Conv & Max Pooling layer names to include "2D"

This renames `conv*.dml` and `max_pool*.dml` to `conv2d*.dml` and
`max_pool2d*.dml` so that 1D and 3D variants can be added in the future.

Closes #447.
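
For readers following the rename, a minimal usage sketch in DML under the new
names is given below. It is illustrative only (not part of this commit): it
assumes the `init`/`forward` signatures shown in the diffs that follow, and the
sizes (N=4, C=1, Hin=Win=28, F=8, 3x3 filters, stride 1, pad 1) are made up.

# Hypothetical usage sketch under the renamed "2D" modules
source("nn/layers/conv2d_builtin.dml") as conv2d
source("nn/layers/max_pool2d_builtin.dml") as max_pool2d

X = rand(rows=4, cols=1*28*28, pdf="normal")  # 4 examples, 1-channel 28x28 images
[W, b] = conv2d::init(8, 1, 3, 3)  # 8 filters, each of shape 1x3x3
[out, Hout, Wout] = conv2d::forward(X, W, b, 1, 28, 28, 3, 3, 1, 1, 1, 1)
[outp, Houtp, Woutp] = max_pool2d::forward(out, 8, Hout, Wout, Hf=2, Wf=2,
                                           strideh=2, stridew=2)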


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/07039caa
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/07039caa
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/07039caa

Branch: refs/heads/master
Commit: 07039caa9629dd3a26aa66c9ec860cf7f7917724
Parents: 5c59e03
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Fri Mar 31 18:39:11 2017 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Fri Mar 31 18:39:11 2017 -0700

----------------------------------------------------------------------
 projects/breast_cancer/convnet.dml              | 101 +++++----
 .../SystemML-NN/examples/mnist_lenet.dml        |  56 ++---
 scripts/staging/SystemML-NN/nn/layers/conv.dml  | 194 -----------------
 .../staging/SystemML-NN/nn/layers/conv2d.dml    | 194 +++++++++++++++++
 .../SystemML-NN/nn/layers/conv2d_builtin.dml    | 160 ++++++++++++++
 .../SystemML-NN/nn/layers/conv_builtin.dml      | 155 -------------
 .../staging/SystemML-NN/nn/layers/max_pool.dml  | 159 --------------
 .../SystemML-NN/nn/layers/max_pool2d.dml        | 159 ++++++++++++++
 .../nn/layers/max_pool2d_builtin.dml            | 103 +++++++++
 .../SystemML-NN/nn/layers/max_pool_builtin.dml  | 103 ---------
 .../SystemML-NN/nn/test/conv2d_simple.dml       | 215 +++++++++++++++++++
 .../staging/SystemML-NN/nn/test/conv_simple.dml | 215 -------------------
 .../staging/SystemML-NN/nn/test/grad_check.dml  | 170 +++++++--------
 .../SystemML-NN/nn/test/max_pool2d_simple.dml   | 172 +++++++++++++++
 .../SystemML-NN/nn/test/max_pool_simple.dml     | 172 ---------------
 .../staging/SystemML-NN/nn/test/run_tests.dml   |  16 +-
 scripts/staging/SystemML-NN/nn/test/test.dml    | 115 +++++-----
 17 files changed, 1248 insertions(+), 1211 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/projects/breast_cancer/convnet.dml
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/convnet.dml b/projects/breast_cancer/convnet.dml
index 5f115a2..85c7dd8 100644
--- a/projects/breast_cancer/convnet.dml
+++ b/projects/breast_cancer/convnet.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -24,11 +24,11 @@
  */
 # Imports
 source("nn/layers/affine.dml") as affine
-source("nn/layers/conv_builtin.dml") as conv
+source("nn/layers/conv2d_builtin.dml") as conv2d
 source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
 source("nn/layers/dropout.dml") as dropout
 source("nn/layers/l2_reg.dml") as l2_reg
-source("nn/layers/max_pool_builtin.dml") as max_pool
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
 source("nn/layers/relu.dml") as relu
 source("nn/layers/softmax.dml") as softmax
 #source("nn/optim/adam.dml") as adam
@@ -96,9 +96,9 @@ train = function(matrix[double] X, matrix[double] Y,
   F3 = 32  # num conv filters in conv3
   N1 = 512  # num nodes in affine1
   # Note: affine2 has K nodes, which is equal to the number of target dimensions (num classes)
-  [Wc1, bc1] = conv::init(F1, C, Hf, Wf)  # inputs: (N, C*Hin*Win)
-  [Wc2, bc2] = conv::init(F2, F1, Hf, Wf)  # inputs: (N, F1*(Hin/2)*(Win/2))
-  [Wc3, bc3] = conv::init(F3, F2, Hf, Wf)  # inputs: (N, F2*(Hin/2^2)*(Win/2^2))
+  [Wc1, bc1] = conv2d::init(F1, C, Hf, Wf)  # inputs: (N, C*Hin*Win)
+  [Wc2, bc2] = conv2d::init(F2, F1, Hf, Wf)  # inputs: (N, F1*(Hin/2)*(Win/2))
+  [Wc3, bc3] = conv2d::init(F3, F2, Hf, Wf)  # inputs: (N, F2*(Hin/2^2)*(Win/2^2))
   [Wa1, ba1] = affine::init(F3*(Hin/2^3)*(Win/2^3), N1)  # inputs: (N, F3*(Hin/2^3)*(Win/2^3))
   [Wa2, ba2] = affine::init(N1, K)  # inputs: (N, N1)
   Wa2 = Wa2 / sqrt(2)  # different initialization, since being fed into softmax, instead of relu
@@ -145,17 +145,23 @@ train = function(matrix[double] X, matrix[double] Y,
 
       # Compute forward pass
       ## conv layer 1: conv1 -> relu1 -> pool1
-      [outc1, Houtc1, Woutc1] = conv::forward(X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf,
+                                                stride, stride, pad, pad)
       outc1r = relu::forward(outc1)
-      [outc1p, Houtc1p, Woutc1p] = max_pool::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2, strideh=2, stridew=2) 
+      [outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+                                                       strideh=2, stridew=2)
       ## conv layer 2: conv2 -> relu2 -> pool2
-      [outc2, Houtc2, Woutc2] = conv::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad)
+      [outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
+                                                stride, stride, pad, pad)
       outc2r = relu::forward(outc2)
-      [outc2p, Houtc2p, Woutc2p] = max_pool::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2, strideh=2, stridew=2) 
+      [outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+                                                       strideh=2, stridew=2)
       ## conv layer 3: conv3 -> relu3 -> pool3
-      [outc3, Houtc3, Woutc3] = conv::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad)
+      [outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
+                                                stride, stride, pad, pad)
       outc3r = relu::forward(outc3)
-      [outc3p, Houtc3p, Woutc3p] = max_pool::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2, strideh=2, stridew=2) 
+      [outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
+                                                       strideh=2, stridew=2)
       ## affine layer 1:  affine1 -> relu1 -> dropout1
       outa1 = affine::forward(outc3p, Wa1, ba1)
       outa1r = relu::forward(outa1)
@@ -176,17 +182,23 @@ train = function(matrix[double] X, matrix[double] Y,
       douta1 = relu::backward(douta1r, outa1)
       [doutc3p, dWa1, dba1] = affine::backward(douta1, outc3p, Wa1, ba1)
       ## conv layer 3: conv3 -> relu3 -> pool3
-      doutc3r = max_pool::backward(doutc3p, Houtc3p, Woutc3p, outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2, strideh=2, stridew=2)
+      doutc3r = max_pool2d::backward(doutc3p, Houtc3p, Woutc3p, outc3r, F3, Houtc3, Woutc3,
+                                     Hf=2, Wf=2, strideh=2, stridew=2)
       doutc3 = relu::backward(doutc3r, outc3)
-      [doutc2p, dWc3, dbc3] = conv::backward(doutc3, Houtc3, Woutc3, outc2p, Wc3, bc2, F2, Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad)
+      [doutc2p, dWc3, dbc3] = conv2d::backward(doutc3, Houtc3, Woutc3, outc2p, Wc3, bc2, F2,
+                                               Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad)
       ## conv layer 2: conv2 -> relu2 -> pool2
-      doutc2r = max_pool::backward(doutc2p, Houtc2p, Woutc2p, outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2, strideh=2, stridew=2)
+      doutc2r = max_pool2d::backward(doutc2p, Houtc2p, Woutc2p, outc2r, F2, Houtc2, Woutc2,
+                                     Hf=2, Wf=2, strideh=2, stridew=2)
       doutc2 = relu::backward(doutc2r, outc2)
-      [doutc1p, dWc2, dbc2] = conv::backward(doutc2, Houtc2, Woutc2, outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad)
+      [doutc1p, dWc2, dbc2] = conv2d::backward(doutc2, Houtc2, Woutc2, outc1p, Wc2, bc2, F1,
+                                               Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad)
       ## conv layer 1: conv1 -> relu1 -> pool1
-      doutc1r = max_pool::backward(doutc1p, Houtc1p, Woutc1p, outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2, strideh=2, stridew=2)
+      doutc1r = max_pool2d::backward(doutc1p, Houtc1p, Woutc1p, outc1r, F1, Houtc1, Woutc1,
+                                     Hf=2, Wf=2, strideh=2, stridew=2)
       doutc1 = relu::backward(doutc1r, outc1)
-      [dX_batch, dWc1, dbc1] = conv::backward(doutc1, Houtc1, Woutc1, X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      [dX_batch, dWc1, dbc1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, Wc1, bc1, C,
+                                                Hin, Win, Hf, Wf, stride, stride, pad, pad)
 
       # Compute regularization backward pass
       dWc1_reg = l2_reg::backward(Wc1, lambda)
@@ -222,7 +234,7 @@ train = function(matrix[double] X, matrix[double] Y,
       #[ba1, mba1, vba1] = adam::update(ba1, dba1, lr, beta1, beta2, eps, t, mba1, vba1)
       #[Wa2, mWa2, vWa2] = adam::update(Wa2, dWa2, lr, beta1, beta2, eps, t, mWa2, vWa2)
       #[ba2, mba2, vba2] = adam::update(ba2, dba2, lr, beta1, beta2, eps, t, mba2, vba2)
-        
+
       # Compute loss & accuracy for training & validation data every `log_interval` iterations.
       if (i %% log_interval == 0) {
         # Compute training loss & accuracy
@@ -348,7 +360,8 @@ predict = function(matrix[double] X, int C, int Hin, int Win,
   N = nrow(X)
 
   # Network:
-  # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> conv3 -> relu3 -> pool3 -> affine1 -> relu1 -> affine2 -> softmax
+  # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> conv3 -> relu3 -> pool3
+  #  -> affine1 -> relu1 -> affine2 -> softmax
   Hf = 3  # filter height
   Wf = 3  # filter width
   stride = 1
@@ -365,17 +378,23 @@ predict = function(matrix[double] X, int C, int Hin, int Win,
   # so that it can be efficiently used for parallel predictions.
   ## Compute forward pass
   ### conv layer 1: conv1 -> relu1 -> pool1
-  #[outc1, Houtc1, Woutc1] = conv::forward(X, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  #[outc1, Houtc1, Woutc1] = conv2d::forward(X, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride,
+  #                                          pad, pad)
   #outc1r = relu::forward(outc1)
-  #[outc1p, Houtc1p, Woutc1p] = max_pool::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2, strideh=2, stridew=2) 
+  #[outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+  #                                                 strideh=2, stridew=2)
   ### conv layer 2: conv2 -> relu2 -> pool2
-  #[outc2, Houtc2, Woutc2] = conv::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad)
+  #[outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
+  #                                          stride, stride, pad, pad)
   #outc2r = relu::forward(outc2)
-  #[outc2p, Houtc2p, Woutc2p] = max_pool::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2, strideh=2, stridew=2) 
+  #[outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+  #                                                 strideh=2, stridew=2)
   ### conv layer 3: conv3 -> relu3 -> pool3
-  #[outc3, Houtc3, Woutc3] = conv::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad)
+  #[outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
+  #                                          stride, stride, pad, pad)
   #outc3r = relu::forward(outc3)
-  #[outc3p, Houtc3p, Woutc3p] = max_pool::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2, strideh=2, stridew=2) 
+  #[outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
+  #                                                 strideh=2, stridew=2)
   ### affine layer 1:  affine1 -> relu1 -> dropout
   #outa1 = affine::forward(outc3p, Wa1, ba1)
   #outa1r = relu::forward(outa1)
@@ -398,17 +417,23 @@ predict = function(matrix[double] X, int C, int Hin, int Win,
 
     # Compute forward pass
     ## conv layer 1: conv1 -> relu1 -> pool1
-    [outc1, Houtc1, Woutc1] = conv::forward(X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+    [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf,
+                                              stride, stride, pad, pad)
     outc1r = relu::forward(outc1)
-    [outc1p, Houtc1p, Woutc1p] = max_pool::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2, strideh=2, stridew=2) 
+    [outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+                                                     strideh=2, stridew=2)
     ## conv layer 2: conv2 -> relu2 -> pool2
-    [outc2, Houtc2, Woutc2] = conv::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad)
+    [outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
+                                              stride, stride, pad, pad)
     outc2r = relu::forward(outc2)
-    [outc2p, Houtc2p, Woutc2p] = max_pool::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2, strideh=2, stridew=2) 
+    [outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+                                                     strideh=2, stridew=2)
     ## conv layer 3: conv3 -> relu3 -> pool3
-    [outc3, Houtc3, Woutc3] = conv::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad)
+    [outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
+                                              stride, stride, pad, pad)
     outc3r = relu::forward(outc3)
-    [outc3p, Houtc3p, Woutc3p] = max_pool::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2, strideh=2, stridew=2) 
+    [outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
+                                                     strideh=2, stridew=2)
     ## affine layer 1:  affine1 -> relu1 -> dropout
     outa1 = affine::forward(outc3p, Wa1, ba1)
     outa1r = relu::forward(outa1)
@@ -433,7 +458,7 @@ eval = function(matrix[double] probs, matrix[double] Y)
    *
    * Inputs:
    *  - probs: Class probabilities, of shape (N, K).
-   *  - Y: Target matrix, of shape (N, 
+   *  - Y: Target matrix, of shape (N,
    *
    * Outputs:
    *  - loss: Scalar loss, of shape (1).
@@ -448,7 +473,7 @@ eval = function(matrix[double] probs, matrix[double] Y)
 generate_dummy_data = function()
     return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) {
   /*
-   * Generate a dummy dataset similar to the MNIST dataset.
+   * Generate a dummy dataset similar to the breast cancer dataset.
    *
    * Outputs:
    *  - X: Input data matrix, of shape (N, D).
@@ -459,9 +484,9 @@ generate_dummy_data = function()
    */
   # Generate dummy input data
   N = 1024  # num examples
-  C = 1  # num input channels
-  Hin = 64  # input height
-  Win = 64  # input width
+  C = 3  # num input channels
+  Hin = 256  # input height
+  Win = 256  # input width
   K = 3  # num target classes
   X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
   classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_lenet.dml b/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
index f991487..e2895b8 100644
--- a/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
+++ b/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
@@ -24,11 +24,11 @@
  */
 # Imports
 source("nn/layers/affine.dml") as affine
-source("nn/layers/conv_builtin.dml") as conv
+source("nn/layers/conv2d_builtin.dml") as conv2d
 source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
 source("nn/layers/dropout.dml") as dropout
 source("nn/layers/l2_reg.dml") as l2_reg
-source("nn/layers/max_pool_builtin.dml") as max_pool
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
 source("nn/layers/relu.dml") as relu
 source("nn/layers/softmax.dml") as softmax
 source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
@@ -81,8 +81,8 @@ train = function(matrix[double] X, matrix[double] y,
   N3 = 512  # num nodes in affine3
   # Note: affine4 has K nodes, which is equal to the number of target dimensions (num classes)
 
-  [W1, b1] = conv::init(F1, C, Hf, Wf)  # inputs: (N, C*Hin*Win)
-  [W2, b2] = conv::init(F2, F1, Hf, Wf)  # inputs: (N, F1*(Hin/2)*(Win/2))
+  [W1, b1] = conv2d::init(F1, C, Hf, Wf)  # inputs: (N, C*Hin*Win)
+  [W2, b2] = conv2d::init(F2, F1, Hf, Wf)  # inputs: (N, F1*(Hin/2)*(Win/2))
   [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3)  # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
   [W4, b4] = affine::init(N3, K)  # inputs: (N, N3)
   W4 = W4 / sqrt(2)  # different initialization, since being fed into softmax, instead of relu
@@ -114,17 +114,17 @@ train = function(matrix[double] X, matrix[double] y,
 
       # Compute forward pass
       ## layer 1: conv1 -> relu1 -> pool1
-      [outc1, Houtc1, Woutc1] = conv::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
-                                              pad, pad)
+      [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
+                                                pad, pad)
       outr1 = relu::forward(outc1)
-      [outp1, Houtp1, Woutp1] = max_pool::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
-                                                  strideh=2, stridew=2, pad=0, pad=0)
+      [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+                                                    strideh=2, stridew=2, pad=0, pad=0)
       ## layer 2: conv2 -> relu2 -> pool2
-      [outc2, Houtc2, Woutc2] = conv::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
-                                              stride, stride, pad, pad)
+      [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
+                                                stride, stride, pad, pad)
       outr2 = relu::forward(outc2)
-      [outp2, Houtp2, Woutp2] = max_pool::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
-                                                  strideh=2, stridew=2, pad=0, pad=0)
+      [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+                                                    strideh=2, stridew=2, pad=0, pad=0)
       ## layer 3:  affine3 -> relu3 -> dropout
       outa3 = affine::forward(outp2, W3, b3)
       outr3 = relu::forward(outa3)
@@ -165,17 +165,17 @@ train = function(matrix[double] X, matrix[double] y,
       douta3 = relu::backward(doutr3, outa3)
       [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
       ## layer 2: conv2 -> relu2 -> pool2
-      doutr2 = max_pool::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
-                                  strideh=2, stridew=2, pad=0, pad=0)
+      doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+                                    strideh=2, stridew=2, pad=0, pad=0)
       doutc2 = relu::backward(doutr2, outc2)
-      [doutp1, dW2, db2] = conv::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
-                                          Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
+      [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
+                                            Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
       ## layer 1: conv1 -> relu1 -> pool1
-      doutr1 = max_pool::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
-                                  strideh=2, stridew=2, pad=0, pad=0)
+      doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+                                    strideh=2, stridew=2, pad=0, pad=0)
       doutc1 = relu::backward(doutr1, outc1)
-      [dX_batch, dW1, db1] = conv::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win,
-                                            Hf, Wf, stride, stride, pad, pad)
+      [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win,
+                                              Hf, Wf, stride, stride, pad, pad)
 
       # Compute regularization backward pass
       dW1_reg = l2_reg::backward(W1, lambda)
@@ -260,17 +260,17 @@ predict = function(matrix[double] X, int C, int Hin, int Win,
 
     # Compute forward pass
     ## layer 1: conv1 -> relu1 -> pool1
-    [outc1, Houtc1, Woutc1] = conv::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
-                                            pad, pad)
+    [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
+                                              pad, pad)
     outr1 = relu::forward(outc1)
-    [outp1, Houtp1, Woutp1] = max_pool::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
-                                                strideh=2, stridew=2, pad=0, pad=0)
+    [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+                                                  strideh=2, stridew=2, pad=0, pad=0)
     ## layer 2: conv2 -> relu2 -> pool2
-    [outc2, Houtc2, Woutc2] = conv::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
-                                            stride, stride, pad, pad)
+    [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
+                                              stride, stride, pad, pad)
     outr2 = relu::forward(outc2)
-    [outp2, Houtp2, Woutp2] = max_pool::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
-                                                strideh=2, stridew=2, pad=0, pad=0)
+    [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+                                                  strideh=2, stridew=2, pad=0, pad=0)
     ## layer 3:  affine3 -> relu3
     outa3 = affine::forward(outp2, W3, b3)
     outr3 = relu::forward(outa3)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/layers/conv.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv.dml b/scripts/staging/SystemML-NN/nn/layers/conv.dml
deleted file mode 100644
index 435b3cf..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/conv.dml
+++ /dev/null
@@ -1,194 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Convolutional layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
-                   int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial convolutional layer with
-   * F filters.  The input data has N examples, each represented as a 3D
-   * volume unrolled into a single vector.
-   *
-   * This implementation uses `im2col` internally for each image to
-   * extract local image regions (patches) into columns, and then
-   * performs a matrix multiplication with the filters to compute the
-   * output maps.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      For same output height as input, set `padh = (Hf - 1) / 2`,
-   *      assuming `strideh = 1`.
-   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
-   *      preserves the spatial dimensions of the input.
-   *  - padw: Padding for left and right sides.
-   *      For same output width as input, set `padw = (Wf - 1) / 2`,
-   *      assuming `stridew = 1`.
-   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
-   *      preserves the spatial dimensions of the input.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  F = nrow(W)
-  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
-  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
-
-  # Create output volume
-  out = matrix(0, rows=N, cols=F*Hout*Wout)
-
-  # Convolution - im2col implementation
-  parfor (n in 1:N) {  # all examples
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
-
-    # Pad image
-    Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0)  # shape (C, (Hin+2*padh)*(Win+2*padw))
-
-    # Extract local image patches into columns with im2col, of shape (C*Hf*Wf, Hout*Wout)
-    Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
-
-    # Convolve patches with filters
-    outn = W %*% Xn_padded_cols + b  # shape (F, Hout*Wout)
-    out[n,] = matrix(outn, rows=1, cols=F*Hout*Wout)  # reshape
-  }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout,
-                    matrix[double] X, matrix[double] W, matrix[double] b,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
-  /*
-   * Computes the backward pass for a 2D spatial convolutional layer
-   * with F filters.
-   *
-   * This implementation uses `im2col` and `col2im` internally.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *  - padw: Padding for left and right sides.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
-   *  - db: Gradient wrt `b`, of shape (F, 1).
-   */
-  N = nrow(X)
-  F = nrow(W)
-
-  # Create gradient volumes
-  # Note: Create convenience gradient volumes for dW and db that will
-  # allow for one gradient to be stored per example, allowing for
-  # parallel computation at the expense of memory.  We will reduce at
-  # the end.
-  dX = matrix(0, rows=N, cols=C*Hin*Win)
-  dWN = matrix(0, rows=N, cols=F*C*Hf*Wf)  # dW = matrix(0, rows=F, cols=C*Hf*Wf)
-  dbN = matrix(0, rows=N, cols=F)  # db = matrix(0, rows=F, cols=1)
-
-  # Partial derivatives for convolution - im2col implementation
-  parfor (n in 1:N) {  # all examples
-    doutn = matrix(dout[n,], rows=F, cols=Hout*Wout)
-
-    # Compute dW
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
-    Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0)  # shape (C, (Hin+2*padh)*(Win+2*padw))
-    Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
-    # dW = dW + doutn %*% t(Xn_padded_cols)
-    dWN[n,] = matrix(doutn %*% t(Xn_padded_cols), rows=1, cols=F*C*Hf*Wf)
-
-    # Compute db
-    # db = db + rowSums(doutn)
-    dbN[n,] = matrix(rowSums(doutn), rows=1, cols=F)
-
-    # Compute dX
-    dXn_padded_cols = t(W) %*% doutn  # shape (C*Hf*Wf, Hout*Wout)
-    dXn_padded = util::col2im(dXn_padded_cols, C, Hin+2*padh, Win+2*padw, Hf, Wf,
-                              strideh, stridew, "add")
-    dXn = util::unpad_image(dXn_padded, Hin, Win, padh, padw)
-    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)  # reshape
-  }
-
-  # Reduce convenience gradient volumes with one gradient per example
-  # into single gradients for W and b.
-  dW = matrix(colSums(dWN), rows=F, cols=C*Hf*Wf)
-  db = matrix(colSums(dbN), rows=F, cols=1)
-}
-
-init = function(int F, int C, int Hf, int Wf)
-    return (matrix[double] W, matrix[double] b) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * We use the heuristic by He et al., which limits the magnification
-   * of inputs/gradients during forward/backward passes by scaling
-   * unit-Gaussian weights by a factor of sqrt(2/n), under the
-   * assumption of relu neurons.
-   *  - http://arxiv.org/abs/1502.01852
-   *
-   * Inputs:
-   *  - F: Number of filters.
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *
-   * Outputs:
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   */
-  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
-  b = matrix(0, rows=F, cols=1)
-}
-

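The backward pass above stores one flattened gradient per example (dWN, dbN) so the parfor over examples can run independently, and only reduces to a single dW/db at the end. A minimal DML sketch of that reduction step, with sizes that are illustrative assumptions rather than values from the commit:

# Sketch: per-example gradients stored as rows, reduced to single gradients.
N = 2                                             # examples (assumed)
F = 2                                             # filters (assumed)
CHfWf = 3                                         # flattened filter size C*Hf*Wf (assumed)
dWN = rand(rows=N, cols=F*CHfWf)                  # one flattened dW per example
dbN = rand(rows=N, cols=F)                        # one db per example
dW = matrix(colSums(dWN), rows=F, cols=CHfWf)     # sum over examples, reshape to (F, C*Hf*Wf)
db = matrix(colSums(dbN), rows=F, cols=1)         # sum over examples, reshape to (F, 1)
print("dW: " + nrow(dW) + " x " + ncol(dW) + ", db: " + nrow(db) + " x " + ncol(db))

The extra memory is N times the size of dW and db, which is the price paid for removing the cross-example dependency inside the parfor.
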
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv2d.dml b/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
new file mode 100644
index 0000000..435b3cf
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
@@ -0,0 +1,194 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Convolutional layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
+                   int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial convolutional layer with
+   * F filters.  The input data has N examples, each represented as a 3D
+   * volume unrolled into a single vector.
+   *
+   * This implementation uses `im2col` internally for each image to
+   * extract local image regions (patches) into columns, and then
+   * performs a matrix multiplication with the filters to compute the
+   * output maps.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      For same output height as input, set `padh = (Hf - 1) / 2`,
+   *      assuming `strideh = 1`.
+   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+   *      preserves the spatial dimensions of the input.
+   *  - padw: Padding for left and right sides.
+   *      For same output width as input, set `padw = (Wf - 1) / 2`,
+   *      assuming `stridew = 1`.
+   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+   *      preserves the spatial dimensions of the input.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  F = nrow(W)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
+
+  # Create output volume
+  out = matrix(0, rows=N, cols=F*Hout*Wout)
+
+  # Convolution - im2col implementation
+  parfor (n in 1:N) {  # all examples
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
+
+    # Pad image
+    Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0)  # shape (C, (Hin+2*padh)*(Win+2*padw))
+
+    # Extract local image patches into columns with im2col, of shape (C*Hf*Wf, Hout*Wout)
+    Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
+
+    # Convolve patches with filters
+    outn = W %*% Xn_padded_cols + b  # shape (F, Hout*Wout)
+    out[n,] = matrix(outn, rows=1, cols=F*Hout*Wout)  # reshape
+  }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout,
+                    matrix[double] X, matrix[double] W, matrix[double] b,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
+  /*
+   * Computes the backward pass for a 2D spatial convolutional layer
+   * with F filters.
+   *
+   * This implementation uses `im2col` and `col2im` internally.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *  - padw: Padding for left and right sides.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+   *  - db: Gradient wrt `b`, of shape (F, 1).
+   */
+  N = nrow(X)
+  F = nrow(W)
+
+  # Create gradient volumes
+  # Note: Create convenience gradient volumes for dW and db that will
+  # allow for one gradient to be stored per example, allowing for
+  # parallel computation at the expense of memory.  We will reduce at
+  # the end.
+  dX = matrix(0, rows=N, cols=C*Hin*Win)
+  dWN = matrix(0, rows=N, cols=F*C*Hf*Wf)  # dW = matrix(0, rows=F, cols=C*Hf*Wf)
+  dbN = matrix(0, rows=N, cols=F)  # db = matrix(0, rows=F, cols=1)
+
+  # Partial derivatives for convolution - im2col implementation
+  parfor (n in 1:N) {  # all examples
+    doutn = matrix(dout[n,], rows=F, cols=Hout*Wout)
+
+    # Compute dW
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
+    Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0)  # shape (C, (Hin+2*padh)*(Win+2*padw))
+    Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
+    # dW = dW + doutn %*% t(Xn_padded_cols)
+    dWN[n,] = matrix(doutn %*% t(Xn_padded_cols), rows=1, cols=F*C*Hf*Wf)
+
+    # Compute db
+    # db = db + rowSums(doutn)
+    dbN[n,] = matrix(rowSums(doutn), rows=1, cols=F)
+
+    # Compute dX
+    dXn_padded_cols = t(W) %*% doutn  # shape (C*Hf*Wf, Hout*Wout)
+    dXn_padded = util::col2im(dXn_padded_cols, C, Hin+2*padh, Win+2*padw, Hf, Wf,
+                              strideh, stridew, "add")
+    dXn = util::unpad_image(dXn_padded, Hin, Win, padh, padw)
+    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)  # reshape
+  }
+
+  # Reduce convenience gradient volumes with one gradient per example
+  # into single gradients for W and b.
+  dW = matrix(colSums(dWN), rows=F, cols=C*Hf*Wf)
+  db = matrix(colSums(dbN), rows=F, cols=1)
+}
+
+init = function(int F, int C, int Hf, int Wf)
+    return (matrix[double] W, matrix[double] b) {
+  /*
+   * Initialize the parameters of this layer.
+   *
+   * Note: This is just a convenience function, and parameters
+   * may be initialized manually if needed.
+   *
+   * We use the heuristic by He et al., which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * unit-Gaussian weights by a factor of sqrt(2/n), under the
+   * assumption of relu neurons.
+   *  - http://arxiv.org/abs/1502.01852
+   *
+   * Inputs:
+   *  - F: Number of filters.
+   *  - C: Number of input channels (dimensionality of depth).
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *
+   * Outputs:
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   */
+  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
+  b = matrix(0, rows=F, cols=1)
+}
+

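For reference, a minimal end-to-end usage sketch of this layer; the toy sizes and the 1-pixel "same" padding are assumptions for illustration, while the function signatures follow the file above:

source("nn/layers/conv2d.dml") as conv2d

N = 4                                    # examples (assumed)
C = 3                                    # input channels (assumed)
Hin = 8                                  # input height (assumed)
Win = 8                                  # input width (assumed)
F = 2                                    # filters (assumed)
Hf = 3                                   # filter height (assumed)
Wf = 3                                   # filter width (assumed)
X = rand(rows=N, cols=C*Hin*Win)                        # inputs, shape (N, C*Hin*Win)
[W, b] = conv2d::init(F, C, Hf, Wf)                     # He-initialized weights, zero biases
[out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)
dout = rand(rows=N, cols=F*Hout*Wout)                   # stand-in for an upstream gradient
[dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)
print("Hout=" + Hout + ", Wout=" + Wout)                # expect 8 x 8 with stride 1, padding 1
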
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
new file mode 100644
index 0000000..29021cf
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
@@ -0,0 +1,160 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Convolutional layer.
+ *
+ * This implementation uses a built-in operator for higher performance.
+ */
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
+                   int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial convolutional layer with
+   * F filters.  The input data has N examples, each represented as a 3D
+   * volume unrolled into a single vector.
+   *
+   * This implementation uses a built-in operator for higher
+   * performance.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      For same output height as input, set `padh = (Hf - 1) / 2`,
+   *      assuming `strideh = 1`.
+   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+   *      preserves the spatial dimensions of the input.
+   *  - padw: Padding for left and right sides.
+   *      For same output width as input, set `padw = (Wf - 1) / 2`,
+   *      assuming `stridew = 1`.
+   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+   *      preserves the spatial dimensions of the input.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  F = nrow(W)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
+
+  # Convolution - built-in implementation
+  out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
+               stride=[strideh,stridew], padding=[padh,padw])
+
+  # Add bias term to each output filter
+  out = bias_add(out, b)
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout,
+                    matrix[double] X, matrix[double] W, matrix[double] b,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
+  /*
+   * Computes the backward pass for a 2D spatial convolutional layer
+   * with F filters.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      For same output height as input, set `padh = (Hf - 1) / 2`,
+   *      assuming `strideh = 1`.
+   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+   *      preserves the spatial dimensions of the input.
+   *  - padw: Padding for left and right sides.
+   *      For same output width as input, set `padw = (Wf - 1) / 2`,
+   *      assuming `stridew = 1`.
+   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+   *      preserves the spatial dimensions of the input.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+   *  - db: Gradient wrt `b`, of shape (F, 1).
+   */
+  N = nrow(X)
+  F = nrow(W)
+
+  # Partial derivatives for convolution - built-in implementation
+  dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw],
+                              input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
+  dX = conv2d_backward_data(W, dout, stride=[strideh, stridew], padding=[padh,padw],
+                            input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
+
+  # Partial derivatives for bias vector
+  db = rowSums(matrix(colSums(dout), rows=F, cols=Hout*Wout))
+}
+
+init = function(int F, int C, int Hf, int Wf)
+    return (matrix[double] W, matrix[double] b) {
+  /*
+   * Initialize the parameters of this layer.
+   *
+   * Note: This is just a convenience function, and parameters
+   * may be initialized manually if needed.
+   *
+   * We use the heuristic by He et al., which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * unit-Gaussian weights by a factor of sqrt(2/n), under the
+   * assumption of relu neurons.
+   *  - http://arxiv.org/abs/1502.01852
+   *
+   * Inputs:
+   *  - F: Number of filters.
+   *  - C: Number of input channels (dimensionality of depth).
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *
+   * Outputs:
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   */
+  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
+  b = matrix(0, rows=F, cols=1)
+}
+

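As a quick check of the padding notes above: with strideh = 1 the general formula reduces to (Hf - 1) / 2, and a hypothetical 28x28 input with a 3x3 filter keeps its spatial size:

Hin = 28                                                # assumed input height
Hf = 3                                                  # assumed filter height
strideh = 1
padh = (Hin*(strideh-1) + Hf - strideh) / 2             # = (Hf-1)/2 = 1 when strideh = 1
Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)      # = 28, same as the input height
print("padh=" + padh + ", Hout=" + Hout)
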
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
deleted file mode 100644
index c2b809e..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
+++ /dev/null
@@ -1,155 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Convolutional layer.
- */
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
-                   int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial convolutional layer with
-   * F filters.  The input data has N examples, each represented as a 3D
-   * volume unrolled into a single vector.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      For same output height as input, set `padh = (Hf - 1) / 2`,
-   *      assuming `strideh = 1`.
-   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
-   *      preserves the spatial dimensions of the input.
-   *  - padw: Padding for left and right sides.
-   *      For same output width as input, set `padw = (Wf - 1) / 2`,
-   *      assuming `stridew = 1`.
-   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
-   *      preserves the spatial dimensions of the input.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  F = nrow(W)
-  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
-  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
-
-  # Convolution - built-in implementation
-  out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
-               stride=[strideh,stridew], padding=[padh,padw])
-
-  # Add bias term to each output filter
-  out = bias_add(out, b)
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout,
-                    matrix[double] X, matrix[double] W, matrix[double] b,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
-  /*
-   * Computes the backward pass for a 2D spatial convolutional layer
-   * with F filters.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      For same output height as input, set `padh = (Hf - 1) / 2`,
-   *      assuming `strideh = 1`.
-   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
-   *      preserves the spatial dimensions of the input.
-   *  - padw: Padding for left and right sides.
-   *      For same output width as input, set `padw = (Wf - 1) / 2`,
-   *      assuming `stridew = 1`.
-   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
-   *      preserves the spatial dimensions of the input.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
-   *  - db: Gradient wrt `b`, of shape (F, 1).
-   */
-  N = nrow(X)
-  F = nrow(W)
-
-  # Partial derivatives for convolution - built-in implementation
-  dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw],
-                              input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
-  dX = conv2d_backward_data(W, dout, stride=[strideh, stridew], padding=[padh,padw],
-                            input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
-
-  # Partial derivatives for bias vector
-  db = rowSums(matrix(colSums(dout), rows=F, cols=Hout*Wout))
-}
-
-init = function(int F, int C, int Hf, int Wf)
-    return (matrix[double] W, matrix[double] b) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * We use the heuristic by He et al., which limits the magnification
-   * of inputs/gradients during forward/backward passes by scaling
-   * unit-Gaussian weights by a factor of sqrt(2/n), under the
-   * assumption of relu neurons.
-   *  - http://arxiv.org/abs/1502.01852
-   *
-   * Inputs:
-   *  - F: Number of filters.
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *
-   * Outputs:
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   */
-  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
-  b = matrix(0, rows=F, cols=1)
-}
-

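The init functions in these layers all apply the same He et al. scaling to unit-Gaussian weights. As a worked example with assumed filter sizes C = 3 and Hf = Wf = 3, the fan-in is n = C*Hf*Wf = 27 and the scale is sqrt(2/27) ~= 0.272:

C = 3                                                    # assumed channels
Hf = 3                                                   # assumed filter height
Wf = 3                                                   # assumed filter width
F = 8                                                    # assumed number of filters
scale = sqrt(2.0/(C*Hf*Wf))                              # sqrt(2/27) ~= 0.272
W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * scale     # unit-Gaussian weights, rescaled
print("He init scale: " + scale)
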
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
deleted file mode 100644
index a12877f..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
+++ /dev/null
@@ -1,159 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Max pooling layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * This implementation uses `im2col` internally for each image to
-   * extract local image regions (patches) of each channel slice into
-   * columns, and then performs max pooling over the patches to compute
-   * the output maps.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
-  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
-  pad_value = -1/0  # in max pooling we pad with -infinity
-
-  # Create output volume
-  out = matrix(0, rows=N, cols=C*Hout*Wout)
-
-  # Max pooling - im2col implementation
-  parfor (n in 1:N) {  # all examples
-    img = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
-
-    if (padh > 0 | padw > 0) {
-      # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
-      img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
-    }
-
-    img_maxes = matrix(0, rows=C, cols=Hout*Wout)  # zeros
-    parfor (c in 1:C) {  # all channels
-      # Extract local image slice patches into columns with im2col, of shape (Hf*Wf, Hout*Wout)
-      img_slice_cols = util::im2col(img[c,], Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
-
-      # Max pooling on patches
-      img_maxes[c,] = colMaxs(img_slice_cols)
-    }
-
-    out[n,] = matrix(img_maxes, rows=1, cols=C*Hout*Wout)
-  }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   */
-  N = nrow(X)
-  pad_value = -1/0  # in max pooling we pad with -infinity
-
-  # Create gradient volume
-  dX = matrix(0, rows=N, cols=C*Hin*Win)
-
-  # Gradient of max pooling
-  parfor (n in 1:N, check=0) {  # all examples
-    img = matrix(X[n,], rows=C, cols=Hin*Win)
-    if (padh > 0 | padw > 0) {
-      # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
-      img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
-    }
-
-    dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
-    parfor (c in 1:C, check=0) {  # all channels
-      img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
-      for (hout in 1:Hout, check=0) {  # all output rows
-        hin = (hout-1)*strideh + 1
-        for (wout in 1:Wout) {  # all output columns
-          win = (wout-1)*stridew + 1
-          img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
-          max_val_ind = img_slice_patch == max(img_slice_patch)  # max value indicator matrix
-          # gradient passes through only for the max value(s) in this patch
-          dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
-          dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
-                                                   + dimg_slice_patch
-        }
-      }
-      dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
-    }
-
-    if (padh > 0 | padw > 0) {
-      # Unpad image gradient
-      dimg = util::unpad_image(dimg, Hin, Win, padh, padw)  # shape (C, (Hin+2*padh)*(Win+2*padw))
-    }
-    dX[n,] = matrix(dimg, rows=1, cols=C*Hin*Win)
-  }
-}
-

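Padding with -infinity (pad_value = -1/0) rather than 0 ensures a padded cell can never win the max, even when every real activation in a patch is negative. A tiny illustrative check with made-up values:

pad_value = -1/0                                  # -infinity
real_vals = matrix(-0.3, rows=1, cols=2)          # pretend activations, all negative
padded = matrix(pad_value, rows=1, cols=2)        # pretend padding cells
patch = cbind(real_vals, padded)                  # a 1x4 "patch" with two padded cells
print("patch max: " + max(patch))                 # -0.3: the padded cells never win the max
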
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
new file mode 100644
index 0000000..229b7b9
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
@@ -0,0 +1,159 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Max Pooling layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * This implementation uses `im2col` internally for each image to
+   * extract local image regions (patches) of each channel slice into
+   * columns, and then performs max pooling over the patches to compute
+   * the output maps.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
+  pad_value = -1/0  # in max pooling we pad with -infinity
+
+  # Create output volume
+  out = matrix(0, rows=N, cols=C*Hout*Wout)
+
+  # Max pooling - im2col implementation
+  parfor (n in 1:N) {  # all examples
+    img = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
+
+    if (padh > 0 | padw > 0) {
+      # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
+      img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
+    }
+
+    img_maxes = matrix(0, rows=C, cols=Hout*Wout)  # zeros
+    parfor (c in 1:C) {  # all channels
+      # Extract local image slice patches into columns with im2col, of shape (Hf*Wf, Hout*Wout)
+      img_slice_cols = util::im2col(img[c,], Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
+
+      # Max pooling on patches
+      img_maxes[c,] = colMaxs(img_slice_cols)
+    }
+
+    out[n,] = matrix(img_maxes, rows=1, cols=C*Hout*Wout)
+  }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   */
+  N = nrow(X)
+  pad_value = -1/0  # in max pooling we pad with -infinity
+
+  # Create gradient volume
+  dX = matrix(0, rows=N, cols=C*Hin*Win)
+
+  # Gradient of max pooling
+  parfor (n in 1:N, check=0) {  # all examples
+    img = matrix(X[n,], rows=C, cols=Hin*Win)
+    if (padh > 0 | padw > 0) {
+      # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
+      img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
+    }
+
+    dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
+    parfor (c in 1:C, check=0) {  # all channels
+      img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
+      for (hout in 1:Hout, check=0) {  # all output rows
+        hin = (hout-1)*strideh + 1
+        for (wout in 1:Wout) {  # all output columns
+          win = (wout-1)*stridew + 1
+          img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
+          max_val_ind = img_slice_patch == max(img_slice_patch)  # max value indicator matrix
+          # gradient passes through only for the max value(s) in this patch
+          dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
+          dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
+                                                   + dimg_slice_patch
+        }
+      }
+      dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
+    }
+
+    if (padh > 0 | padw > 0) {
+      # Unpad image gradient
+      dimg = util::unpad_image(dimg, Hin, Win, padh, padw)  # shape (C, Hin*Win)
+    }
+    dX[n,] = matrix(dimg, rows=1, cols=C*Hin*Win)
+  }
+}
+

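The backward pass above routes each upstream gradient only to the location(s) that achieved the max in the corresponding patch, via the indicator matrix max_val_ind. A small hypothetical patch makes the routing explicit:

patch = matrix("1 5 3 2", rows=2, cols=2)      # pretend pooled input patch; its max is 5
dout_val = 0.7                                  # pretend upstream gradient for this output cell
max_val_ind = patch == max(patch)               # 0/1 indicator of the max location(s)
dpatch = max_val_ind * dout_val                 # gradient flows only to the max entry
print(toString(dpatch))                         # 0.7 at the position of 5, zeros elsewhere

If several entries tie for the max, the gradient is replicated to each of them, which is exactly what the indicator-matrix formulation in the layer does.
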
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
new file mode 100644
index 0000000..be4e195
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
@@ -0,0 +1,103 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Max Pooling layer.
+ *
+ * This implementation uses a built-in operator for higher performance.
+ */
+
+forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * This implementation uses a built-in operator for higher
+   * performance.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
+
+  # Max pooling - built-in implementation
+  out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
+                 stride=[strideh,stridew], padding=[padh,padw])
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   */
+  N = nrow(X)
+
+  # Gradient of max pooling
+  dX = max_pool_backward(X, dout, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
+                         stride=[strideh,stridew], padding=[padh,padw])
+}
+

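A minimal usage sketch for the built-in pooling layer; the toy sizes (2x2 pooling with stride 2 and no padding) are assumptions for illustration, while the signatures follow the file above:

source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin

N = 4                                     # examples (assumed)
C = 3                                     # channels (assumed)
Hin = 8                                   # input height (assumed)
Win = 8                                   # input width (assumed)
X = rand(rows=N, cols=C*Hin*Win)
[out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
dout = rand(rows=N, cols=C*Hout*Wout)     # stand-in for an upstream gradient
dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
print("Hout=" + Hout + ", Wout=" + Wout)  # expect 4 x 4 for 8x8 inputs, 2x2 pooling, stride 2
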
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
deleted file mode 100644
index f1cb863..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
+++ /dev/null
@@ -1,103 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Max pooling layer.
- */
-
-forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * This implementation uses `im2col` internally for each image to
-   * extract local image regions (patches) of each channel slice into
-   * columns, and then performs max pooling over the patches to compute
-   * the output maps.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  Hout = as.integer((Hin-Hf)/strideh + 1)
-  Wout = as.integer((Win-Wf)/stridew + 1)
-
-  # Max pooling - built-in implementation
-  out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
-                 stride=[strideh,stridew], padding=[padh,padw])
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   */
-  N = nrow(X)
-
-  # Gradient of max pooling
-  dX = max_pool_backward(X, dout, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
-                         stride=[strideh,stridew], padding=[padh,padw])
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml b/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
new file mode 100644
index 0000000..efd99c3
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
@@ -0,0 +1,215 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Convolutional layer.
+ *
+ * This implementation is intended to be a simple, reference version.
+ */
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
+                   int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial convolutional layer with
+   * F filters.  The input data has N examples, each represented as a 3D
+   * volume unrolled into a single vector.
+   *
+   * This implementation is intended to be a simple, reference version.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *  - padw: Padding for left and right sides.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  F = nrow(W)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
+
+  # Create output volume
+  out = matrix(0, rows=N, cols=F*Hout*Wout)
+
+  # Convolution - Simple reference implementation
+  parfor (n in 1:N) {  # all examples
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+    # Pad image
+    Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
+    parfor (c in 1:C) {
+      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
+      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+    }
+    # Convolve image with filters
+    parfor (f in 1:F, check=0) {  # all filters
+      parfor (hout in 1:Hout, check=0) {  # all output rows
+        h0 = (hout-1)*strideh + 1
+        parfor (wout in 1:Wout, check=0) {  # all output columns
+          w0 = (wout-1)*stridew + 1
+          # Create a patch of the input example corresponding spatially to the filter sizes
+          Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
+          parfor (c in 1:C, check=0) {
+            Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)  # reshape
+            Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf], rows=1,
+                                         cols=Hf*Wf)  # reshape
+          }
+          out[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout] =
+              W[f,] %*% matrix(Xn_padded_patch, rows=C*Hf*Wf, cols=1) + b[f,]
+        }
+      }
+    }
+  }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout,
+                    matrix[double] X, matrix[double] W, matrix[double] b,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
+  /*
+   * Computes the backward pass for a 2D spatial convolutional layer
+   * with F filters.
+   *
+   * This implementation is intended to be a simple, reference version.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *  - padw: Padding for left and right sides.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+   *  - db: Gradient wrt `b`, of shape (F, 1).
+   */
+  N = nrow(X)
+  F = nrow(W)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
+
+  # Create gradient volumes
+  dX = matrix(0, rows=N, cols=C*Hin*Win)
+  dW = matrix(0, rows=F, cols=C*Hf*Wf)
+  db = matrix(0, rows=F, cols=1)
+
+  # Partial derivatives for convolution - Simple reference implementation
+  for (n in 1:N) {  # all examples
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+    # Pad image
+    Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
+    parfor (c in 1:C) {
+      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
+      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+    }
+    dXn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
+    for (f in 1:F) {  # all filters
+      for (hout in 1:Hout) {  # all output rows
+        h0 = (hout-1) * strideh + 1
+        for (wout in 1:Wout) {  # all output columns
+          w0 = (wout-1) * stridew + 1
+          # Create a patch of the input example corresponding spatially to the filter sizes
+          Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
+          dXn_padded_patch = matrix(W[f,] * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout],
+                                    rows=C, cols=Hf*Wf)  # reshape
+          for (c in 1:C) {
+            Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)  # reshape
+            Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf],
+                                         rows=1, cols=Hf*Wf)  # reshape
+            dXn_padded_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
+            dXn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf] = matrix(dXn_padded_patch[c,],
+                                                              rows=Hf, cols=Wf)  # reshape
+            dXn_padded[c,] = dXn_padded[c,] + matrix(dXn_padded_slice,
+                                                     rows=1, cols=(Hin+2*padh)*(Win+2*padw))
+          }
+          dW[f,] = dW[f,]
+                   + matrix(Xn_padded_patch, rows=1, cols=C*Hf*Wf)
+                   * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
+          db[f,] = db[f,] + dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
+        }
+      }
+    }
+    # Unpad derivs on input
+    dXn = matrix(0, rows=C, cols=Hin*Win)
+    parfor (c in 1:C, check=0) {
+      dXn_padded_slice = matrix(dXn_padded[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
+      dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
+      dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
+    }
+    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
+  }
+}
+
+init = function(int F, int C, int Hf, int Wf)
+    return (matrix[double] W, matrix[double] b) {
+  /*
+   * Initialize the parameters of this layer.
+   *
+   * We use the heuristic by He et al., which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * unit-Gaussian weights by a factor of sqrt(2/n), under the
+   * assumption of relu neurons.
+   *  - http://arxiv.org/abs/1502.01852
+   *
+   * Inputs:
+   *  - F: Number of filters.
+   *  - C: Number of input channels (dimensionality of depth).
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *
+   * Outputs:
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   */
+  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
+  b = matrix(0, rows=F, cols=1)
+}
+

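The *_simple files under nn/test are reference versions against which the faster implementations can be compared. A hedged sketch of such a forward-pass comparison, with toy sizes assumed for illustration:

source("nn/test/conv2d_simple.dml") as conv2d_simple
source("nn/layers/conv2d_builtin.dml") as conv2d_builtin

N = 2                                     # examples (assumed)
C = 2                                     # channels (assumed)
Hin = 5                                   # input height (assumed)
Win = 5                                   # input width (assumed)
F = 3                                     # filters (assumed)
Hf = 3                                    # filter height (assumed)
Wf = 3                                    # filter width (assumed)
X = rand(rows=N, cols=C*Hin*Win)
[W, b] = conv2d_simple::init(F, C, Hf, Wf)
[out_ref, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)
[out_fast, Hout2, Wout2] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)
print("max abs diff: " + max(abs(out_ref - out_fast)))  # should be ~0 up to numerical error
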

[5/7] incubator-systemml git commit: [SYSTEMML-1453] Update Conv & Max Pooling layer names to include "2D"

Posted by du...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/test/conv_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/conv_simple.dml b/scripts/staging/SystemML-NN/nn/test/conv_simple.dml
deleted file mode 100644
index efd99c3..0000000
--- a/scripts/staging/SystemML-NN/nn/test/conv_simple.dml
+++ /dev/null
@@ -1,215 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Convolutional layer.
- *
- * This implementation is intended to be a simple, reference version.
- */
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
-                   int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial convolutional layer with
-   * F filters.  The input data has N examples, each represented as a 3D
-   * volume unrolled into a single vector.
-   *
-   * This implementation is intended to be a simple, reference version.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *  - padw: Padding for left and right sides.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  F = nrow(W)
-  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
-  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
-
-  # Create output volume
-  out = matrix(0, rows=N, cols=F*Hout*Wout)
-
-  # Convolution - Simple reference implementation
-  parfor (n in 1:N) {  # all examples
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
-    # Pad image
-    Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
-    parfor (c in 1:C) {
-      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
-      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
-      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
-    }
-    # Convolve image with filters
-    parfor (f in 1:F, check=0) {  # all filters
-      parfor (hout in 1:Hout, check=0) {  # all output rows
-        h0 = (hout-1)*strideh + 1
-        parfor (wout in 1:Wout, check=0) {  # all output columns
-          w0 = (wout-1)*stridew + 1
-          # Create a patch of the input example corresponding spatially to the filter sizes
-          Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
-          parfor (c in 1:C, check=0) {
-            Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)  # reshape
-            Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf], rows=1,
-                                         cols=Hf*Wf)  # reshape
-          }
-          out[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout] =
-              W[f,] %*% matrix(Xn_padded_patch, rows=C*Hf*Wf, cols=1) + b[f,]
-        }
-      }
-    }
-  }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout,
-                    matrix[double] X, matrix[double] W, matrix[double] b,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
-  /*
-   * Computes the backward pass for a 2D spatial convolutional layer
-   * with F filters.
-   *
-   * This implementation is intended to be a simple, reference version.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *  - padw: Padding for left and right sides.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
-   *  - db: Gradient wrt `b`, of shape (F, 1).
-   */
-  N = nrow(X)
-  F = nrow(W)
-  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
-  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
-
-  # Create gradient volumes
-  dX = matrix(0, rows=N, cols=C*Hin*Win)
-  dW = matrix(0, rows=F, cols=C*Hf*Wf)
-  db = matrix(0, rows=F, cols=1)
-
-  # Partial derivatives for convolution - Simple reference implementation
-  for (n in 1:N) {  # all examples
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
-    # Pad image
-    Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
-    parfor (c in 1:C) {
-      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
-      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
-      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
-    }
-    dXn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
-    for (f in 1:F) {  # all filters
-      for (hout in 1:Hout) {  # all output rows
-        h0 = (hout-1) * strideh + 1
-        for (wout in 1:Wout) {  # all output columns
-          w0 = (wout-1) * stridew + 1
-          # Create a patch of the input example corresponding spatially to the filter sizes
-          Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
-          dXn_padded_patch = matrix(W[f,] * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout],
-                                    rows=C, cols=Hf*Wf)  # reshape
-          for (c in 1:C) {
-            Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)  # reshape
-            Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf],
-                                         rows=1, cols=Hf*Wf)  # reshape
-            dXn_padded_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
-            dXn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf] = matrix(dXn_padded_patch[c,],
-                                                              rows=Hf, cols=Wf)  # reshape
-            dXn_padded[c,] = dXn_padded[c,] + matrix(dXn_padded_slice,
-                                                     rows=1, cols=(Hin+2*padh)*(Win+2*padw))
-          }
-          dW[f,] = dW[f,]
-                   + matrix(Xn_padded_patch, rows=1, cols=C*Hf*Wf)
-                   * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
-          db[f,] = db[f,] + dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
-        }
-      }
-    }
-    # Unpad derivs on input
-    dXn = matrix(0, rows=C, cols=Hin*Win)
-    parfor (c in 1:C, check=0) {
-      dXn_padded_slice = matrix(dXn_padded[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
-      dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
-      dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
-    }
-    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
-  }
-}
-
-init = function(int F, int C, int Hf, int Wf)
-    return (matrix[double] W, matrix[double] b) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * We use the heuristic by He et al., which limits the magnification
-   * of inputs/gradients during forward/backward passes by scaling
-   * unit-Gaussian weights by a factor of sqrt(2/n), under the
-   * assumption of relu neurons.
-   *  - http://arxiv.org/abs/1502.01852
-   *
-   * Inputs:
-   *  - F: Number of filters.
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *
-   * Outputs:
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   */
-  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
-  b = matrix(0, rows=F, cols=1)
-}
-

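For orientation, this reference layer (now `nn/test/conv2d_simple.dml` after the rename) is
exercised the same way as the other conv implementations. A minimal usage sketch, with small toy
dimensions chosen purely for illustration:

  source("nn/test/conv2d_simple.dml") as conv2d_simple

  # Toy dimensions (hypothetical values, not taken from the tests)
  N = 2       # num examples
  C = 3       # num input channels
  Hin = 5     # input height
  Win = 5     # input width
  F = 4       # num filters
  Hf = 3      # filter height
  Wf = 3      # filter width
  stride = 1
  pad = 1

  X = rand(rows=N, cols=C*Hin*Win)
  [W, b] = conv2d_simple::init(F, C, Hf, Wf)

  # Forward: with stride 1 and pad 1, Hout = (5 + 2*1 - 3)/1 + 1 = 5, so spatial size is preserved
  [out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf,
                                             stride, stride, pad, pad)

  # Backward, given an upstream gradient of the same shape as `out`
  dout = rand(rows=N, cols=F*Hout*Wout)
  [dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
                                         stride, stride, pad, pad)
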
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/test/grad_check.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/grad_check.dml b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
index ba9a317..27f4420 100644
--- a/scripts/staging/SystemML-NN/nn/test/grad_check.dml
+++ b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
@@ -24,8 +24,8 @@
  */
 source("nn/layers/affine.dml") as affine
 source("nn/layers/batch_norm.dml") as batch_norm
-source("nn/layers/conv.dml") as conv
-source("nn/layers/conv_builtin.dml") as conv_builtin
+source("nn/layers/conv2d.dml") as conv2d
+source("nn/layers/conv2d_builtin.dml") as conv2d_builtin
 source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
 source("nn/layers/dropout.dml") as dropout
 source("nn/layers/l1_loss.dml") as l1_loss
@@ -34,16 +34,16 @@ source("nn/layers/l2_loss.dml") as l2_loss
 source("nn/layers/l2_reg.dml") as l2_reg
 source("nn/layers/log_loss.dml") as log_loss
 source("nn/layers/lstm.dml") as lstm
-source("nn/layers/max_pool.dml") as max_pool
-source("nn/layers/max_pool_builtin.dml") as max_pool_builtin
+source("nn/layers/max_pool2d.dml") as max_pool2d
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
 source("nn/layers/relu.dml") as relu
 source("nn/layers/rnn.dml") as rnn
 source("nn/layers/sigmoid.dml") as sigmoid
 source("nn/layers/softmax.dml") as softmax
 source("nn/layers/spatial_batch_norm.dml") as spatial_batch_norm
 source("nn/layers/tanh.dml") as tanh
-source("nn/test/conv_simple.dml") as conv_simple
-source("nn/test/max_pool_simple.dml") as max_pool_simple
+source("nn/test/conv2d_simple.dml") as conv2d_simple
+source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
 source("nn/test/util.dml") as test_util
 
 affine = function() {
@@ -229,11 +229,11 @@ batch_norm = function() {
   }
 }
 
-conv = function() {
+conv2d = function() {
   /*
-   * Gradient check for the convolutional layer using `im2col`.
+   * Gradient check for the 2D convolutional layer using `im2col`.
    */
-  print("Grad checking the `im2col` convolutional layer with L2 loss.")
+  print("Grad checking the `im2col` 2D convolutional layer with L2 loss.")
 
   # Generate data
   N = 2  # num examples
@@ -249,13 +249,13 @@ conv = function() {
   y = rand(rows=N, cols=F*Hin*Win)
 
   # Create layers
-  [W, b] = conv::init(F, C, Hf, Wf)
+  [W, b] = conv2d::init(F, C, Hf, Wf)
 
   # Compute analytical gradients of loss wrt parameters
-  [out, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
   dout = l2_loss::backward(out, y)
-  [dX, dW, db] = conv::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                pad, pad)
+  [dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                  pad, pad)
 
   # Grad check
   h = 1e-5
@@ -265,10 +265,10 @@ conv = function() {
       # Compute numerical derivative
       old = as.scalar(X[i,j])
       X[i,j] = old - h
-      [outmh, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
       lossmh = l2_loss::forward(outmh, y)
       X[i,j] = old + h
-      [outph, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
       lossph = l2_loss::forward(outph, y)
       X[i,j] = old  # reset
       dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
@@ -284,10 +284,10 @@ conv = function() {
       # Compute numerical derivative
       old = as.scalar(W[i,j])
       W[i,j] = old - h
-      [outmh, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
       lossmh = l2_loss::forward(outmh, y)
       W[i,j] = old + h
-      [outph, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
       lossph = l2_loss::forward(outph, y)
       W[i,j] = old  # reset
       dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
@@ -303,10 +303,10 @@ conv = function() {
       # Compute numerical derivative
       old = as.scalar(b[i,j])
       b[i,j] = old - h
-      [outmh, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
       lossmh = l2_loss::forward(outmh, y)
       b[i,j] = old + h
-      [outph, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
       lossph = l2_loss::forward(outph, y)
       b[i,j] = old  # reset
       db_num = (lossph-lossmh) / (2*h)  # numerical derivative
@@ -317,12 +317,12 @@ conv = function() {
   }
 }
 
-conv_builtin = function() {
+conv2d_builtin = function() {
   /*
-   * Gradient check for the convolutional layer using built-in
+   * Gradient check for the 2D convolutional layer using built-in
    * functions.
    */
-  print("Grad checking the built-in convolutional layer with L2 loss.")
+  print("Grad checking the built-in 2D convolutional layer with L2 loss.")
 
   # Generate data
   N = 2  # num examples
@@ -338,13 +338,14 @@ conv_builtin = function() {
   y = rand(rows=N, cols=F*Hin*Win)
 
   # Create layers
-  [W, b] = conv_builtin::init(F, C, Hf, Wf)
+  [W, b] = conv2d_builtin::init(F, C, Hf, Wf)
 
   # Compute analytical gradients of loss wrt parameters
-  [out, Hout, Wout] = conv_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  [out, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                              pad, pad)
   dout = l2_loss::backward(out, y)
-  [dX, dW, db] = conv_builtin::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
-                                        stride, stride, pad, pad)
+  [dX, dW, db] = conv2d_builtin::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
+                                          stride, stride, pad, pad)
 
   # Grad check
   h = 1e-5
@@ -354,12 +355,12 @@ conv_builtin = function() {
       # Compute numerical derivative
       old = as.scalar(X[i,j])
       X[i,j] = old - h
-      [outmh, Hout, Wout] = conv_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                  pad, pad)
+      [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
       lossmh = l2_loss::forward(outmh, y)
       X[i,j] = old + h
-      [outph, Hout, Wout] = conv_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                  pad, pad)
+      [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
       lossph = l2_loss::forward(outph, y)
       X[i,j] = old  # reset
       dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
@@ -375,12 +376,12 @@ conv_builtin = function() {
       # Compute numerical derivative
       old = as.scalar(W[i,j])
       W[i,j] = old - h
-      [outmh, Hout, Wout] = conv_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                  pad, pad)
+      [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
       lossmh = l2_loss::forward(outmh, y)
       W[i,j] = old + h
-      [outph, Hout, Wout] = conv_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                  pad, pad)
+      [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
       lossph = l2_loss::forward(outph, y)
       W[i,j] = old  # reset
       dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
@@ -396,12 +397,12 @@ conv_builtin = function() {
       # Compute numerical derivative
       old = as.scalar(b[i,j])
       b[i,j] = old - h
-      [outmh, Hout, Wout] = conv_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                  pad, pad)
+      [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
       lossmh = l2_loss::forward(outmh, y)
       b[i,j] = old + h
-      [outph, Hout, Wout] = conv_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                  pad, pad)
+      [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
       lossph = l2_loss::forward(outph, y)
       b[i,j] = old  # reset
       db_num = (lossph-lossmh) / (2*h)  # numerical derivative
@@ -412,11 +413,11 @@ conv_builtin = function() {
   }
 }
 
-conv_simple = function() {
+conv2d_simple = function() {
   /*
-   * Gradient check for the simple reference convolutional layer.
+   * Gradient check for the simple reference 2D convolutional layer.
    */
-  print("Grad checking the simple reference convolutional layer with L2 loss.")
+  print("Grad checking the simple reference 2D convolutional layer with L2 loss.")
 
   # Generate data
   N = 2  # num examples
@@ -432,13 +433,13 @@ conv_simple = function() {
   y = rand(rows=N, cols=F*Hin*Win)
 
   # Create layers
-  [W, b] = conv_simple::init(F, C, Hf, Wf)
+  [W, b] = conv2d_simple::init(F, C, Hf, Wf)
 
   # Compute analytical gradients of loss wrt parameters
-  [out, Hout, Wout] = conv_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  [out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
   dout = l2_loss::backward(out, y)
-  [dX, dW, db] = conv_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
-                                       stride, stride, pad, pad)
+  [dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
+                                         stride, stride, pad, pad)
 
   # Grad check
   h = 1e-5
@@ -448,12 +449,12 @@ conv_simple = function() {
       # Compute numerical derivative
       old = as.scalar(X[i,j])
       X[i,j] = old - h
-      [outmh, Hout, Wout] = conv_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                 pad, pad)
+      [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
       lossmh = l2_loss::forward(outmh, y)
       X[i,j] = old + h
-      [outph, Hout, Wout] = conv_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                 pad, pad)
+      [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
       lossph = l2_loss::forward(outph, y)
       X[i,j] = old  # reset
       dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
@@ -469,12 +470,12 @@ conv_simple = function() {
       # Compute numerical derivative
       old = as.scalar(W[i,j])
       W[i,j] = old - h
-      [outmh, Hout, Wout] = conv_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                 pad, pad)
+      [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
       lossmh = l2_loss::forward(outmh, y)
       W[i,j] = old + h
-      [outph, Hout, Wout] = conv_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                 pad, pad)
+      [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
       lossph = l2_loss::forward(outph, y)
       W[i,j] = old  # reset
       dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
@@ -490,12 +491,12 @@ conv_simple = function() {
       # Compute numerical derivative
       old = as.scalar(b[i,j])
       b[i,j] = old - h
-      [outmh, Hout, Wout] = conv_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                 pad, pad)
+      [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
       lossmh = l2_loss::forward(outmh, y)
       b[i,j] = old + h
-      [outph, Hout, Wout] = conv_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                 pad, pad)
+      [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
       lossph = l2_loss::forward(outph, y)
       b[i,j] = old  # reset
       db_num = (lossph-lossmh) / (2*h)  # numerical derivative
@@ -898,11 +899,11 @@ lstm = function() {
   }
 }
 
-max_pool = function() {
+max_pool2d = function() {
   /*
-   * Gradient check for the max pooling layer.
+   * Gradient check for the 2D max pooling layer.
    */
-  print("Grad checking the max pooling layer with L2 loss.")
+  print("Grad checking the 2D max pooling layer with L2 loss.")
 
   # Generate data
   N = 2  # num examples
@@ -921,9 +922,9 @@ max_pool = function() {
     y = rand(rows=N, cols=C*Hout*Wout)
 
     # Compute analytical gradients of loss wrt parameters
-    [out, Hout, Wout] = max_pool::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+    [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
     dout = l2_loss::backward(out, y)
-    dX = max_pool::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+    dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
 
     # Grad check
     h = 1e-5
@@ -932,10 +933,10 @@ max_pool = function() {
         # Compute numerical derivative
         old = as.scalar(X[i,j])
         X[i,j] = old - h
-        [outmh, Hout, Wout] = max_pool::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+        [outmh, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
         lossmh = l2_loss::forward(outmh, y)
         X[i,j] = old + h
-        [outph, Hout, Wout] = max_pool::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+        [outph, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
         lossph = l2_loss::forward(outph, y)
         X[i,j] = old  # reset
         dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
@@ -947,11 +948,11 @@ max_pool = function() {
   }
 }
 
-max_pool_builtin = function() {
+max_pool2d_builtin = function() {
   /*
-   * Gradient check for the max pooling layer.
+   * Gradient check for the 2D max pooling layer.
    */
-  print("Grad checking the built-in max pooling layer with L2 loss.")
+  print("Grad checking the built-in 2D max pooling layer with L2 loss.")
 
   # Generate data
   N = 2  # num examples
@@ -970,10 +971,11 @@ max_pool_builtin = function() {
     y = rand(rows=N, cols=C*Hout*Wout)
 
     # Compute analytical gradients of loss wrt parameters
-    [out, Hout, Wout] = max_pool_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+    [out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
     dout = l2_loss::backward(out, y)
-    dX = max_pool_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
-                                    pad, pad)
+    dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
+                                      pad, pad)
 
     # Grad check
     h = 1e-5
@@ -982,12 +984,12 @@ max_pool_builtin = function() {
         # Compute numerical derivative
         old = as.scalar(X[i,j])
         X[i,j] = old - h
-        [outmh, Hout, Wout] = max_pool_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
-                                                        pad, pad)
+        [outmh, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+                                                          pad, pad)
         lossmh = l2_loss::forward(outmh, y)
         X[i,j] = old + h
-        [outph, Hout, Wout] = max_pool_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
-                                                        pad, pad)
+        [outph, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+                                                          pad, pad)
         lossph = l2_loss::forward(outph, y)
         X[i,j] = old  # reset
         dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
@@ -999,11 +1001,11 @@ max_pool_builtin = function() {
   }
 }
 
-max_pool_simple = function() {
+max_pool2d_simple = function() {
   /*
-   * Gradient check for the simple reference max pooling layer.
+   * Gradient check for the simple reference 2D max pooling layer.
    */
-  print("Grad checking the simple reference max pooling layer with L2 loss.")
+  print("Grad checking the simple reference 2D max pooling layer with L2 loss.")
 
   # Generate data
   N = 2  # num examples
@@ -1022,10 +1024,10 @@ max_pool_simple = function() {
     y = rand(rows=N, cols=C*Hout*Wout)
 
     # Compute analytical gradients of loss wrt parameters
-    [out, Hout, Wout] = max_pool_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+    [out, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
     dout = l2_loss::backward(out, y)
-    dX = max_pool_simple::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
-                                   pad, pad)
+    dX = max_pool2d_simple::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
+                                     pad, pad)
 
     # Grad check
     h = 1e-5
@@ -1034,12 +1036,12 @@ max_pool_simple = function() {
         # Compute numerical derivative
         old = as.scalar(X[i,j])
         X[i,j] = old - h
-        [outmh, Hout, Wout] = max_pool_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
-                                                       pad, pad)
+        [outmh, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+                                                         pad, pad)
         lossmh = l2_loss::forward(outmh, y)
         X[i,j] = old + h
-        [outph, Hout, Wout] = max_pool_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
-                                                       pad, pad)
+        [outph, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+                                                         pad, pad)
         lossph = l2_loss::forward(outph, y)
         X[i,j] = old  # reset
         dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

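All of the grad checks above share the same recipe: perturb a single entry, recompute the loss on
either side, and compare the symmetric finite difference (lossph - lossmh) / (2*h) against the
corresponding analytical gradient entry. A self-contained toy sketch of the idea on f(x) = x^2
(values are arbitrary; the real checks delegate the comparison to `test_util::check_rel_error`,
whose internal thresholds are not reproduced here):

  x = 3.0
  h = 1e-5

  dfdx_analytical = 2*x           # known derivative of x^2
  fmh = (x-h)^2                   # f(x - h)
  fph = (x+h)^2                   # f(x + h)
  dfdx_num = (fph-fmh) / (2*h)    # symmetric difference, O(h^2) error

  rel_error = abs(dfdx_analytical-dfdx_num) / max(abs(dfdx_analytical), abs(dfdx_num))
  print("Relative error: " + rel_error)
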
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml b/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml
new file mode 100644
index 0000000..47dab3a
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml
@@ -0,0 +1,172 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Max Pooling layer.
+ *
+ * This implementation is intended to be a simple, reference version.
+ */
+
+forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * This implementation is intended to be a simple, reference version.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
+
+  # Create output volume
+  out = matrix(0, rows=N, cols=C*Hout*Wout)
+
+  # Max pooling
+  parfor (n in 1:N, check=0) {  # all examples
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+
+    # Pad image
+    pad_value = -1/0  # -infinity, so padded cells never win the max
+    Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
+    parfor (c in 1:C) {
+      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
+      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+    }
+    img = Xn_padded  # shape (C, (Hin+2*padh)*(Win+2*padw))
+
+    parfor (c in 1:C, check=0) {  # all channels
+      img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      parfor (hout in 1:Hout, check=0) {  # all output rows
+        hin = (hout-1) * strideh + 1
+        parfor (wout in 1:Wout, check=0) {  # all output columns
+          win = (wout-1) * stridew + 1
+          out[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout] = max(img_slice[hin:hin+Hf-1,
+                                                               win:win+Wf-1])
+        }
+      }
+    }
+  }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   */
+  N = nrow(X)
+
+  # Create gradient volume
+  dX = matrix(0, rows=N, cols=C*Hin*Win)
+
+  # Gradient of max pooling
+  for (n in 1:N) {  # all examples
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+
+    # Pad image
+    pad_value = -1/0  # -infinity, so padded cells never win the max
+    Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
+    parfor (c in 1:C) {
+      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
+      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+    }
+    img = Xn_padded
+
+    dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
+    for (c in 1:C) {  # all channels
+      img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
+      for (hout in 1:Hout) {  # all output rows
+        hin = (hout-1) * strideh + 1
+        for (wout in 1:Wout) {  # all output columns
+          win = (wout-1) * stridew + 1
+          img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
+          max_val_ind = img_slice_patch == max(img_slice_patch)  # max value indicator matrix
+          # gradient passes through only for the max value(s) in this patch
+          dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
+          dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
+                                                   + dimg_slice_patch
+        }
+      }
+      dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
+    }
+
+    # Unpad derivs on input
+    dXn = matrix(0, rows=C, cols=Hin*Win)
+    parfor (c in 1:C, check=0) {
+      dXn_padded_slice = matrix(dimg[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
+      dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
+      dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
+    }
+    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
+  }
+}
+

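In the backward pass above, the incoming gradient is routed only to the cell(s) that produced the
max, via the `max_val_ind` indicator matrix; if a patch contains ties, every tied cell receives the
full upstream gradient. A tiny standalone sketch of that indicator logic, using a made-up 2x2 patch:

  img_slice_patch = matrix("1 3 2 3", rows=2, cols=2)    # toy patch with a tie at the max
  max_val_ind = img_slice_patch == max(img_slice_patch)  # 1 where the value equals the patch max
  d = 0.5                                                # hypothetical upstream gradient
  dimg_slice_patch = max_val_ind * d                     # yields [[0, 0.5], [0, 0.5]]
  print(toString(dimg_slice_patch))
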
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/test/max_pool_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/max_pool_simple.dml b/scripts/staging/SystemML-NN/nn/test/max_pool_simple.dml
deleted file mode 100644
index 786b0a1..0000000
--- a/scripts/staging/SystemML-NN/nn/test/max_pool_simple.dml
+++ /dev/null
@@ -1,172 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Max pooling layer.
- *
- * This implementation is intended to be a simple, reference version.
- */
-
-forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * This implementation is intended to be a simple, reference version.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
-  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
-
-  # Create output volume
-  out = matrix(0, rows=N, cols=C*Hout*Wout)
-
-  # Max pooling
-  parfor (n in 1:N, check=0) {  # all examples
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
-
-    # Pad image
-    pad_value = -1/0
-    Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
-    parfor (c in 1:C) {
-      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
-      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
-      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
-    }
-    img = Xn_padded  # shape (C, (Hin+2*padh)*(Win+2*padw))
-
-    parfor (c in 1:C, check=0) {  # all channels
-      img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      parfor (hout in 1:Hout, check=0) {  # all output rows
-        hin = (hout-1) * strideh + 1
-        parfor (wout in 1:Wout, check=0) {  # all output columns
-          win = (wout-1) * stridew + 1
-          out[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout] = max(img_slice[hin:hin+Hf-1,
-                                                               win:win+Wf-1])
-        }
-      }
-    }
-  }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   */
-  N = nrow(X)
-
-  # Create gradient volume
-  dX = matrix(0, rows=N, cols=C*Hin*Win)
-
-  # Gradient of max pooling
-  for (n in 1:N) {  # all examples
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
-
-    # Pad image
-    pad_value = -1/0
-    Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
-    parfor (c in 1:C) {
-      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
-      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
-      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
-    }
-    img = Xn_padded
-
-    dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
-    for (c in 1:C) {  # all channels
-      img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
-      for (hout in 1:Hout, check=0) {  # all output rows
-        hin = (hout-1) * strideh + 1
-        for (wout in 1:Wout) {  # all output columns
-          win = (wout-1) * stridew + 1
-          img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
-          max_val_ind = img_slice_patch == max(img_slice_patch)  # max value indicator matrix
-          # gradient passes through only for the max value(s) in this patch
-          dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
-          dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
-                                                   + dimg_slice_patch
-        }
-      }
-      dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
-    }
-
-    # Unpad derivs on input
-    dXn = matrix(0, rows=C, cols=Hin*Win)
-    parfor (c in 1:C, check=0) {
-      dXn_padded_slice = matrix(dimg[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
-      dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
-      dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
-    }
-    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
-  }
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/test/run_tests.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/run_tests.dml b/scripts/staging/SystemML-NN/nn/test/run_tests.dml
index 86bb77b..644662c 100644
--- a/scripts/staging/SystemML-NN/nn/test/run_tests.dml
+++ b/scripts/staging/SystemML-NN/nn/test/run_tests.dml
@@ -38,16 +38,16 @@ tmp = grad_check::log_loss()
 # Other layers
 tmp = grad_check::affine()
 tmp = grad_check::batch_norm()
-tmp = grad_check::conv_simple()
-tmp = grad_check::conv()
-tmp = grad_check::conv_builtin()
+tmp = grad_check::conv2d_simple()
+tmp = grad_check::conv2d()
+tmp = grad_check::conv2d_builtin()
 tmp = grad_check::dropout()
 tmp = grad_check::l1_reg()
 tmp = grad_check::l2_reg()
 tmp = grad_check::lstm()
-tmp = grad_check::max_pool_simple()
-tmp = grad_check::max_pool()
-tmp = grad_check::max_pool_builtin()
+tmp = grad_check::max_pool2d_simple()
+tmp = grad_check::max_pool2d()
+tmp = grad_check::max_pool2d_builtin()
 tmp = grad_check::relu()
 tmp = grad_check::rnn()
 tmp = grad_check::sigmoid()
@@ -72,9 +72,9 @@ print("---")
 tmp = test::batch_norm()
 tmp = test::im2col()
 tmp = test::padding()
-tmp = test::conv()
+tmp = test::conv2d()
 tmp = test::cross_entropy_loss()
-tmp = test::max_pool()
+tmp = test::max_pool2d()
 tmp = test::spatial_batch_norm()
 tmp = test::tanh()
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/test/test.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/test.dml b/scripts/staging/SystemML-NN/nn/test/test.dml
index 8fb0d04..64fc519 100644
--- a/scripts/staging/SystemML-NN/nn/test/test.dml
+++ b/scripts/staging/SystemML-NN/nn/test/test.dml
@@ -23,23 +23,23 @@
  * Various tests, not including gradient checks.
  */
 source("nn/layers/batch_norm.dml") as batch_norm
-source("nn/layers/conv.dml") as conv
-source("nn/layers/conv_builtin.dml") as conv_builtin
+source("nn/layers/conv2d.dml") as conv2d
+source("nn/layers/conv2d_builtin.dml") as conv2d_builtin
 source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/max_pool.dml") as max_pool
-source("nn/layers/max_pool_builtin.dml") as max_pool_builtin
+source("nn/layers/max_pool2d.dml") as max_pool2d
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
 source("nn/layers/spatial_batch_norm.dml") as spatial_batch_norm
 source("nn/layers/tanh.dml") as tanh
-source("nn/test/conv_simple.dml") as conv_simple
-source("nn/test/max_pool_simple.dml") as max_pool_simple
+source("nn/test/conv2d_simple.dml") as conv2d_simple
+source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
 source("nn/test/util.dml") as test_util
 source("nn/util.dml") as util
 
 batch_norm = function() {
   /*
-   * Test for the `batch_norm` function.
+   * Test for the batch normalization function.
    */
-  print("Testing the batch_norm function.")
+  print("Testing the batch normalization function.")
 
   # Generate data
   N = 4  # Number of examples
@@ -68,11 +68,11 @@ batch_norm = function() {
   }
 }
 
-conv = function() {
+conv2d = function() {
   /*
-   * Test for the `conv` functions.
+   * Test for the 2D convolution functions.
    */
-  print("Testing the conv functions.")
+  print("Testing the 2D convolution functions.")
 
   # Generate data
   N = 2  # num examples
@@ -87,14 +87,14 @@ conv = function() {
   X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
 
   # Create layer
-  [W, b] = conv::init(F, C, Hf, Wf)
+  [W, b] = conv2d::init(F, C, Hf, Wf)
 
   # Forward
-  [out, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-  [out_simple, Hout_simple, Wout_simple] = conv_simple::forward(X, W, b, C, Hin, Win, Hf, Wf,
-                                                                stride, stride, pad, pad)
-  [out_builtin, Hout_builtin, Wout_builtin] = conv_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf,
-                                                                    stride, stride, pad, pad)
+  [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  [out_simple, Hout_simple, Wout_simple] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf,
+                                                                  stride, stride, pad, pad)
+  [out_builtin, Hout_builtin, Wout_builtin] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf,
+                                                                      stride, stride, pad, pad)
 
   # Equivalency check
   out = matrix(out, rows=1, cols=N*F*Hout*Wout)
@@ -110,7 +110,7 @@ conv = function() {
 
 cross_entropy_loss = function() {
   /*
-   * Test for the `cross-entropy` loss function.
+   * Test for the cross-entropy loss function.
    *
    * Here we make sure that the cross-entropy loss function does
    * not propagate `infinity` values in the case that a prediction is
@@ -206,11 +206,11 @@ padding = function() {
   }
 }
 
-max_pool = function() {
+max_pool2d = function() {
   /*
-   * Test for the `max_pool` functions.
+   * Test for the 2D max pooling functions.
    */
-  print("Testing the max pool functions.")
+  print("Testing the 2D max pooling functions.")
 
   # Generate data
   N = 2  # num examples
@@ -227,12 +227,14 @@ max_pool = function() {
       print(" - Testing w/ padh="+padh+" & padw="+padw+".")
       #if (1==1) {}  # force correct printing
       #print("   - Testing forward")
-      [out, Hout, Wout] = max_pool::forward(X, C, Hin, Win, Hf, Wf, stride, stride, padh, padw)
-      [out_simple, Hout_simple, Wout_simple] = max_pool_simple::forward(X, C, Hin, Win, Hf, Wf,
-                                                                        stride, stride, padh, padw)
-      [out_builtin, Hout_builtin, Wout_builtin] = max_pool_builtin::forward(X, C, Hin, Win, Hf, Wf,
-                                                                            stride, stride,
-                                                                            padh, padw)
+      [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, padh, padw)
+      [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
+                                                                          stride, stride,
+                                                                          padh, padw)
+      [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win,
+                                                                              Hf, Wf,
+                                                                              stride, stride,
+                                                                              padh, padw)
 
       # Equivalency check
       out = matrix(out, rows=1, cols=N*C*Hout*Wout)
@@ -247,11 +249,12 @@ max_pool = function() {
 
       #print("   - Testing backward")
       dout = rand(rows=N, cols=C*Hout*Wout, pdf="normal")
-      dX = max_pool::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, padh, padw)
-      dX_simple = max_pool_simple::backward(dout, Hout_simple, Wout_simple, X, C, Hin, Win, Hf, Wf,
-                                            stride, stride, padh, padw)
-      dX_builtin = max_pool_builtin::backward(dout, Hout_builtin, Wout_builtin, X, C, Hin, Win,
+      dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
+                                padh, padw)
+      dX_simple = max_pool2d_simple::backward(dout, Hout_simple, Wout_simple, X, C, Hin, Win,
                                               Hf, Wf, stride, stride, padh, padw)
+      dX_builtin = max_pool2d_builtin::backward(dout, Hout_builtin, Wout_builtin, X, C, Hin, Win,
+                                                Hf, Wf, stride, stride, padh, padw)
 
       # Equivalency check
       dX = matrix(dX, rows=1, cols=N*C*Hin*Win)
@@ -288,11 +291,11 @@ max_pool = function() {
   pad = 0
 
   # forward
-  [out, Hout, Wout] = max_pool::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-  [out_simple, Hout_simple, Wout_simple] = max_pool_simple::forward(X, C, Hin, Win, Hf, Wf,
-                                                                    stride, stride, pad, pad)
-  [out_builtin, Hout_builtin, Wout_builtin] = max_pool_builtin::forward(X, C, Hin, Win, Hf, Wf,
-                                                                        stride, stride, pad, pad)
+  [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
+                                                                      stride, stride, pad, pad)
+  [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
+                                                                          stride, stride, pad, pad)
 
   # equivalency check
   # -- channel 1
@@ -326,11 +329,11 @@ max_pool = function() {
   pad = 1
 
   # forward
-  [out, Hout, Wout] = max_pool::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-  [out_simple, Hout_simple, Wout_simple] = max_pool_simple::forward(X, C, Hin, Win, Hf, Wf,
-                                                                    stride, stride, pad, pad)
-  [out_builtin, Hout_builtin, Wout_builtin] = max_pool_builtin::forward(X, C, Hin, Win, Hf, Wf,
-                                                                        stride, stride, pad, pad)
+  [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
+                                                                      stride, stride, pad, pad)
+  [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
+                                                                          stride, stride, pad, pad)
 
   # equivalency check
   # -- channel 1
@@ -363,11 +366,11 @@ max_pool = function() {
   pad = 0
 
   # forward
-  [out, Hout, Wout] = max_pool::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-  [out_simple, Hout_simple, Wout_simple] = max_pool_simple::forward(X, C, Hin, Win, Hf, Wf,
-                                                                    stride, stride, pad, pad)
-  [out_builtin, Hout_builtin, Wout_builtin] = max_pool_builtin::forward(X, C, Hin, Win, Hf, Wf,
-                                                                        stride, stride, pad, pad)
+  [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
+                                                                      stride, stride, pad, pad)
+  [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
+                                                                          stride, stride, pad, pad)
 
   # equivalency check
   # -- channel 1
@@ -402,11 +405,11 @@ max_pool = function() {
   pad = 1
 
   # forward
-  [out, Hout, Wout] = max_pool::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-  [out_simple, Hout_simple, Wout_simple] = max_pool_simple::forward(X, C, Hin, Win, Hf, Wf,
-                                                                    stride, stride, pad, pad)
-  [out_builtin, Hout_builtin, Wout_builtin] = max_pool_builtin::forward(X, C, Hin, Win, Hf, Wf,
-                                                                        stride, stride, pad, pad)
+  [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
+                                                                      stride, stride, pad, pad)
+  [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
+                                                                          stride, stride, pad, pad)
 
   # equivalency check
   # -- channel 1
@@ -417,7 +420,8 @@ max_pool = function() {
   #  0  0  0
   #  0 -6  0
   #  0  0  0
-  target = matrix("-1 -2 -4 -5 -6 -8 -13 -14 -16 -1 -5 -13 -2 -6 -14 -4 -8 -16", rows=1, cols=C*Hout*Wout)
+  target = matrix("-1 -2 -4 -5 -6 -8 -13 -14 -16 -1 -5 -13 -2 -6 -14 -4 -8 -16",
+                  rows=1, cols=C*Hout*Wout)
   target = rbind(target, target)  # n=2
   tmp = test_util::check_all_equal(out, target)
   tmp = test_util::check_all_equal(out_simple, target)
@@ -426,9 +430,9 @@ max_pool = function() {
 
 spatial_batch_norm = function() {
   /*
-   * Test for the `spatial_batch_norm` function.
+   * Test for the spatial batch normalization function.
    */
-  print("Testing the spatial_batch_norm function.")
+  print("Testing the spatial batch normalization function.")
 
   # Generate data
   N = 2  # Number of examples
@@ -532,7 +536,8 @@ tanh = function() {
   # Equivalency check
   for (i in 1:nrow(out)) {
     for (j in 1:ncol(out)) {
-      rel_error = test_util::check_rel_error(as.scalar(out[i,j]), as.scalar(out_ref[i,j]), 1e-10, 1e-12)
+      rel_error = test_util::check_rel_error(as.scalar(out[i,j]), as.scalar(out_ref[i,j]),
+                                             1e-10, 1e-12)
     }
   }
 }

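The equivalency tests above all follow the same pattern: flatten each implementation's output to a
single row and compare it against a reference (or a hand-computed target) with
`test_util::check_all_equal`. A condensed sketch of that pattern with toy matrices (the values are
arbitrary; only the flatten-and-compare structure is taken from the tests):

  source("nn/test/util.dml") as test_util

  out = matrix("1 2 3 4 5 6", rows=2, cols=3)     # toy output from one implementation
  target = matrix("1 2 3 4 5 6", rows=2, cols=3)  # toy reference output
  out = matrix(out, rows=1, cols=2*3)             # flatten to a single row, as in the tests
  target = matrix(target, rows=1, cols=2*3)
  tmp = test_util::check_all_equal(out, target)   # element-wise check from nn/test/util.dml
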

[3/7] incubator-systemml git commit: [SYSTEMML-1412] Rename `nn/test/tests.dml` to `nn/test/run_tests.dml`

Posted by du...@apache.org.
[SYSTEMML-1412] Rename `nn/test/tests.dml` to `nn/test/run_tests.dml`

Closes #447.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/7744924e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/7744924e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/7744924e

Branch: refs/heads/master
Commit: 7744924e96701e79573f5f839cac0c7bbe97554b
Parents: 16b1cbd
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Fri Mar 31 18:38:56 2017 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Fri Mar 31 18:38:56 2017 -0700

----------------------------------------------------------------------
 .../staging/SystemML-NN/nn/test/run_tests.dml   | 85 ++++++++++++++++++++
 scripts/staging/SystemML-NN/nn/test/tests.dml   | 85 --------------------
 2 files changed, 85 insertions(+), 85 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/7744924e/scripts/staging/SystemML-NN/nn/test/run_tests.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/run_tests.dml b/scripts/staging/SystemML-NN/nn/test/run_tests.dml
new file mode 100644
index 0000000..86bb77b
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/test/run_tests.dml
@@ -0,0 +1,85 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Script to run tests.
+ */
+source("nn/test/grad_check.dml") as grad_check
+source("nn/test/test.dml") as test
+
+print("")
+print("Starting grad checks.")
+print("---")
+
+# Loss functions
+tmp = grad_check::cross_entropy_loss()
+tmp = grad_check::l1_loss()
+tmp = grad_check::l2_loss()
+tmp = grad_check::log_loss()
+
+# Other layers
+tmp = grad_check::affine()
+tmp = grad_check::batch_norm()
+tmp = grad_check::conv_simple()
+tmp = grad_check::conv()
+tmp = grad_check::conv_builtin()
+tmp = grad_check::dropout()
+tmp = grad_check::l1_reg()
+tmp = grad_check::l2_reg()
+tmp = grad_check::lstm()
+tmp = grad_check::max_pool_simple()
+tmp = grad_check::max_pool()
+tmp = grad_check::max_pool_builtin()
+tmp = grad_check::relu()
+tmp = grad_check::rnn()
+tmp = grad_check::sigmoid()
+tmp = grad_check::softmax()
+tmp = grad_check::spatial_batch_norm()
+tmp = grad_check::tanh()
+
+# Example model
+tmp = grad_check::two_layer_affine_l2_net()
+
+print("---")
+print("Grad checks complete -- look for any ERRORs or WARNINGs.")
+print("If any tests involving ReLUs failed, try a few times " +
+      "to ensure that they were not false negatives due to " +
+      "kinks being crossed.")
+print("")
+
+print("")
+print("Starting other tests.")
+print("---")
+
+tmp = test::batch_norm()
+tmp = test::im2col()
+tmp = test::padding()
+tmp = test::conv()
+tmp = test::cross_entropy_loss()
+tmp = test::max_pool()
+tmp = test::spatial_batch_norm()
+tmp = test::tanh()
+
+print("---")
+print("Other tests complete -- look for any ERRORs or WARNINGs.")
+print("")
+print("")
+
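
For local iteration it can be convenient to run only a subset of the checks
rather than the whole script. A minimal sketch, using only functions that the
script above already calls (paths relative to the SystemML-NN root, as in
run_tests.dml):

  source("nn/test/grad_check.dml") as grad_check
  source("nn/test/test.dml") as test

  # Only the convolution-related checks.
  tmp = grad_check::conv_simple()
  tmp = grad_check::conv()
  tmp = grad_check::conv_builtin()
  tmp = test::conv()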

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/7744924e/scripts/staging/SystemML-NN/nn/test/tests.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/tests.dml b/scripts/staging/SystemML-NN/nn/test/tests.dml
deleted file mode 100644
index 86bb77b..0000000
--- a/scripts/staging/SystemML-NN/nn/test/tests.dml
+++ /dev/null
@@ -1,85 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Script to run tests.
- */
-source("nn/test/grad_check.dml") as grad_check
-source("nn/test/test.dml") as test
-
-print("")
-print("Starting grad checks.")
-print("---")
-
-# Loss functions
-tmp = grad_check::cross_entropy_loss()
-tmp = grad_check::l1_loss()
-tmp = grad_check::l2_loss()
-tmp = grad_check::log_loss()
-
-# Other layers
-tmp = grad_check::affine()
-tmp = grad_check::batch_norm()
-tmp = grad_check::conv_simple()
-tmp = grad_check::conv()
-tmp = grad_check::conv_builtin()
-tmp = grad_check::dropout()
-tmp = grad_check::l1_reg()
-tmp = grad_check::l2_reg()
-tmp = grad_check::lstm()
-tmp = grad_check::max_pool_simple()
-tmp = grad_check::max_pool()
-tmp = grad_check::max_pool_builtin()
-tmp = grad_check::relu()
-tmp = grad_check::rnn()
-tmp = grad_check::sigmoid()
-tmp = grad_check::softmax()
-tmp = grad_check::spatial_batch_norm()
-tmp = grad_check::tanh()
-
-# Example model
-tmp = grad_check::two_layer_affine_l2_net()
-
-print("---")
-print("Grad checks complete -- look for any ERRORs or WARNINGs.")
-print("If any tests involving ReLUs failed, try a few times " +
-      "to ensure that they were not false negatives due to " +
-      "kinks being crossed.")
-print("")
-
-print("")
-print("Starting other tests.")
-print("---")
-
-tmp = test::batch_norm()
-tmp = test::im2col()
-tmp = test::padding()
-tmp = test::conv()
-tmp = test::cross_entropy_loss()
-tmp = test::max_pool()
-tmp = test::spatial_batch_norm()
-tmp = test::tanh()
-
-print("---")
-print("Other tests complete -- look for any ERRORs or WARNINGs.")
-print("")
-print("")
-
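
Each grad_check function invoked by this script follows the same pattern:
compute the analytic gradient from a layer's backward pass, then compare it
against a numerical gradient obtained with central differences on a scalar
loss, reporting the relative error per cell. Below is a minimal, self-contained
sketch of that idea in DML; the toy loss, matrix sizes, and variable names are
illustrative only, while the real checks in nn/test/grad_check.dml wire in each
layer's own forward/backward functions and the loss modules listed above:

  # Toy loss whose analytic gradient is known in closed form.
  loss_fn = function(matrix[double] X, matrix[double] y) return (double loss) {
    loss = 0.5 * sum((X-y)^2)
  }

  X = rand(rows=3, cols=4)
  y = rand(rows=3, cols=4)
  dX = X - y  # analytic gradient of the toy loss wrt X

  # Numerical gradient via central differences.
  h = 1e-5
  dX_num = matrix(0, rows=nrow(X), cols=ncol(X))
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      lossmh = loss_fn(X, y)
      X[i,j] = old + h
      lossph = loss_fn(X, y)
      X[i,j] = old  # restore the original value
      dX_num[i,j] = (lossph-lossmh) / (2*h)
    }
  }

  # The analytic and numerical gradients should agree closely.
  print("max abs difference: " + max(abs(dX - dX_num)))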