Posted to commits@systemml.apache.org by du...@apache.org on 2017/04/26 21:42:27 UTC

[01/11] incubator-systemml git commit: [SYSTEMML-1524] Move `examples` into `nn`

Repository: incubator-systemml
Updated Branches:
  refs/heads/master aa2211ac0 -> 43c321d18


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/mnist_lenet.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet.dml
new file mode 100644
index 0000000..e5755c4
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet.dml
@@ -0,0 +1,331 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * MNIST LeNet Example
+ */
+# Imports
+source("nn/layers/affine.dml") as affine
+source("nn/layers/conv2d_builtin.dml") as conv2d
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/dropout.dml") as dropout
+source("nn/layers/l2_reg.dml") as l2_reg
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
+source("nn/layers/relu.dml") as relu
+source("nn/layers/softmax.dml") as softmax
+source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
+
+train = function(matrix[double] X, matrix[double] y,
+                 matrix[double] X_val, matrix[double] y_val,
+                 int C, int Hin, int Win, int epochs)
+    return (matrix[double] W1, matrix[double] b1,
+            matrix[double] W2, matrix[double] b2,
+            matrix[double] W3, matrix[double] b3,
+            matrix[double] W4, matrix[double] b4) {
+  /*
+   * Trains a convolutional net using the "LeNet" architecture.
+   *
+   * The input matrix, X, has N examples, each represented as a 3D
+   * volume unrolled into a single vector.  The targets, y, have K
+   * classes, and are one-hot encoded.
+   *
+   * Inputs:
+   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - y: Target matrix, of shape (N, K).
+   *  - X_val: Input validation data matrix, of shape (N, C*Hin*Win).
+   *  - y_val: Target validation matrix, of shape (N, K).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - epochs: Total number of full training loops over the full data set.
+   *
+   * Outputs:
+   *  - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
+   *  - b1: 1st layer biases vector, of shape (F1, 1).
+   *  - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
+   *  - b2: 2nd layer biases vector, of shape (F2, 1).
+   *  - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
+   *  - b3: 3rd layer biases vector, of shape (1, N3).
+   *  - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
+   *  - b4: 4th layer biases vector, of shape (1, K).
+   */
+  N = nrow(X)
+  K = ncol(y)
+
+  # Create network:
+  # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
+  Hf = 5  # filter height
+  Wf = 5  # filter width
+  stride = 1
+  pad = 2  # For same dimensions, (Hf - stride) / 2
+
+  F1 = 32  # num conv filters in conv1
+  F2 = 64  # num conv filters in conv2
+  N3 = 512  # num nodes in affine3
+  # Note: affine4 has K nodes, which is equal to the number of target dimensions (num classes)
+
+  [W1, b1] = conv2d::init(F1, C, Hf, Wf)  # inputs: (N, C*Hin*Win)
+  [W2, b2] = conv2d::init(F2, F1, Hf, Wf)  # inputs: (N, F1*(Hin/2)*(Win/2))
+  [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3)  # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
+  [W4, b4] = affine::init(N3, K)  # inputs: (N, N3)
+  W4 = W4 / sqrt(2)  # different initialization scale, since this layer feeds a softmax rather than a relu
+
+  # Initialize SGD w/ Nesterov momentum optimizer
+  lr = 0.01  # learning rate
+  mu = 0.9  #0.5  # momentum
+  decay = 0.95  # learning rate decay constant
+  vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
+  vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
+  vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
+  vW4 = sgd_nesterov::init(W4); vb4 = sgd_nesterov::init(b4)
+
+  # Regularization
+  lambda = 5e-04
+
+  # Optimize
+  print("Starting optimization")
+  batch_size = 64
+  iters = ceil(N / batch_size)
+  for (e in 1:epochs) {
+    for(i in 1:iters) {
+      # Get next batch
+      beg = ((i-1) * batch_size) %% N + 1
+      end = min(N, beg + batch_size - 1)
+      X_batch = X[beg:end,]
+      y_batch = y[beg:end,]
+
+      # Compute forward pass
+      ## layer 1: conv1 -> relu1 -> pool1
+      [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
+                                                pad, pad)
+      outr1 = relu::forward(outc1)
+      [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+                                                    strideh=2, stridew=2, padh=0, padw=0)
+      ## layer 2: conv2 -> relu2 -> pool2
+      [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
+                                                stride, stride, pad, pad)
+      outr2 = relu::forward(outc2)
+      [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+                                                    strideh=2, stridew=2, padh=0, padw=0)
+      ## layer 3:  affine3 -> relu3 -> dropout
+      outa3 = affine::forward(outp2, W3, b3)
+      outr3 = relu::forward(outa3)
+      [outd3, maskd3] = dropout::forward(outr3, 0.5, -1)
+      ## layer 4:  affine4 -> softmax
+      outa4 = affine::forward(outd3, W4, b4)
+      probs = softmax::forward(outa4)
+
+      # Compute loss & accuracy for training & validation data every 100 iterations.
+      if (i %% 100 == 0) {
+        # Compute training loss & accuracy
+        loss_data = cross_entropy_loss::forward(probs, y_batch)
+        loss_reg_W1 = l2_reg::forward(W1, lambda)
+        loss_reg_W2 = l2_reg::forward(W2, lambda)
+        loss_reg_W3 = l2_reg::forward(W3, lambda)
+        loss_reg_W4 = l2_reg::forward(W4, lambda)
+        loss = loss_data + loss_reg_W1 + loss_reg_W2 + loss_reg_W3 + loss_reg_W4
+        accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
+
+        # Compute validation loss & accuracy
+        probs_val = predict(X_val, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
+        loss_val = cross_entropy_loss::forward(probs_val, y_val)
+        accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
+
+        # Output results
+        print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
+              + accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
+      }
+
+      # Compute data backward pass
+      ## loss:
+      dprobs = cross_entropy_loss::backward(probs, y_batch)
+      ## layer 4:  affine4 -> softmax
+      douta4 = softmax::backward(dprobs, outa4)
+      [doutd3, dW4, db4] = affine::backward(douta4, outr3, W4, b4)
+      ## layer 3:  affine3 -> relu3 -> dropout
+      doutr3 = dropout::backward(doutd3, outr3, 0.5, maskd3)
+      douta3 = relu::backward(doutr3, outa3)
+      [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
+      ## layer 2: conv2 -> relu2 -> pool2
+      doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+                                    strideh=2, stridew=2, padh=0, padw=0)
+      doutc2 = relu::backward(doutr2, outc2)
+      [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
+                                            Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
+      ## layer 1: conv1 -> relu1 -> pool1
+      doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+                                    strideh=2, stridew=2, padh=0, padw=0)
+      doutc1 = relu::backward(doutr1, outc1)
+      [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win,
+                                              Hf, Wf, stride, stride, pad, pad)
+
+      # Compute regularization backward pass
+      dW1_reg = l2_reg::backward(W1, lambda)
+      dW2_reg = l2_reg::backward(W2, lambda)
+      dW3_reg = l2_reg::backward(W3, lambda)
+      dW4_reg = l2_reg::backward(W4, lambda)
+      dW1 = dW1 + dW1_reg
+      dW2 = dW2 + dW2_reg
+      dW3 = dW3 + dW3_reg
+      dW4 = dW4 + dW4_reg
+
+      # Optimize with SGD w/ Nesterov momentum
+      [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1)
+      [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1)
+      [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2)
+      [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2)
+      [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3)
+      [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3)
+      [W4, vW4] = sgd_nesterov::update(W4, dW4, lr, mu, vW4)
+      [b4, vb4] = sgd_nesterov::update(b4, db4, lr, mu, vb4)
+    }
+    # Anneal momentum towards 0.999
+    #mu = mu + (0.999 - mu)/(1+epochs-e)
+    # Decay learning rate
+    lr = lr * decay
+  }
+}
+
+predict = function(matrix[double] X, int C, int Hin, int Win,
+                   matrix[double] W1, matrix[double] b1,
+                   matrix[double] W2, matrix[double] b2,
+                   matrix[double] W3, matrix[double] b3,
+                   matrix[double] W4, matrix[double] b4)
+    return (matrix[double] probs) {
+  /*
+   * Computes the class probability predictions of a convolutional
+   * net using the "LeNet" architecture.
+   *
+   * The input matrix, X, has N examples, each represented as a 3D
+   * volume unrolled into a single vector.
+   *
+   * Inputs:
+   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
+   *  - b1: 1st layer biases vector, of shape (F1, 1).
+   *  - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
+   *  - b2: 2nd layer biases vector, of shape (F2, 1).
+   *  - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
+   *  - b3: 3rd layer biases vector, of shape (1, N3).
+   *  - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
+   *  - b4: 4th layer biases vector, of shape (1, K).
+   *
+   * Outputs:
+   *  - probs: Class probabilities, of shape (N, K).
+   */
+  N = nrow(X)
+
+  # Network:
+  # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
+  Hf = 5  # filter height
+  Wf = 5  # filter width
+  stride = 1
+  pad = 2  # For same dimensions, (Hf - stride) / 2
+
+  F1 = nrow(W1)  # num conv filters in conv1
+  F2 = nrow(W2)  # num conv filters in conv2
+  N3 = ncol(W3)  # num nodes in affine3
+  K = ncol(W4)  # num nodes in affine4, equal to number of target dimensions (num classes)
+
+  # Compute predictions over mini-batches
+  probs = matrix(0, rows=N, cols=K)
+  batch_size = 64
+  iters = ceil(N / batch_size)
+  for(i in 1:iters) {
+    # Get next batch
+    beg = ((i-1) * batch_size) %% N + 1
+    end = min(N, beg + batch_size - 1)
+    X_batch = X[beg:end,]
+
+    # Compute forward pass
+    ## layer 1: conv1 -> relu1 -> pool1
+    [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
+                                              pad, pad)
+    outr1 = relu::forward(outc1)
+    [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+                                                  strideh=2, stridew=2, padh=0, padw=0)
+    ## layer 2: conv2 -> relu2 -> pool2
+    [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
+                                              stride, stride, pad, pad)
+    outr2 = relu::forward(outc2)
+    [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+                                                  strideh=2, stridew=2, padh=0, padw=0)
+    ## layer 3:  affine3 -> relu3
+    outa3 = affine::forward(outp2, W3, b3)
+    outr3 = relu::forward(outa3)
+    ## layer 4:  affine4 -> softmax
+    outa4 = affine::forward(outr3, W4, b4)
+    probs_batch = softmax::forward(outa4)
+
+    # Store predictions
+    probs[beg:end,] = probs_batch
+  }
+}
+
+eval = function(matrix[double] probs, matrix[double] y)
+    return (double loss, double accuracy) {
+  /*
+   * Evaluates a convolutional net using the "LeNet" architecture.
+   *
+   * The probs matrix contains the class probability predictions
+   * of K classes over N examples.  The targets, y, have K classes,
+   * and are one-hot encoded.
+   *
+   * Inputs:
+   *  - probs: Class probabilities, of shape (N, K).
+   *  - y: Target matrix, of shape (N, K).
+   *
+   * Outputs:
+   *  - loss: Scalar loss, of shape (1).
+   *  - accuracy: Scalar accuracy, of shape (1).
+   */
+  # Compute loss & accuracy
+  loss = cross_entropy_loss::forward(probs, y)
+  correct_pred = rowIndexMax(probs) == rowIndexMax(y)
+  accuracy = mean(correct_pred)
+}
+
+generate_dummy_data = function()
+    return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
+  /*
+   * Generate a dummy dataset similar to the MNIST dataset.
+   *
+   * Outputs:
+   *  - X: Input data matrix, of shape (N, D).
+   *  - y: Target matrix, of shape (N, K).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   */
+  # Generate dummy input data
+  N = 1024  # num examples
+  C = 1  # num input channels
+  Hin = 28  # input height
+  Win = 28  # input width
+  K = 10  # num target classes
+  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
+  classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
+  y = table(seq(1, N), classes)  # one-hot encoding
+}
+
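
A minimal end-to-end driver for the example above might look like the following sketch (not included in the commit). It assumes execution from the directory that contains the `nn` folder, uses only functions defined in mnist_lenet.dml, and runs a single epoch on dummy data to keep it short.

    source("nn/examples/mnist_lenet.dml") as mnist_lenet

    # Generate dummy train, validation, and test sets (1024 examples each)
    [X, y, C, Hin, Win] = mnist_lenet::generate_dummy_data()
    [X_val, y_val, C, Hin, Win] = mnist_lenet::generate_dummy_data()
    [X_test, y_test, C, Hin, Win] = mnist_lenet::generate_dummy_data()

    # Train for one epoch, then predict and evaluate on the dummy test set
    [W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, 1)
    probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
    [loss, accuracy] = mnist_lenet::eval(probs, y_test)
    print("Dummy-data accuracy: " + accuracy)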

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-predict.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-predict.dml
new file mode 100644
index 0000000..4c8c434
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-predict.dml
@@ -0,0 +1,77 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# MNIST Softmax - Predict
+#
+# This script computes the class probability predictions of a
+# trained softmax classifier on images of handwritten digits.
+#
+# Inputs:
+#  - X: File containing images for prediction.
+#     The format is "pixel_1, pixel_2, ..., pixel_n".
+#  - model_dir: Directory containing the trained weights and biases
+#     of the model.
+#  - out_dir: Directory to store class probability predictions for
+#     each image.
+#  - fmt: [DEFAULT: "csv"] File format of `X` and output predictions.
+#     Options include: "csv", "mm", "text", and "binary".
+#
+# Outputs:
+#  - probs: File containing class probability predictions for each
+#     image.
+#
+# Data:
+# The X file should contain images of handwritten digits,
+# where each example is a 28x28 pixel image of grayscale values in
+# the range [0,255] stretched out as 784 pixels.
+#
+# Sample Invocation:
+# 1. Download images.
+#
+#   For example, save images to `nn/examples/data/mnist/images.csv`.
+#
+# 2. Execute using Spark
+#   ```
+#   spark-submit --master local[*] --driver-memory 5G
+#   --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
+#   $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_softmax-predict.dml
+#   -nvargs X=nn/examples/data/mnist/images.csv
+#   model_dir=nn/examples/model/mnist_softmax out_dir=nn/examples/data/mnist
+#   ```
+#
+source("nn/examples/mnist_softmax.dml") as mnist_softmax
+
+# Read image data
+fmt = ifdef($fmt, "csv")
+X = read($X, format=fmt)
+
+# Scale images to [0,1]
+X = X / 255.0
+
+# Read model coefficients
+W = read($model_dir+"/W")
+b = read($model_dir+"/b")
+
+# Predict classes
+probs = mnist_softmax::predict(X, W, b)
+
+# Output results
+write(probs, $out_dir+"/probs."+fmt, format=fmt)
+
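
As a follow-up sketch (not included in the commit), the probabilities written above could be reduced to hard digit predictions. The snippet assumes the same `out_dir` and `fmt` values passed to mnist_softmax-predict.dml, and it relies on the training script's label+1 encoding, so that column i of `probs` corresponds to digit i-1.

    # Hypothetical post-processing: convert class probabilities into predicted digits
    fmt = ifdef($fmt, "csv")
    probs = read($out_dir+"/probs."+fmt, format=fmt)
    pred_digits = rowIndexMax(probs) - 1  # rowIndexMax is 1-based; digits are 0-9
    write(pred_digits, $out_dir+"/predictions."+fmt, format=fmt)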

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-train.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-train.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-train.dml
new file mode 100644
index 0000000..09970f0
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-train.dml
@@ -0,0 +1,110 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# MNIST Softmax - Train
+#
+# This script trains a softmax classifier on images of handwritten
+# digits.
+#
+# Inputs:
+#  - train: File containing labeled MNIST training images.
+#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
+#  - test: File containing labeled MNIST test images.
+#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
+#  - out_dir: Directory to store weights and bias matrices of
+#     trained model, as well as final test accuracy.
+#  - fmt: [DEFAULT: "csv"] File format of `train` and `test` data.
+#     Options include: "csv", "mm", "text", and "binary".
+#
+# Outputs:
+#  - W: File containing the trained weights of the model.
+#  - b: File containing the trained biases of the model.
+#  - accuracy: File containing the final accuracy on the test data.
+#
+# Data:
+# The MNIST dataset contains labeled images of handwritten digits,
+# where each example is a 28x28 pixel image of grayscale values in
+# the range [0,255] stretched out as 784 pixels, and each label is
+# one of 10 possible digits in [0,9].
+#
+# Sample Invocation (running from the directory that contains the `nn` folder):
+# 1. Download data (60,000 training examples, and 10,000 test examples)
+#   ```
+#   nn/examples/get_mnist_data.sh
+#   ```
+#
+# 2. Execute using Spark
+#   ```
+#   spark-submit --master local[*] --driver-memory 10G
+#   --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
+#   $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_softmax-train.dml
+#   -nvargs train=nn/examples/data/mnist/mnist_train.csv test=nn/examples/data/mnist/mnist_test.csv
+#   epochs=1 out_dir=nn/examples/model/mnist_softmax
+#   ```
+#
+source("nn/examples/mnist_softmax.dml") as mnist_softmax
+
+# Read training and test data
+fmt = ifdef($fmt, "csv")
+train = read($train, format=fmt)
+test = read($test, format=fmt)
+epochs = ifdef($epochs, 1)
+out_dir = ifdef($out_dir, ".")
+
+# Extract images and labels
+images = train[,2:ncol(train)]
+labels = train[,1]
+X_test = test[,2:ncol(test)]
+y_test = test[,1]
+
+# Scale images to [0,1], and one-hot encode the labels
+n = nrow(train)
+n_test = nrow(test)
+classes = 10
+images = images / 255.0
+labels = table(seq(1, n), labels+1, n, classes)
+X_test = X_test / 255.0
+y_test = table(seq(1, n_test), y_test+1, n_test, classes)
+
+# Split into training (55,000 examples) and validation (5,000 examples)
+X = images[5001:nrow(images),]
+X_val = images[1:5000,]
+y = labels[5001:nrow(images),]
+y_val = labels[1:5000,]
+
+# Train
+[W, b] = mnist_softmax::train(X, y, X_val, y_val, epochs)
+
+# Write model out
+write(W, out_dir+"/W")
+write(b, out_dir+"/b")
+
+# Eval on test set
+probs = mnist_softmax::predict(X_test, W, b)
+[loss, accuracy] = mnist_softmax::eval(probs, y_test)
+
+# Output results
+print("Test Accuracy: " + accuracy)
+write(accuracy, out_dir+"/accuracy")
+
+print("")
+print("")
+
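
One detail in the script above that may not be obvious is the one-hot encoding via `table`: the call `table(seq(1, n), labels+1, n, classes)` builds an (n, classes) matrix with a 1 at position (i, label_i + 1) for each example i. A tiny standalone illustration with made-up values (not from the commit):

    labels = matrix("2 0 9", rows=3, cols=1)    # three digits in [0, 9]
    onehot = table(seq(1, 3), labels+1, 3, 10)  # shape (3, 10); row i has a 1 in column labels[i]+1
    print(toString(onehot))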

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
new file mode 100644
index 0000000..a529a12
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
@@ -0,0 +1,178 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * MNIST Softmax Example
+ */
+# Imports
+source("nn/layers/affine.dml") as affine
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/softmax.dml") as softmax
+source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
+
+train = function(matrix[double] X, matrix[double] y,
+                 matrix[double] X_val, matrix[double] y_val,
+                 int epochs)
+    return (matrix[double] W, matrix[double] b) {
+  /*
+   * Trains a softmax classifier.
+   *
+   * The input matrix, X, has N examples, each with D features.
+   * The targets, y, have K classes, and are one-hot encoded.
+   *
+   * Inputs:
+   *  - X: Input data matrix, of shape (N, D).
+   *  - y: Target matrix, of shape (N, K).
+   *  - X_val: Input validation data matrix, of shape (N, D).
+   *  - y_val: Target validation matrix, of shape (N, K).
+   *  - epochs: Total number of full training loops over the full data set.
+   *
+   * Outputs:
+   *  - W: Weights (parameters) matrix, of shape (D, K).
+   *  - b: Biases vector, of shape (1, K).
+   */
+  N = nrow(X)  # num examples
+  D = ncol(X)  # num features
+  K = ncol(y)  # num classes
+
+  # Create softmax classifier:
+  # affine -> softmax
+  [W, b] = affine::init(D, K)
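+  # Rescale W: divide out the sqrt(2/D) factor and apply sqrt(1/D) instead,
+  # presumably because this layer feeds a softmax rather than a relu (cf. W4 in mnist_lenet.dml).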
+  W = W / sqrt(2.0/(D)) * sqrt(1/(D))
+
+  # Initialize SGD w/ Nesterov momentum optimizer
+  lr = 0.2  # learning rate
+  mu = 0  # momentum
+  decay = 0.99  # learning rate decay constant
+  vW = sgd_nesterov::init(W)  # optimizer momentum state for W
+  vb = sgd_nesterov::init(b)  # optimizer momentum state for b
+
+  # Optimize
+  print("Starting optimization")
+  batch_size = 50
+  iters = 1000 #ceil(N / batch_size)
+  for (e in 1:epochs) {
+    for(i in 1:iters) {
+      # Get next batch
+      beg = ((i-1) * batch_size) %% N + 1
+      end = min(N, beg + batch_size - 1)
+      X_batch = X[beg:end,]
+      y_batch = y[beg:end,]
+
+      # Compute forward pass
+      ## affine & softmax:
+      out = affine::forward(X_batch, W, b)
+      probs = softmax::forward(out)
+
+      # Compute loss & accuracy for training & validation data
+      loss = cross_entropy_loss::forward(probs, y_batch)
+      accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
+      probs_val = predict(X_val, W, b)
+      loss_val = cross_entropy_loss::forward(probs_val, y_val)
+      accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
+      print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: " +
+            accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
+
+      # Compute backward pass
+      ## loss:
+      dprobs = cross_entropy_loss::backward(probs, y_batch)
+      ## affine & softmax:
+      dout = softmax::backward(dprobs, out)
+      [dX_batch, dW, db] = affine::backward(dout, X_batch, W, b)
+
+      # Optimize with SGD w/ Nesterov momentum
+      [W, vW] = sgd_nesterov::update(W, dW, lr, mu, vW)
+      [b, vb] = sgd_nesterov::update(b, db, lr, mu, vb)
+    }
+    # Anneal momentum towards 0.999
+    mu = mu + (0.999 - mu)/(1+epochs-e)
+    # Decay learning rate
+    lr = lr * decay
+  }
+}
+
+predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
+    return (matrix[double] probs) {
+  /*
+   * Computes the class probability predictions of a softmax classifier.
+   *
+   * The input matrix, X, has N examples, each with D features.
+   *
+   * Inputs:
+   *  - X: Input data matrix, of shape (N, D).
+   *  - W: Weights (parameters) matrix, of shape (D, K).
+   *  - b: Biases vector, of shape (1, K).
+   *
+   * Outputs:
+   *  - probs: Class probabilities, of shape (N, K).
+   */
+  # Compute forward pass
+  ## affine & softmax:
+  out = affine::forward(X, W, b)
+  probs = softmax::forward(out)
+}
+
+eval = function(matrix[double] probs, matrix[double] y)
+    return (double loss, double accuracy) {
+  /*
+   * Evaluates a softmax classifier.
+   *
+   * The probs matrix contains the class probability predictions
+   * of K classes over N examples.  The targets, y, have K classes,
+   * and are one-hot encoded.
+   *
+   * Inputs:
+   *  - probs: Class probabilities, of shape (N, K).
+   *  - y: Target matrix, of shape (N, K).
+   *
+   * Outputs:
+   *  - loss: Scalar loss, of shape (1).
+   *  - accuracy: Scalar accuracy, of shape (1).
+   */
+  # Compute loss & accuracy
+  loss = cross_entropy_loss::forward(probs, y)
+  correct_pred = rowIndexMax(probs) == rowIndexMax(y)
+  accuracy = mean(correct_pred)
+}
+
+generate_dummy_data = function()
+    return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
+  /*
+   * Generate a dummy dataset similar to the MNIST dataset.
+   *
+   * Outputs:
+   *  - X: Input data matrix, of shape (N, D).
+   *  - y: Target matrix, of shape (N, K).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   */
+  # Generate dummy input data
+  N = 1024  # num examples
+  C = 1  # num input channels
+  Hin = 28  # input height
+  Win = 28  # input width
+  T = 10  # num targets
+  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
+  classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))
+  y = table(seq(1, N), classes)  # one-hot encoding
+}
+
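
As with the LeNet example, a minimal driver for this module might be sketched as follows (not included in the commit; it assumes execution from the directory that contains the `nn` folder and uses only functions defined above).

    source("nn/examples/mnist_softmax.dml") as mnist_softmax

    # Dummy train, validation, and test data; a single training epoch
    [X, y, C, Hin, Win] = mnist_softmax::generate_dummy_data()
    [X_val, y_val, C, Hin, Win] = mnist_softmax::generate_dummy_data()
    [X_test, y_test, C, Hin, Win] = mnist_softmax::generate_dummy_data()
    [W, b] = mnist_softmax::train(X, y, X_val, y_val, 1)
    probs = mnist_softmax::predict(X_test, W, b)
    [loss, accuracy] = mnist_softmax::eval(probs, y_test)
    print("Dummy-data accuracy: " + accuracy)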


[03/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn` library to `scripts/nn`

Posted by du...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/test.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/test.dml b/scripts/staging/SystemML-NN/nn/test/test.dml
deleted file mode 100644
index a5cb497..0000000
--- a/scripts/staging/SystemML-NN/nn/test/test.dml
+++ /dev/null
@@ -1,549 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Various tests, not including gradient checks.
- */
-source("nn/layers/batch_norm1d.dml") as batch_norm1d
-source("nn/layers/batch_norm2d.dml") as batch_norm2d
-source("nn/layers/conv2d.dml") as conv2d
-source("nn/layers/conv2d_builtin.dml") as conv2d_builtin
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/max_pool2d.dml") as max_pool2d
-source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
-source("nn/layers/tanh.dml") as tanh
-source("nn/test/conv2d_simple.dml") as conv2d_simple
-source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
-source("nn/test/util.dml") as test_util
-source("nn/util.dml") as util
-
-batch_norm1d = function() {
-  /*
-   * Test for the 1D batch normalization function.
-   */
-  print("Testing the 1D batch normalization function.")
-
-  # Generate data
-  N = 4  # Number of examples
-  D = 4  # Number of features
-  mode = 'train'  # execution mode
-  mu = 0.9  # momentum of moving averages
-  eps = 1e-5  # smoothing term
-  X = matrix(seq(1,16), rows=N, cols=D)
-
-  # Create layer
-  [gamma, beta, ema_mean, ema_var] = batch_norm1d::init(D)
-
-  # Forward
-  [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
-      batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
-
-  # Equivalency check
-  target = matrix("-1.34160721 -1.34160721 -1.34160733 -1.34160709
-                   -0.44720244 -0.44720244 -0.44720244 -0.44720232
-                    0.44720244  0.44720232  0.44720244  0.44720244
-                    1.34160733  1.34160721  1.34160733  1.34160733", rows=1, cols=N*D)
-  out = matrix(out, rows=1, cols=N*D)
-  for (i in 1:length(out)) {
-    rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
-                                           as.scalar(target[1,i]), 1e-3, 1e-4)
-  }
-}
-
-conv2d = function() {
-  /*
-   * Test for the 2D convolution functions.
-   */
-  print("Testing the 2D convolution functions.")
-
-  # Generate data
-  N = 2  # num examples
-  C = 3  # num channels
-  Hin = 5  # input height
-  Win = 5  # input width
-  F = 2  # num filters
-  Hf = 3  # filter height
-  Wf = 3  # filter width
-  stride = 1
-  pad = 1
-  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
-
-  # Create layer
-  [W, b] = conv2d::init(F, C, Hf, Wf)
-
-  # Forward
-  [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-  [out_simple, Hout_simple, Wout_simple] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf,
-                                                                  stride, stride, pad, pad)
-  [out_builtin, Hout_builtin, Wout_builtin] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf,
-                                                                      stride, stride, pad, pad)
-
-  # Equivalency check
-  out = matrix(out, rows=1, cols=N*F*Hout*Wout)
-  out_simple = matrix(out_simple, rows=1, cols=N*F*Hout*Wout)
-  out_builtin = matrix(out_builtin, rows=1, cols=N*F*Hout*Wout)
-  for (i in 1:length(out)) {
-    rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
-                                           as.scalar(out_simple[1,i]), 1e-10, 1e-12)
-    rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
-                                           as.scalar(out_builtin[1,i]), 1e-10, 1e-12)
-  }
-}
-
-cross_entropy_loss = function() {
-  /*
-   * Test for the cross-entropy loss function.
-   *
-   * Here we make sure that the cross-entropy loss function does
-   * not propagate `infinity` values in the case that a prediction is
-   * exactly equal to 0.
-   */
-  print("Testing the cross-entropy loss function with zero-valued predictions.")
-
-  # Generate data
-  N = 3 # num examples
-  K = 10 # num targets
-  pred = matrix(0, rows=N, cols=K)
-  y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
-  y = y / rowSums(y)  # normalized probs
-
-  loss = cross_entropy_loss::forward(pred, y)
-
-  inf = 1/0
-  if (loss == inf) {
-    print("ERROR: The cross-entropy loss function ouptuts infinity for all-zero predictions.")
-  }
-}
-
-im2col = function() {
-  /*
-   * Test for the `im2col` and `col2im` functions.
-   */
-  print("Testing the im2col and col2im functions.")
-
-  # Generate data
-  C = 3  # num channels
-  Hin = 5  # input height
-  Win = 5  # input width
-  Hf = 3  # filter height
-  Wf = 3  # filter width
-  stride = 2
-  pad = (Hin * stride - Hin + Hf - stride) / 2
-  Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
-  Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
-  x = rand(rows=C, cols=Hin*Win)
-
-  # pad
-  x_pad = util::pad_image(x, Hin, Win, pad, pad, 0)
-
-  # im2col
-  x_cols = util::im2col(x_pad, Hin+2*pad, Win+2*pad, Hf, Wf, stride, stride)
-
-  if (ncol(x_cols) != Hout*Wout) {
-    print("ERROR: im2col does not yield the correct output size: "
-          + ncol(x_cols)+" (actual) vs. "+Hout*Wout+" (correct).")
-  }
-
-  # col2im
-  x_pad2 = util::col2im(x_cols, C, Hin+2*pad, Win+2*pad, Hf, Wf, stride, stride, "none")
-
-  # Equivalency check
-  equivalent = test_util::all_equal(x_pad, x_pad2)
-  if (!equivalent) {
-    print("ERROR: im2col and then col2im does not yield the original image.")
-  }
-}
-
-padding = function() {
-  /*
-   * Test for the `pad_image` and `unpad_image` functions.
-   */
-  print("Testing the padding and unpadding functions.")
-
-  # Generate data
-  C = 3  # num channels
-  Hin = 5  # input height
-  Win = 5  # input width
-  pad = 3  # padding
-  x = rand(rows=C, cols=Hin*Win)
-
-  # Pad image
-  x_pad = util::pad_image(x, Hin, Win, pad, pad, 0)
-
-  # Check for padded rows & columns
-  for (c in 1:C) {
-    x_pad_slice = matrix(x_pad[c,], rows=Hin+2*pad, cols=Win+2*pad)
-    for (i in 1:pad) {
-      rowsum = sum(x_pad_slice[i,])
-      colsum = sum(x_pad_slice[,i])
-      if (rowsum != 0)
-        print("ERROR: Padding was not applied to row " + i + ".")
-      if (colsum != 0)
-        print("ERROR: Padding was not applied to column " + i + ".")
-    }
-  }
-
-  # Unpad image
-  x1 = util::unpad_image(x_pad, Hin, Win, pad, pad)
-
-  # Equivalency check
-  equivalent = test_util::all_equal(x, x1)
-  if (!equivalent) {
-    print("ERROR: Padding and then unpadding does not yield the original image.")
-  }
-}
-
-max_pool2d = function() {
-  /*
-   * Test for the 2D max pooling functions.
-   */
-  print("Testing the 2D max pooling functions.")
-
-  # Generate data
-  N = 2  # num examples
-  C = 3  # num channels
-  Hin = 8  # input height
-  Win = 8  # input width
-  Hf = 2  # filter height
-  Wf = 2  # filter width
-  stride = 2
-  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
-
-  for (padh in 0:3) {
-    for (padw in 0:3) {
-      print(" - Testing w/ padh="+padh+" & padw="+padw+".")
-      #if (1==1) {}  # force correct printing
-      #print("   - Testing forward")
-      [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, padh, padw)
-      [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
-                                                                          stride, stride,
-                                                                          padh, padw)
-      [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win,
-                                                                              Hf, Wf,
-                                                                              stride, stride,
-                                                                              padh, padw)
-
-      # Equivalency check
-      out = matrix(out, rows=1, cols=N*C*Hout*Wout)
-      out_simple = matrix(out_simple, rows=1, cols=N*C*Hout*Wout)
-      out_builtin = matrix(out_builtin, rows=1, cols=N*C*Hout*Wout)
-      for (i in 1:length(out)) {
-        rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
-                                               as.scalar(out_simple[1,i]), 1e-10, 1e-12)
-        rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
-                                               as.scalar(out_builtin[1,i]), 1e-10, 1e-12)
-      }
-
-      #print("   - Testing backward")
-      dout = rand(rows=N, cols=C*Hout*Wout, pdf="normal")
-      dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
-                                padh, padw)
-      dX_simple = max_pool2d_simple::backward(dout, Hout_simple, Wout_simple, X, C, Hin, Win,
-                                              Hf, Wf, stride, stride, padh, padw)
-      dX_builtin = max_pool2d_builtin::backward(dout, Hout_builtin, Wout_builtin, X, C, Hin, Win,
-                                                Hf, Wf, stride, stride, padh, padw)
-
-      # Equivalency check
-      dX = matrix(dX, rows=1, cols=N*C*Hin*Win)
-      dX_simple = matrix(dX_simple, rows=1, cols=N*C*Hin*Win)
-      dX_builtin = matrix(dX_builtin, rows=1, cols=N*C*Hin*Win)
-      for (i in 1:length(dX)) {
-        rel_error = test_util::check_rel_error(as.scalar(dX[1,i]),
-                                               as.scalar(dX_simple[1,i]), 1e-10, 1e-12)
-        rel_error = test_util::check_rel_error(as.scalar(dX[1,i]),
-                                               as.scalar(dX_builtin[1,i]), 1e-10, 1e-12)
-      }
-    }
-  }
-
-  # ---
-  print(" - Testing for correct behavior against known answer w/ pad=0.")
-  # generate data
-  # -- channel 1
-  #  1  2  3  4
-  #  5  6  7  8
-  #  9 10 11 12
-  # 13 14 15 16
-  # -- channel 2
-  #  1  5  9 13
-  #  2  6 10 14
-  #  3  7 11 15
-  #  4  8 12 16
-  C = 2  # num channels
-  Hin = 4  # input height
-  Win = 4  # input width
-  X = matrix(seq(1,16,1), rows=Hin, cols=Win)
-  X = matrix(rbind(X, t(X)), rows=1, cols=C*Hin*Win)  # C=2
-  X = rbind(X, X)  # n=2
-  pad = 0
-
-  # forward
-  [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-  [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
-                                                                      stride, stride, pad, pad)
-  [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
-                                                                          stride, stride, pad, pad)
-
-  # equivalency check
-  # -- channel 1
-  #   6  8
-  #  14 16
-  # -- channel 2
-  #  6  14
-  #  8  16
-  target = matrix("6 8 14 16 6 14 8 16", rows=1, cols=C*Hout*Wout)
-  target = rbind(target, target)  # n=2
-  tmp = test_util::check_all_equal(out, target)
-  tmp = test_util::check_all_equal(out_simple, target)
-  tmp = test_util::check_all_equal(out_builtin, target)
-
-  print(" - Testing for correct behavior against known answer w/ pad=1.")
-  # generate data
-  # -- channel 1
-  #  0  0  0  0  0  0
-  #  0  1  2  3  4  0
-  #  0  5  6  7  8  0
-  #  0  9 10 11 12  0
-  #  0 13 14 15 16  0
-  #  0  0  0  0  0  0
-  # -- channel 2
-  #  0  0  0  0  0  0
-  #  0  1  5  9 13  0
-  #  0  2  6 10 14  0
-  #  0  3  7 11 15  0
-  #  0  4  8 12 16  0
-  #  0  0  0  0  0  0
-  pad = 1
-
-  # forward
-  [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-  [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
-                                                                      stride, stride, pad, pad)
-  [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
-                                                                          stride, stride, pad, pad)
-
-  # equivalency check
-  # -- channel 1
-  #  1  3  4
-  #  9 11 12
-  # 13 15 16
-  # -- channel 2
-  #  1  9 13
-  #  3 11 15
-  #  4 12 16
-  target = matrix("1 3 4 9 11 12 13 15 16 1 9 13 3 11 15 4 12 16", rows=1, cols=C*Hout*Wout)
-  target = rbind(target, target)  # n=2
-  tmp = test_util::check_all_equal(out, target)
-  tmp = test_util::check_all_equal(out_simple, target)
-  tmp = test_util::check_all_equal(out_builtin, target)
-
-  print(" - Testing for correct behavior against known answer w/ all negative matrix w/ pad=0.")
-  # generate data
-  # -- channel 1
-  #  -1  -2  -3  -4
-  #  -5  -6  -7  -8
-  #  -9 -10 -11 -12
-  # -13 -14 -15 -16
-  # -- channel 2
-  #  -1  -5  -9 -13
-  #  -2  -6 -10 -14
-  #  -3  -7 -11 -15
-  #  -4  -8 -12 -16
-  X = X * -1
-  pad = 0
-
-  # forward
-  [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-  [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
-                                                                      stride, stride, pad, pad)
-  [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
-                                                                          stride, stride, pad, pad)
-
-  # equivalency check
-  # -- channel 1
-  #  -1  -3
-  #  -9 -11
-  # -- channel 2
-  #  -1  -9
-  #  -3 -11
-  target = matrix("-1 -3 -9 -11 -1 -9 -3 -11", rows=1, cols=C*Hout*Wout)
-  target = rbind(target, target)  # n=2
-  tmp = test_util::check_all_equal(out, target)
-  tmp = test_util::check_all_equal(out_simple, target)
-  tmp = test_util::check_all_equal(out_builtin, target)
-
-
-  print(" - Testing for correct behavior against known answer w/ all negative matrix w/ pad=1.")
-  # generate data
-  # -- channel 1
-  #  0   0   0   0   0  0
-  #  0  -1  -2  -3  -4  0
-  #  0  -5  -6  -7  -8  0
-  #  0  -9 -10 -11 -12  0
-  #  0 -13 -14 -15 -16  0
-  #  0   0   0   0   0  0
-  # -- channel 2
-  #  0   0   0   0   0  0
-  #  0  -1  -5  -9 -13  0
-  #  0  -2  -6 -10 -14  0
-  #  0  -3  -7 -11 -15  0
-  #  0  -4  -8 -12 -16  0
-  #  0   0   0   0   0  0
-  pad = 1
-
-  # forward
-  [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-  [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
-                                                                      stride, stride, pad, pad)
-  [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
-                                                                          stride, stride, pad, pad)
-
-  # equivalency check
-  # -- channel 1
-  #  -1  -2  -4
-  #  -5  -6  -8
-  # -13 -14 -16
-  # -- channel 2
-  #  -1  -5 -13
-  #  -2  -6 -14
-  #  -4  -8 -16
-  target = matrix("-1 -2 -4 -5 -6 -8 -13 -14 -16 -1 -5 -13 -2 -6 -14 -4 -8 -16",
-                  rows=1, cols=C*Hout*Wout)
-  target = rbind(target, target)  # n=2
-  tmp = test_util::check_all_equal(out, target)
-  tmp = test_util::check_all_equal(out_simple, target)
-  tmp = test_util::check_all_equal(out_builtin, target)
-}
-
-batch_norm2d = function() {
-  /*
-   * Test for the 2D (spatial) batch normalization function.
-   */
-  print("Testing the 2D (spatial) batch normalization function.")
-
-  # Generate data
-  N = 2  # Number of examples
-  C = 3  # num channels
-  Hin = 4  # input height
-  Win = 5  # input width
-  mode = 'train'  # execution mode
-  mu = 0.9  # momentum of moving averages
-  eps = 1e-5  # smoothing term
-  X = matrix("70  29 23 55 72
-              42  98 68 48 39
-              34  73 44  6 40
-              74  18 18 53 53
-
-              63  85 72 61 72
-              32  36 23 29 63
-               9  43 43 49 43
-              31  43 89 94 50
-
-              62  12 32 41 87
-              25  48 99 52 61
-              12  83 60 55 34
-              30  42 68 88 51
-
-
-              67  59 62 67 84
-               8  76 24 19 57
-              10  89 63 72  2
-              59  56 16 15 70
-
-              32  69 55 39 93
-              84  36  4 30 40
-              70 100 36 76 59
-              69  15 40 24 34
-
-              51  67 11 13 32
-              66  85 55 85 38
-              32  35 17 83 34
-              55  58 52  0 99", rows=N, cols=C*Hin*Win)
-
-  # Create layer
-  [gamma, beta, ema_mean, ema_var] = batch_norm2d::init(C)
-
-  # Forward
-  [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
-      batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
-
-  # Equivalency check
-  target = matrix("0.86215019 -0.76679718 -1.00517964  0.26619387  0.94161105
-                  -0.25030172  1.97460198  0.78268933 -0.01191914 -0.36949289
-                  -0.56814504  0.98134136 -0.17084086 -1.68059683 -0.32976246
-                   1.02107191 -1.20383179 -1.20383179  0.18673301  0.18673301
-
-                   0.50426388  1.41921711  0.87856293  0.42108631  0.87856293
-                  -0.78498828 -0.61863315 -1.15928721 -0.90975463  0.50426388
-                  -1.74153018 -0.32751167 -0.32751167 -0.07797909 -0.32751167
-                  -0.82657707 -0.32751167  1.58557224  1.79351616 -0.0363903
-
-                   0.4607178  -1.49978399 -0.71558321 -0.36269283  1.44096887
-                  -0.99005347 -0.08822262  1.91148913  0.06861746  0.42150795
-                  -1.49978399  1.28412855  0.38229787  0.18624771 -0.63716316
-                  -0.79400325 -0.32348287  0.69597805  1.48017895  0.0294075
-
-
-                   0.74295878  0.42511559  0.54430676  0.74295878  1.41837597
-                  -1.60113597  1.10053277 -0.96544927 -1.16410136  0.34565473
-                  -1.52167511  1.61702824  0.5840373   0.94161105 -1.83951855
-                   0.42511559  0.30592418 -1.28329265 -1.32302308  0.86215019
-
-                  -0.78498828  0.75379658  0.17155361 -0.4938668   1.75192738
-                   1.37762833 -0.61863315 -1.9494741  -0.86816585 -0.45227802
-                   0.79538536  2.04304862 -0.61863315  1.04491806  0.33790874
-                   0.75379658 -1.49199748 -0.45227802 -1.11769855 -0.70181072
-
-                   0.0294075   0.65676796 -1.53899395 -1.46057391 -0.71558321
-                   0.61755812  1.36254871  0.18624771  1.36254871 -0.48032296
-                  -0.71558321 -0.59795308 -1.30373383  1.28412855 -0.63716316
-                   0.18624771  0.30387771  0.06861746 -1.97030437  1.91148913",
-                  rows=1, cols=N*C*Hin*Win)
-  out = matrix(out, rows=1, cols=N*C*Hin*Win)
-  for (i in 1:length(out)) {
-    rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
-                                           as.scalar(target[1,i]), 1e-3, 1e-4)
-  }
-}
-
-tanh = function() {
-  /*
-   * Test for the `tanh` forward function.
-   */
-  print("Testing the tanh forward function.")
-
-  # Generate data
-  N = 2  # num examples
-  C = 3  # num channels
-  X = rand(rows=N, cols=C, pdf="normal")
-
-  out = tanh::forward(X)
-  out_ref = (exp(X) - exp(-X)) / (exp(X) + exp(-X))
-
-  # Equivalency check
-  for (i in 1:nrow(out)) {
-    for (j in 1:ncol(out)) {
-      rel_error = test_util::check_rel_error(as.scalar(out[i,j]), as.scalar(out_ref[i,j]),
-                                             1e-10, 1e-12)
-    }
-  }
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/util.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/util.dml b/scripts/staging/SystemML-NN/nn/test/util.dml
deleted file mode 100644
index e32a885..0000000
--- a/scripts/staging/SystemML-NN/nn/test/util.dml
+++ /dev/null
@@ -1,155 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Test utility functions.
- */
-
-all_equal = function(matrix[double] X1, matrix[double] X2)
-    return(boolean equivalent) {
-  /*
-   * Determine if two matrices are equivalent.
-   *
-   * Inputs:
-   *  - X1: Inputs, of shape (any, any).
-   *  - X2: Inputs, of same shape as X1.
-   *
-   * Outputs:
-   *  - equivalent: Whether or not the two matrices are equivalent.
-   */
-  equivalent = as.logical(prod(X1 == X2))
-}
-
-check_all_equal = function(matrix[double] X1, matrix[double] X2)
-    return(boolean equivalent) {
-  /*
-   * Check if two matrices are equivalent, and report any issues.
-   *
-   * Issues an "ERROR" statement if elements of the two matrices are
-   * not equal.
-   *
-   * Inputs:
-   *  - X1: Inputs, of shape (any, any).
-   *  - X2: Inputs, of same shape as X1.
-   *
-   * Outputs:
-   *  - equivalent: Whether or not the two matrices are equivalent.
-   */
-  # Determine if matrices are equivalent
-  equivalent = all_equal(X1, X2)
-
-  # Evaluate relative error
-  if (!equivalent) {
-    print("ERROR: The two matrices are not equivalent.")
-  }
-}
-
-compute_rel_error = function(double x1, double x2)
-    return (double rel_error) {
-  /*
-   * Relative error measure between two values.
-   *
-   * Uses smoothing to avoid divide-by-zero errors.
-   *
-   * Inputs:
-   *  - x1: First value.
-   *  - x2: Second value.
-   *
-   * Outputs:
-   *  - rel_error: Relative error measure between the two values.
-   */
-  rel_error = abs(x1-x2) / max(1e-8, abs(x1)+abs(x2))
-}
-
-check_rel_error = function(double x1, double x2, double thresh_error, double thresh_warn)
-    return (double rel_error) {
-  /*
-   * Check and report any issues with the relative error measure between
-   * two values.
-   *
-   * Issues an "ERROR" statement for relative errors > thresh_error,
-   * indicating that the implementation is likely incorrect.
-   *
-   * Issues a "WARNING" statement for relative errors < thresh_error
-   * but > thresh_warn, indicating that the implementation may be
-   * incorrect.
-   *
-   * Inputs:
-   *  - x1: First value.
-   *  - x2: Second value.
-   *  - thresh_error: Error threshold.
-   *  - thresh_warn: Warning threshold.
-   *
-   * Outputs:
-   *  - rel_error: Relative error measure between the two values.
-   */
-  # Compute relative error
-  rel_error = compute_rel_error(x1, x2)
-
-  # Evaluate relative error
-  if (rel_error > thresh_error) {
-    print("ERROR: Relative error " + rel_error + " > " + thresh_error + " with " + x1 +
-          " vs " + x2 + ".")
-  }
-  else if (rel_error > thresh_warn & rel_error <= thresh_error) {
-    print("WARNING: Relative error " + rel_error + " > " + thresh_warn + " & <= " + thresh_error +
-          " with " + x1 + " vs " + x2 + ".")
-  }
-}
-
-check_rel_grad_error = function(double dw_a, double dw_n, double lossph, double lossmh)
-    return (double rel_error) {
-  /*
-   * Check and report any issues with the relative error measure between
-   * the analytical and numerical partial derivatives.
-   *
-   *  - Issues an "ERROR" statement for relative errors > 1e-2,
-   *  indicating that the gradient is likely incorrect.
-   *  - Issues a "WARNING" statement for relative errors < 1e-2
-   *  but > 1e-4, indicating that the gradient may be incorrect.
-   *
-   * Inputs:
-   *  - dw_a: Analytical partial derivative wrt w.
-   *  - dw_n: Numerical partial derivative wrt w.
-   *  - lossph: Loss evaluated with w set to w+h.
-   *  - lossmh: Loss evaluated with w set to w-h.
-   *
-   * Outputs:
-   *  - rel_error: Relative error measure between the two derivatives.
-   */
-  # Compute relative error
-  rel_error = compute_rel_error(dw_a, dw_n)
-
-  # Evaluate relative error
-  thresh_error = 1e-2
-  thresh_warn = 1e-4
-  if (rel_error > thresh_error) {
-    print("ERROR: Relative error " + rel_error + " > " + thresh_error + " with " + dw_a +
-          " analytical vs " + dw_n + " numerical, with lossph " + lossph +
-          " and lossmh " + lossmh)
-  }
-  else if (rel_error > thresh_warn & rel_error <= thresh_error) {
-    print("WARNING: Relative error " + rel_error + " > " + thresh_warn + " & <= " + thresh_error +
-          " with " + dw_a + " analytical vs " + dw_n + " numerical, with lossph " + lossph +
-          " and lossmh " + lossmh)
-  }
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/util.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/util.dml b/scripts/staging/SystemML-NN/nn/util.dml
deleted file mode 100644
index 3a73f08..0000000
--- a/scripts/staging/SystemML-NN/nn/util.dml
+++ /dev/null
@@ -1,202 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Utility functions.
- */
-
-channel_sums = function(matrix[double] X, int C, int Hin, int Win)
-    return (matrix[double] out) {
-  /*
-   * Computes a channel-wise summation over a 4D input.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (C, 1).
-   */
-  # Here we sum each column, reshape to (C, Hin*Win), and sum each row to result in the summation
-  # for each channel.
-  out = rowSums(matrix(colSums(X), rows=C, cols=Hin*Win))  # shape (C, 1)
-}
-
-im2col = function(matrix[double] img, int Hin, int Win, int Hf, int Wf, int strideh, int stridew)
-    return (matrix[double] img_cols) {
-  /*
-   * Rearrange local image regions (patches) into columns.
-   *
-   * Assumes image has already been padded as necessary.
-   *
-   * Inputs:
-   *  - img: Input image, of shape (C, Hin*Win), where C is the number
-   *      of input channels (depth).
-   *  - Hin: Input height, including padding.
-   *  - Win: Input width, including padding.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *
-   * Outputs:
-   *  - img_cols: Local spatial regions (patches) of the image stretched
-   *      out into columns, of shape (C*Hf*Wf, Hout*Wout).
-   */
-  C = nrow(img)
-  Hout = as.integer(floor((Hin-Hf)/strideh + 1))
-  Wout = as.integer(floor((Win-Wf)/stridew + 1))
-
-  # Note: We start with `img_cols` transposed to allow for row-major
-  # left-indexing inside the loop, which is more performant.
-  img_cols = matrix(0, rows=Hout*Wout, cols=C*Hf*Wf)  # zeros
-  parfor (hout in 1:Hout, check=0) {  # all output rows
-    hin = (hout-1)*strideh + 1
-    parfor (wout in 1:Wout, check=0) {  # all output columns
-      win = (wout-1)*stridew + 1
-      # Extract a local patch of the input image corresponding spatially to the filter sizes.
-      img_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
-      parfor (c in 1:C) {  # all channels
-        img_slice = matrix(img[c,], rows=Hin, cols=Win)  # reshape
-        img_patch[c,] = matrix(img_slice[hin:hin+Hf-1, win:win+Wf-1], rows=1, cols=Hf*Wf)
-      }
-      img_cols[(hout-1)*Wout + wout,] = t(matrix(img_patch, rows=C*Hf*Wf, cols=1))  # reshape
-    }
-  }
-  img_cols = t(img_cols)
-}
-
-col2im = function(matrix[double] img_cols, int C, int Hin, int Win, int Hf, int Wf,
-                  int strideh, int stridew, string reduction)
-    return (matrix[double] img) {
-  /*
-   * Create an image from columns of local image regions (patches).
-   *
-   * The reduction strategy determines how to deal with overlapping
-   * patches.  If it is set to "add", any overlapping patches will be
-   * added together when creating the image.  This is useful when
-   * computing gradients on the original image given gradients on the
-   * patches.  Otherwise, if "none" is provided, any overlapping
-   * patches will just override previous ones when creating the image.
-   * This is useful when recreating an image from the output of
-   * `im2col`.
-   *
-   * Assumes original image was already padded as necessary.
-   *
-   * Inputs:
-   *  - img_cols: Local spatial regions (patches) of the image stretched
-   *      out into columns, of shape (C*Hf*Wf, Hout*Wout).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height, including padding.
-   *  - Win: Input width, including padding.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - reduction: The reduction strategy to use for overlapping
-   *      patches.  Valid options are "add" and "none".
-   *
-   * Outputs:
-   *  - img: Input image, of shape (C, Hin*Win).
-   */
-  Hout = as.integer(floor((Hin-Hf)/strideh + 1))
-  Wout = as.integer(floor((Win-Wf)/stridew + 1))
-
-  img = matrix(0, rows=C, cols=Hin*Win)  # zeros
-  for (hout in 1:Hout) {  # all output rows
-    hin = (hout-1)*strideh + 1
-    for (wout in 1:Wout) {  # all output columns
-      win = (wout-1)*stridew + 1
-      # Extract a local patch of the input image corresponding spatially to the filter sizes.
-      img_patch = matrix(img_cols[,(hout-1)*Wout + wout], rows=C, cols=Hf*Wf)  # reshape
-      parfor (c in 1:C) {  # all channels
-        img_patch_slice = matrix(img_patch[c,], rows=Hf, cols=Wf)  # reshape
-        if (reduction == "add") {
-          img_slice = matrix(0, rows=Hin, cols=Win)
-          img_slice[hin:hin+Hf-1, win:win+Wf-1] = img_patch_slice
-          img[c,] = img[c,] + matrix(img_slice, rows=1, cols=Hin*Win)
-        } else {
-          img_slice = matrix(img[c,], rows=Hin, cols=Win)
-          img_slice[hin:hin+Hf-1, win:win+Wf-1] = img_patch_slice
-          img[c,] = matrix(img_slice, rows=1, cols=Hin*Win)
-        }
-      }
-    }
-  }
-}
-
-pad_image = function(matrix[double] img, int Hin, int Win, int padh, int padw, double pad_value)
-    return (matrix[double] img_padded) {
-  /*
-   * Pads an image along the height and width dimensions with a constant value (typically zero).
-   *
-   * Inputs:
-   *  - img: Input image, of shape (C, Hin*Win), where C is the number
-   *      of input channels (depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - padh: Padding for top and bottom sides.
-   *  - padw: Padding for left and right sides.
-   *  - pad_value: Value to use for the padding.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - img_padded: The input image padded along the height and width
-   *      dimensions, of shape (C, (Hin+2*padh)*(Win+2*padw)).
-   */
-  C = nrow(img)
-  img_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
-  parfor (c in 1:C) {
-    img_slice = matrix(img[c,], rows=Hin, cols=Win)  # depth slice C reshaped
-    img_padded_slice = matrix(pad_value, rows=Hin+2*padh, cols=Win+2*padw)
-    img_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = img_slice
-    img_padded[c,] = matrix(img_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
-  }
-}
-
-unpad_image = function(matrix[double] img_padded, int Hin, int Win, int padh, int padw)
-    return (matrix[double] img) {
-  /*
-   * Unpads an image along the height and width dimensions.
-   *
-   * Inputs:
-   *  - img_padded: The input image padded along the height and width
-   *      dimensions, of shape (C, (Hin+2*padh)*(Win+2*padw)).
-   *  - Hin: Input height of unpadded image.
-   *  - Win: Input width of unpadded image.
-   *  - padh: Padding for top and bottom sides.
-   *  - padw: Padding for left and right sides.
-   *
-   * Outputs:
-   *  - img: Input image, of shape (C, Hin*Win), where C is the number
-   *      of input channels (depth).
-   */
-  C = nrow(img_padded)
-  img = matrix(0, rows=C, cols=Hin*Win)
-  parfor (c in 1:C) {
-    img_padded_slice = matrix(img_padded[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
-    img_slice = img_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
-    img[c,] = matrix(img_slice, rows=1, cols=Hin*Win)
-  }
-}
-
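As a quick illustration of how these utilities fit together, the following sketch pads one image, cuts it into patches with `im2col`, and then reverses both steps. It is based only on the signatures documented above; the toy sizes (2 examples, 3 channels of 8x8 pixels) and the `source` path are assumptions for the example:

```
source("nn/util.dml") as util

# Toy input: N=2 examples, C=3 channels of 8x8 pixels, stored row-wise as (N, C*Hin*Win).
X = rand(rows=2, cols=3*8*8)

# Per-channel sums over the whole mini-batch, shape (3, 1).
sums = util::channel_sums(X, 3, 8, 8)

# Pad one example by 1 pixel on each side, then cut 3x3 patches with stride 1.
img = matrix(X[1,], rows=3, cols=8*8)                 # one example as (C, Hin*Win)
img_pad = util::pad_image(img, 8, 8, 1, 1, 0.0)       # shape (3, 10*10)
patches = util::im2col(img_pad, 10, 10, 3, 3, 1, 1)   # shape (3*3*3, 8*8)

# Reassemble the padded image from its patches, then strip the padding again.
img_back = util::col2im(patches, 3, 10, 10, 3, 3, 1, 1, "none")
img_orig = util::unpad_image(img_back, 8, 8, 1, 1)    # shape (3, 8*8)
```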


[02/11] incubator-systemml git commit: [SYSTEMML-1524] Move `examples` into `nn`

Posted by du...@apache.org.
[SYSTEMML-1524] Move `examples` into `nn`


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/1f5cf697
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/1f5cf697
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/1f5cf697

Branch: refs/heads/master
Commit: 1f5cf697c49313861a3bdbcc634f7a56daabdc16
Parents: aa2211a
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Wed Apr 26 14:40:40 2017 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Wed Apr 26 14:40:40 2017 -0700

----------------------------------------------------------------------
 scripts/staging/SystemML-NN/README.md           |   2 +-
 .../examples/Example - MNIST LeNet.ipynb        | 198 -----------
 .../Example - MNIST Softmax Classifier.ipynb    | 185 -----------
 scripts/staging/SystemML-NN/examples/README.md  |  75 -----
 .../SystemML-NN/examples/get_mnist_data.sh      |  28 --
 .../examples/mnist_lenet-predict.dml            |  87 -----
 .../SystemML-NN/examples/mnist_lenet-train.dml  | 123 -------
 .../SystemML-NN/examples/mnist_lenet.dml        | 331 -------------------
 .../examples/mnist_softmax-predict.dml          |  74 -----
 .../examples/mnist_softmax-train.dml            | 108 ------
 .../SystemML-NN/examples/mnist_softmax.dml      | 177 ----------
 scripts/staging/SystemML-NN/examples/nn         |   1 -
 .../nn/examples/Example - MNIST LeNet.ipynb     | 189 +++++++++++
 .../Example - MNIST Softmax Classifier.ipynb    | 179 ++++++++++
 .../staging/SystemML-NN/nn/examples/README.md   |  74 +++++
 .../SystemML-NN/nn/examples/get_mnist_data.sh   |  28 ++
 .../nn/examples/mnist_lenet-predict.dml         |  91 +++++
 .../nn/examples/mnist_lenet-train.dml           | 123 +++++++
 .../SystemML-NN/nn/examples/mnist_lenet.dml     | 331 +++++++++++++++++++
 .../nn/examples/mnist_softmax-predict.dml       |  77 +++++
 .../nn/examples/mnist_softmax-train.dml         | 110 ++++++
 .../SystemML-NN/nn/examples/mnist_softmax.dml   | 178 ++++++++++
 22 files changed, 1381 insertions(+), 1388 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/README.md
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/README.md b/scripts/staging/SystemML-NN/README.md
index 3943765..b80f2c6 100644
--- a/scripts/staging/SystemML-NN/README.md
+++ b/scripts/staging/SystemML-NN/README.md
@@ -22,7 +22,7 @@ limitations under the License.
 ### A deep learning library for [Apache SystemML](https://github.com/apache/incubator-systemml).
 
 ## Examples:
-#### Please see the [`examples`](examples) folder for more detailed examples, or view the following two quick examples.
+#### Please see the [`examples`](nn/examples) folder for more detailed examples, or view the following two quick examples.
 ### Neural net for regression with vanilla SGD:
 ```python
 # Imports

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/Example - MNIST LeNet.ipynb
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/Example - MNIST LeNet.ipynb b/scripts/staging/SystemML-NN/examples/Example - MNIST LeNet.ipynb
deleted file mode 100644
index 3ad210e..0000000
--- a/scripts/staging/SystemML-NN/examples/Example - MNIST LeNet.ipynb	
+++ /dev/null
@@ -1,198 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Quick Setup"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "# Create a SystemML MLContext object\n",
-    "from systemml import MLContext, dml\n",
-    "ml = MLContext(sc)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Download Data - MNIST"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9].  Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "%%sh\n",
-    "mkdir -p data/mnist/\n",
-    "cd data/mnist/\n",
-    "curl -O http://pjreddie.com/media/files/mnist_train.csv\n",
-    "curl -O http://pjreddie.com/media/files/mnist_test.csv"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## SystemML \"LeNet\" Neural Network"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 1. Train"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "script_string = \"\"\"\n",
-    "source(\"mnist_lenet.dml\") as mnist_lenet\n",
-    "\n",
-    "# Read training data\n",
-    "data = read($data, format=\"csv\")\n",
-    "n = nrow(data)\n",
-    "\n",
-    "# Extract images and labels\n",
-    "images = data[,2:ncol(data)]\n",
-    "labels = data[,1]\n",
-    "\n",
-    "# Scale images to [-1,1], and one-hot encode the labels\n",
-    "images = (images / 255.0) * 2 - 1\n",
-    "labels = table(seq(1, n), labels+1, n, 10)\n",
-    "\n",
-    "# Split into training (55,000 examples) and validation (5,000 examples)\n",
-    "X = images[5001:nrow(images),]\n",
-    "X_val = images[1:5000,]\n",
-    "y = labels[5001:nrow(images),]\n",
-    "y_val = labels[1:5000,]\n",
-    "\n",
-    "# Train\n",
-    "[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win)\n",
-    "\"\"\"\n",
-    "script = (dml(script_string).input(\"$data\", \"data/mnist/mnist_train.csv\")\n",
-    "                            .input(C=1, Hin=28, Win=28)\n",
-    "                            .output(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))\n",
-    "W1, b1, W2, b2, W3, b3, W4, b4 = (ml.execute(script)\n",
-    "                                    .get(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 2. Compute Test Accuracy"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "script_string = \"\"\"\n",
-    "source(\"mnist_lenet.dml\") as mnist_lenet\n",
-    "\n",
-    "# Read test data\n",
-    "data = read($data, format=\"csv\")\n",
-    "n = nrow(data)\n",
-    "\n",
-    "# Extract images and labels\n",
-    "X_test = data[,2:ncol(data)]\n",
-    "y_test = data[,1]\n",
-    "\n",
-    "# Scale images to [-1,1], and one-hot encode the labels\n",
-    "X_test = (X_test / 255.0) * 2 - 1\n",
-    "y_test = table(seq(1, n), y_test+1, n, 10)\n",
-    "\n",
-    "# Eval on test set\n",
-    "probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)\n",
-    "[loss, accuracy] = mnist_lenet::eval(probs, y_test)\n",
-    "\n",
-    "print(\"Test Accuracy: \" + accuracy)\n",
-    "\"\"\"\n",
-    "script = dml(script_string).input(**{\"$data\": \"data/mnist/mnist_train.csv\",\n",
-    "                                     \"C\": 1, \"Hin\": 28, \"Win\": 28,\n",
-    "                                     \"W1\": W1, \"b1\": b1,\n",
-    "                                     \"W2\": W2, \"b2\": b2,\n",
-    "                                     \"W3\": W3, \"b3\": b3,\n",
-    "                                     \"W4\": W4, \"b4\": b4})\n",
-    "ml.execute(script)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 3. Extract Model Into Spark DataFrames For Future Use"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "W1_df = W1.toDF()\n",
-    "b1_df = b1.toDF()\n",
-    "W2_df = W2.toDF()\n",
-    "b2_df = b2.toDF()\n",
-    "W3_df = W3.toDF()\n",
-    "b3_df = b3.toDF()\n",
-    "W4_df = W4.toDF()\n",
-    "b4_df = b4.toDF()\n",
-    "W1_df, b1_df, W2_df, b2_df, W3_df, b3_df, W4_df, b4_df"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.5.2"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/Example - MNIST Softmax Classifier.ipynb
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/Example - MNIST Softmax Classifier.ipynb b/scripts/staging/SystemML-NN/examples/Example - MNIST Softmax Classifier.ipynb
deleted file mode 100644
index 7f2c2f0..0000000
--- a/scripts/staging/SystemML-NN/examples/Example - MNIST Softmax Classifier.ipynb	
+++ /dev/null
@@ -1,185 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Quick Setup"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false,
-    "scrolled": false
-   },
-   "outputs": [],
-   "source": [
-    "# Create a SystemML MLContext object\n",
-    "from systemml import MLContext, dml\n",
-    "ml = MLContext(sc)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Download Data - MNIST"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9].  Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "%%sh\n",
-    "mkdir -p data/mnist/\n",
-    "cd data/mnist/\n",
-    "curl -O http://pjreddie.com/media/files/mnist_train.csv\n",
-    "curl -O http://pjreddie.com/media/files/mnist_test.csv"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## SystemML Softmax Model"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 1. Train"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "training = \"\"\"\n",
-    "source(\"mnist_softmax.dml\") as mnist_softmax\n",
-    "\n",
-    "# Read training data\n",
-    "data = read($data, format=\"csv\")\n",
-    "n = nrow(data)\n",
-    "\n",
-    "# Extract images and labels\n",
-    "images = data[,2:ncol(data)]\n",
-    "labels = data[,1]\n",
-    "\n",
-    "# Scale images to [0,1], and one-hot encode the labels\n",
-    "images = images / 255.0\n",
-    "labels = table(seq(1, n), labels+1, n, 10)\n",
-    "\n",
-    "# Split into training (55,000 examples) and validation (5,000 examples)\n",
-    "X = images[5001:nrow(images),]\n",
-    "X_val = images[1:5000,]\n",
-    "y = labels[5001:nrow(images),]\n",
-    "y_val = labels[1:5000,]\n",
-    "\n",
-    "# Train\n",
-    "[W, b] = mnist_softmax::train(X, y, X_val, y_val)\n",
-    "\"\"\"\n",
-    "script = dml(training).input(\"$data\", \"data/mnist/mnist_train.csv\").output(\"W\", \"b\")\n",
-    "W, b = ml.execute(script).get(\"W\", \"b\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 2. Compute Test Accuracy"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "testing = \"\"\"\n",
-    "source(\"mnist_softmax.dml\") as mnist_softmax\n",
-    "\n",
-    "# Read test data\n",
-    "data = read($data, format=\"csv\")\n",
-    "n = nrow(data)\n",
-    "\n",
-    "# Extract images and labels\n",
-    "X_test = data[,2:ncol(data)]\n",
-    "y_test = data[,1]\n",
-    "\n",
-    "# Scale images to [0,1], and one-hot encode the labels\n",
-    "X_test = X_test / 255.0\n",
-    "y_test = table(seq(1, n), y_test+1, n, 10)\n",
-    "\n",
-    "# Eval on test set\n",
-    "probs = mnist_softmax::predict(X_test, W, b)\n",
-    "[loss, accuracy] = mnist_softmax::eval(probs, y_test)\n",
-    "\n",
-    "print(\"Test Accuracy: \" + accuracy)\n",
-    "\"\"\"\n",
-    "script = dml(testing).input(\"$data\", \"data/mnist/mnist_test.csv\", W=W, b=b)\n",
-    "ml.execute(script)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 3. Extract Model Into Spark DataFrames For Future Use"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "W_df = W.toDF()\n",
-    "b_df = b.toDF()\n",
-    "W_df, b_df"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.5.2"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/README.md
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/README.md b/scripts/staging/SystemML-NN/examples/README.md
deleted file mode 100644
index ffacea2..0000000
--- a/scripts/staging/SystemML-NN/examples/README.md
+++ /dev/null
@@ -1,75 +0,0 @@
-<!--
-{% comment %}
-Licensed to the Apache Software Foundation (ASF) under one or more
-contributor license agreements.  See the NOTICE file distributed with
-this work for additional information regarding copyright ownership.
-The ASF licenses this file to you under the Apache License, Version 2.0
-(the "License"); you may not use this file except in compliance with
-the License.  You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-{% endcomment %}
--->
-
-# SystemML-NN Examples
-
-#### This folder contains scripts and PySpark Jupyter notebooks serving as examples of using the *SystemML-NN* (`nn`) deep learning library.
-
----
-
-# Examples
-### MNIST Softmax Classifier
-
-* This example trains a softmax classifier, which is essentially a multi-class logistic regression model, on the MNIST data.  The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images.
-* Notebook: `Example - MNIST Softmax Classifier.ipynb`.
-* DML Functions: `mnist_softmax.dml`
-* Training script: `mnist_softmax-train.dml`
-* Prediction script: `mnist_softmax-predict.dml`
-
-### MNIST "LeNet" Neural Net
-
-* This example trains a neural network on the MNIST data using a ["LeNet" architecture](http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf). The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images.
-* Notebook: `Example - MNIST LeNet.ipynb`.
-* DML Functions: `mnist_lenet.dml`
-* Training script: `mnist_lenet-train.dml`
-* Prediction script: `mnist_lenet-predict.dml`
-
----
-
-# Setup
-## Code
-* To run the examples, please first download and unzip the project via GitHub using the "Clone or download" button on the [homepage of the project](https://github.com/dusenberrymw/systemml-nn), *or* via the following commands:
-
-  ```
-  curl -LO https://github.com/dusenberrymw/systemml-nn/archive/master.zip
-  unzip master.zip
-  ```
-
-* Then, move into the `examples` folder via:
-  ```
-  cd systemml-nn-master/examples/
-  ```
-
-## Data
-* These examples use the classic [MNIST](http://yann.lecun.com/exdb/mnist/) dataset, which contains labeled 28x28 pixel images of handwritten digits in the range of 0-9.  There are 60,000 training images, and 10,000 testing images.  Of the 60,000 training images, 5,000 will be used as validation images.
-* **Download**:
-  * **Notebooks**: The data will be automatically downloaded as a step in either of the example notebooks.
-  * **Training scripts**: Please run `get_mnist_data.sh` to download the data separately.
-
-## Execution
-* These examples contain scripts written in SystemML's R-like language (`*.dml`), as well as PySpark Jupyter notebooks (`*.ipynb`).  The scripts contain the math for the algorithms, enclosed in functions, and the notebooks serve as full, end-to-end examples of reading in data, training models using the functions within the scripts, and evaluating final performance.
-* **Notebooks**: To run the notebook examples, please install the SystemML Python package with `pip install systemml`, and then startup Jupyter in the following manner from this directory (or for more information, please see [this great blog post](http://spark.tc/0-to-life-changing-application-with-apache-systemml/)):
-
-  ```
-  PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook" pyspark --master local[*] --driver-memory 3G --driver-class-path SystemML.jar --jars SystemML.jar
-  ```
-
-  Note that all printed output, such as training statistics, from the SystemML scripts will be sent to the terminal in which Jupyter was started (for now...).
-
-* **Scripts**: To run the scripts from the command line using `spark-submit`, please see the comments located at the top of the `-train` and `-predict` scripts.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/get_mnist_data.sh
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/get_mnist_data.sh b/scripts/staging/SystemML-NN/examples/get_mnist_data.sh
deleted file mode 100755
index deb0c40..0000000
--- a/scripts/staging/SystemML-NN/examples/get_mnist_data.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-DIR="$(cd "$(dirname "$0")" && pwd)"
-mkdir -p $DIR/data/mnist/
-cd $DIR/data/mnist/
-curl -O https://pjreddie.com/media/files/mnist_train.csv
-curl -O https://pjreddie.com/media/files/mnist_test.csv
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/mnist_lenet-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_lenet-predict.dml b/scripts/staging/SystemML-NN/examples/mnist_lenet-predict.dml
deleted file mode 100644
index 759418d..0000000
--- a/scripts/staging/SystemML-NN/examples/mnist_lenet-predict.dml
+++ /dev/null
@@ -1,87 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# MNIST LeNet - Predict
-#
-# This script computes the class probability predictions of a
-# trained convolutional net using the "LeNet" architecture on
-# images of handwritten digits.
-#
-# Inputs:
-#  - X: File containing training images.
-#     The format is "pixel_1, pixel_2, ..., pixel_n".
-#  - C: Number of color channels in the images.
-#  - Hin: Input image height.
-#  - Win: Input image width.
-#  - model_dir: Directory containing the trained weights and biases
-#     of the model.
-#  - out_dir: Directory to store class probability predictions for
-#     each image.
-#  - fmt: [DEFAULT: "csv"] File format of `X` and output predictions.
-#     Options include: "csv", "mm", "text", and "binary".
-#
-# Outputs:
-#  - probs: File containing class probability predictions for each
-#     image.
-#
-# Data:
-# The X file should contain images of handwritten digits,
-# where each example is a 28x28 pixel image of grayscale values in
-# the range [0,255] stretched out as 784 pixels.
-#
-# Sample Invocation:
-# Execute using Spark
-#   ```
-#   spark-submit --master local[*] --driver-memory 5G
-#   --conf spark.driver.maxResultSize=0 --conf spark.akka.frameSize=128
-#   $SYSTEMML_HOME/target/SystemML.jar -f mnist_lenet-predict.dml
-#   -nvargs X=data/mnist/images.csv C=1 Hin=28 Win=28
-#   model_dir=model/mnist_lenet out_dir=data/mnist
-#   ```
-#
-source("mnist_lenet.dml") as mnist_lenet
-
-# Read training data
-fmt = ifdef($fmt, "csv")
-X = read($X, format=fmt)
-C = $C
-Hin = $Hin
-Win = $Win
-
-# Scale images to [-1,1]
-X = (X / 255.0) * 2 - 1
-
-# Read model coefficients
-W1 = read($model_dir+"/W1")
-b1 = read($model_dir+"/b1")
-W2 = read($model_dir+"/W2")
-b2 = read($model_dir+"/b2")
-W3 = read($model_dir+"/W3")
-b3 = read($model_dir+"/b3")
-W4 = read($model_dir+"/W4")
-b4 = read($model_dir+"/b4")
-
-# Predict classes
-probs = mnist_lenet::predict(X, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
-
-# Output results
-write(probs, $out_dir+"/probs."+fmt, format=fmt)
-
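If class labels rather than probabilities are wanted downstream, a small follow-up along these lines could be appended to the script above; the `predictions` file name is an illustrative choice, and `rowIndexMax` is the same builtin the library uses elsewhere to pick the most likely class:

```
# Hedged sketch: map class probabilities back to digit labels in [0,9].
# Assumes `probs` has shape (N, 10), with column k+1 holding the probability of digit k.
pred_labels = rowIndexMax(probs) - 1   # most likely class per image, shape (N, 1)
write(pred_labels, $out_dir+"/predictions."+fmt, format=fmt)
```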

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/mnist_lenet-train.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_lenet-train.dml b/scripts/staging/SystemML-NN/examples/mnist_lenet-train.dml
deleted file mode 100644
index eafb34c..0000000
--- a/scripts/staging/SystemML-NN/examples/mnist_lenet-train.dml
+++ /dev/null
@@ -1,123 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# MNIST LeNet - Train
-#
-# This script trains a convolutional net using the "LeNet" architecture
-# on images of handwritten digits.
-#
-# Inputs:
-#  - train: File containing labeled MNIST training images.
-#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
-#  - test: File containing labeled MNIST test images.
-#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
-#  - C: Number of color channels in the images.
-#  - Hin: Input image height.
-#  - Win: Input image width.
-#  - epochs: [DEFAULT: 10] Total number of full training loops over
-#     the full data set.
-#  - out_dir: [DEFAULT: "."] Directory to store weights and bias
-#     matrices of trained model, as well as final test accuracy.
-#  - fmt: [DEFAULT: "csv"] File format of `train` and `test` data.
-#     Options include: "csv", "mm", "text", and "binary".
-#
-# Outputs:
-#  - W1, W2, W3, W4: Files containing the trained weights of the model.
-#  - b1, b2, b3, b4: Files containing the trained biases of the model.
-#  - accuracy: File containing the final accuracy on the test data.
-#
-# Data:
-# The MNIST dataset contains labeled images of handwritten digits,
-# where each example is a 28x28 pixel image of grayscale values in
-# the range [0,255] stretched out as 784 pixels, and each label is
-# one of 10 possible digits in [0,9].
-#
-# Sample Invocation (running from within the `examples` folder):
-# 1. Download data (60,000 training examples, and 10,000 test examples)
-#   ```
-#   get_mnist_data.sh
-#   ```
-#
-# 2. Execute using Spark
-#   ```
-#   spark-submit --master local[*] --driver-memory 10G
-#   --conf spark.driver.maxResultSize=0 --conf spark.akka.frameSize=128
-#   $SYSTEMML_HOME/target/SystemML.jar -f mnist_lenet-train.dml
-#   -nvargs train=data/mnist/mnist_train.csv test=data/mnist/mnist_test.csv
-#   C=1 Hin=28 Win=28 epochs=10 out_dir=model/mnist_lenet
-#   ```
-#
-source("mnist_lenet.dml") as mnist_lenet
-
-# Read training data & settings
-fmt = ifdef($fmt, "csv")
-train = read($train, format=fmt)
-test = read($test, format=fmt)
-C = $C
-Hin = $Hin
-Win = $Win
-epochs = ifdef($epochs, 10)
-out_dir = ifdef($out_dir, ".")
-
-# Extract images and labels
-images = train[,2:ncol(train)]
-labels = train[,1]
-X_test = test[,2:ncol(test)]
-y_test = test[,1]
-
-# Scale images to [-1,1], and one-hot encode the labels
-n = nrow(train)
-n_test = nrow(test)
-images = (images / 255.0) * 2 - 1
-labels = table(seq(1, n), labels+1, n, 10)
-X_test = (X_test / 255.0) * 2 - 1
-y_test = table(seq(1, n_test), y_test+1, n_test, 10)
-
-# Split into training (55,000 examples) and validation (5,000 examples)
-X = images[5001:nrow(images),]
-X_val = images[1:5000,]
-y = labels[5001:nrow(images),]
-y_val = labels[1:5000,]
-
-# Train
-[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, epochs)
-
-# Write model out
-write(W1, out_dir+"/W1")
-write(b1, out_dir+"/b1")
-write(W2, out_dir+"/W2")
-write(b2, out_dir+"/b2")
-write(W3, out_dir+"/W3")
-write(b3, out_dir+"/b3")
-write(W4, out_dir+"/W4")
-write(b4, out_dir+"/b4")
-
-# Eval on test set
-probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
-[loss, accuracy] = mnist_lenet::eval(probs, y_test)
-
-# Output results
-print("Test Accuracy: " + accuracy)
-write(accuracy, out_dir+"/accuracy")
-
-print("")
-print("")
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_lenet.dml b/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
deleted file mode 100644
index e5755c4..0000000
--- a/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
+++ /dev/null
@@ -1,331 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * MNIST LeNet Example
- */
-# Imports
-source("nn/layers/affine.dml") as affine
-source("nn/layers/conv2d_builtin.dml") as conv2d
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/dropout.dml") as dropout
-source("nn/layers/l2_reg.dml") as l2_reg
-source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
-source("nn/layers/relu.dml") as relu
-source("nn/layers/softmax.dml") as softmax
-source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
-
-train = function(matrix[double] X, matrix[double] y,
-                 matrix[double] X_val, matrix[double] y_val,
-                 int C, int Hin, int Win, int epochs)
-    return (matrix[double] W1, matrix[double] b1,
-            matrix[double] W2, matrix[double] b2,
-            matrix[double] W3, matrix[double] b3,
-            matrix[double] W4, matrix[double] b4) {
-  /*
-   * Trains a convolutional net using the "LeNet" architecture.
-   *
-   * The input matrix, X, has N examples, each represented as a 3D
-   * volume unrolled into a single vector.  The targets, y, have K
-   * classes, and are one-hot encoded.
-   *
-   * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
-   *  - y: Target matrix, of shape (N, K).
-   *  - X_val: Input validation data matrix, of shape (N, C*Hin*Win).
-   *  - y_val: Target validation matrix, of shape (N, K).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - epochs: Total number of full training loops over the full data set.
-   *
-   * Outputs:
-   *  - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
-   *  - b1: 1st layer biases vector, of shape (F1, 1).
-   *  - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
-   *  - b2: 2nd layer biases vector, of shape (F2, 1).
-   *  - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
-   *  - b3: 3rd layer biases vector, of shape (1, N3).
-   *  - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
-   *  - b4: 4th layer biases vector, of shape (1, K).
-   */
-  N = nrow(X)
-  K = ncol(y)
-
-  # Create network:
-  # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
-  Hf = 5  # filter height
-  Wf = 5  # filter width
-  stride = 1
-  pad = 2  # For same dimensions, (Hf - stride) / 2
-
-  F1 = 32  # num conv filters in conv1
-  F2 = 64  # num conv filters in conv2
-  N3 = 512  # num nodes in affine3
-  # Note: affine4 has K nodes, which is equal to the number of target dimensions (num classes)
-
-  [W1, b1] = conv2d::init(F1, C, Hf, Wf)  # inputs: (N, C*Hin*Win)
-  [W2, b2] = conv2d::init(F2, F1, Hf, Wf)  # inputs: (N, F1*(Hin/2)*(Win/2))
-  [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3)  # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
-  [W4, b4] = affine::init(N3, K)  # inputs: (N, N3)
-  W4 = W4 / sqrt(2)  # different initialization, since being fed into softmax, instead of relu
-
-  # Initialize SGD w/ Nesterov momentum optimizer
-  lr = 0.01  # learning rate
-  mu = 0.9  #0.5  # momentum
-  decay = 0.95  # learning rate decay constant
-  vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
-  vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
-  vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
-  vW4 = sgd_nesterov::init(W4); vb4 = sgd_nesterov::init(b4)
-
-  # Regularization
-  lambda = 5e-04
-
-  # Optimize
-  print("Starting optimization")
-  batch_size = 64
-  iters = ceil(N / batch_size)
-  for (e in 1:epochs) {
-    for(i in 1:iters) {
-      # Get next batch
-      beg = ((i-1) * batch_size) %% N + 1
-      end = min(N, beg + batch_size - 1)
-      X_batch = X[beg:end,]
-      y_batch = y[beg:end,]
-
-      # Compute forward pass
-      ## layer 1: conv1 -> relu1 -> pool1
-      [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
-                                                pad, pad)
-      outr1 = relu::forward(outc1)
-      [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
-                                                    strideh=2, stridew=2, pad=0, pad=0)
-      ## layer 2: conv2 -> relu2 -> pool2
-      [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
-                                                stride, stride, pad, pad)
-      outr2 = relu::forward(outc2)
-      [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
-                                                    strideh=2, stridew=2, pad=0, pad=0)
-      ## layer 3:  affine3 -> relu3 -> dropout
-      outa3 = affine::forward(outp2, W3, b3)
-      outr3 = relu::forward(outa3)
-      [outd3, maskd3] = dropout::forward(outr3, 0.5, -1)
-      ## layer 4:  affine4 -> softmax
-      outa4 = affine::forward(outd3, W4, b4)
-      probs = softmax::forward(outa4)
-
-      # Compute loss & accuracy for training & validation data every 100 iterations.
-      if (i %% 100 == 0) {
-        # Compute training loss & accuracy
-        loss_data = cross_entropy_loss::forward(probs, y_batch)
-        loss_reg_W1 = l2_reg::forward(W1, lambda)
-        loss_reg_W2 = l2_reg::forward(W2, lambda)
-        loss_reg_W3 = l2_reg::forward(W3, lambda)
-        loss_reg_W4 = l2_reg::forward(W4, lambda)
-        loss = loss_data + loss_reg_W1 + loss_reg_W2 + loss_reg_W3 + loss_reg_W4
-        accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
-
-        # Compute validation loss & accuracy
-        probs_val = predict(X_val, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
-        loss_val = cross_entropy_loss::forward(probs_val, y_val)
-        accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
-
-        # Output results
-        print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
-              + accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
-      }
-
-      # Compute data backward pass
-      ## loss:
-      dprobs = cross_entropy_loss::backward(probs, y_batch)
-      ## layer 4:  affine4 -> softmax
-      douta4 = softmax::backward(dprobs, outa4)
-      [doutd3, dW4, db4] = affine::backward(douta4, outd3, W4, b4)
-      ## layer 3:  affine3 -> relu3 -> dropout
-      doutr3 = dropout::backward(doutd3, outr3, 0.5, maskd3)
-      douta3 = relu::backward(doutr3, outa3)
-      [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
-      ## layer 2: conv2 -> relu2 -> pool2
-      doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
-                                    strideh=2, stridew=2, pad=0, pad=0)
-      doutc2 = relu::backward(doutr2, outc2)
-      [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
-                                            Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
-      ## layer 1: conv1 -> relu1 -> pool1
-      doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
-                                    strideh=2, stridew=2, pad=0, pad=0)
-      doutc1 = relu::backward(doutr1, outc1)
-      [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win,
-                                              Hf, Wf, stride, stride, pad, pad)
-
-      # Compute regularization backward pass
-      dW1_reg = l2_reg::backward(W1, lambda)
-      dW2_reg = l2_reg::backward(W2, lambda)
-      dW3_reg = l2_reg::backward(W3, lambda)
-      dW4_reg = l2_reg::backward(W4, lambda)
-      dW1 = dW1 + dW1_reg
-      dW2 = dW2 + dW2_reg
-      dW3 = dW3 + dW3_reg
-      dW4 = dW4 + dW4_reg
-
-      # Optimize with SGD w/ Nesterov momentum
-      [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1)
-      [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1)
-      [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2)
-      [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2)
-      [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3)
-      [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3)
-      [W4, vW4] = sgd_nesterov::update(W4, dW4, lr, mu, vW4)
-      [b4, vb4] = sgd_nesterov::update(b4, db4, lr, mu, vb4)
-    }
-    # Anneal momentum towards 0.999
-    #mu = mu + (0.999 - mu)/(1+epochs-e)
-    # Decay learning rate
-    lr = lr * decay
-  }
-}
-
-predict = function(matrix[double] X, int C, int Hin, int Win,
-                   matrix[double] W1, matrix[double] b1,
-                   matrix[double] W2, matrix[double] b2,
-                   matrix[double] W3, matrix[double] b3,
-                   matrix[double] W4, matrix[double] b4)
-    return (matrix[double] probs) {
-  /*
-   * Computes the class probability predictions of a convolutional
-   * net using the "LeNet" architecture.
-   *
-   * The input matrix, X, has N examples, each represented as a 3D
-   * volume unrolled into a single vector.
-   *
-   * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
-   *  - b1: 1st layer biases vector, of shape (F1, 1).
-   *  - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
-   *  - b2: 2nd layer biases vector, of shape (F2, 1).
-   *  - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
-   *  - b3: 3rd layer biases vector, of shape (1, N3).
-   *  - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
-   *  - b4: 4th layer biases vector, of shape (1, K).
-   *
-   * Outputs:
-   *  - probs: Class probabilities, of shape (N, K).
-   */
-  N = nrow(X)
-
-  # Network:
-  # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
-  Hf = 5  # filter height
-  Wf = 5  # filter width
-  stride = 1
-  pad = 2  # For same dimensions, (Hf - stride) / 2
-
-  F1 = nrow(W1)  # num conv filters in conv1
-  F2 = nrow(W2)  # num conv filters in conv2
-  N3 = ncol(W3)  # num nodes in affine3
-  K = ncol(W4)  # num nodes in affine4, equal to number of target dimensions (num classes)
-
-  # Compute predictions over mini-batches
-  probs = matrix(0, rows=N, cols=K)
-  batch_size = 64
-  iters = ceil(N / batch_size)
-  for(i in 1:iters) {
-    # Get next batch
-    beg = ((i-1) * batch_size) %% N + 1
-    end = min(N, beg + batch_size - 1)
-    X_batch = X[beg:end,]
-
-    # Compute forward pass
-    ## layer 1: conv1 -> relu1 -> pool1
-    [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
-                                              pad, pad)
-    outr1 = relu::forward(outc1)
-    [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
-                                                  strideh=2, stridew=2, pad=0, pad=0)
-    ## layer 2: conv2 -> relu2 -> pool2
-    [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
-                                              stride, stride, pad, pad)
-    outr2 = relu::forward(outc2)
-    [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
-                                                  strideh=2, stridew=2, pad=0, pad=0)
-    ## layer 3:  affine3 -> relu3
-    outa3 = affine::forward(outp2, W3, b3)
-    outr3 = relu::forward(outa3)
-    ## layer 4:  affine4 -> softmax
-    outa4 = affine::forward(outr3, W4, b4)
-    probs_batch = softmax::forward(outa4)
-
-    # Store predictions
-    probs[beg:end,] = probs_batch
-  }
-}
-
-eval = function(matrix[double] probs, matrix[double] y)
-    return (double loss, double accuracy) {
-  /*
-   * Evaluates a convolutional net using the "LeNet" architecture.
-   *
-   * The probs matrix contains the class probability predictions
-   * of K classes over N examples.  The targets, y, have K classes,
-   * and are one-hot encoded.
-   *
-   * Inputs:
-   *  - probs: Class probabilities, of shape (N, K).
-   *  - y: Target matrix, of shape (N, K).
-   *
-   * Outputs:
-   *  - loss: Scalar loss, of shape (1).
-   *  - accuracy: Scalar accuracy, of shape (1).
-   */
-  # Compute loss & accuracy
-  loss = cross_entropy_loss::forward(probs, y)
-  correct_pred = rowIndexMax(probs) == rowIndexMax(y)
-  accuracy = mean(correct_pred)
-}
-
-generate_dummy_data = function()
-    return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
-  /*
-   * Generate a dummy dataset similar to the MNIST dataset.
-   *
-   * Outputs:
-   *  - X: Input data matrix, of shape (N, D).
-   *  - y: Target matrix, of shape (N, K).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   */
-  # Generate dummy input data
-  N = 1024  # num examples
-  C = 1  # num input channels
-  Hin = 28  # input height
-  Win = 28  # input width
-  K = 10  # num target classes
-  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
-  classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
-  y = table(seq(1, N), classes, N, K)  # one-hot encoding, shape (N, K)
-}
-
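The functions above can be exercised end to end without downloading MNIST by pairing them with `generate_dummy_data`. The following sketch assumes it is run from the folder containing `mnist_lenet.dml` (as the accompanying `-train`/`-predict` scripts do); the single training epoch and the reuse of dummy data for validation are illustrative choices:

```
source("mnist_lenet.dml") as mnist_lenet

# Generate small synthetic datasets in place of MNIST.
[X, y, C, Hin, Win] = mnist_lenet::generate_dummy_data()
[X_val, y_val, C, Hin, Win] = mnist_lenet::generate_dummy_data()

# Train for a single epoch, then score the training data itself.
[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, 1)
probs = mnist_lenet::predict(X, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
[loss, accuracy] = mnist_lenet::eval(probs, y)
print("Dummy-data accuracy: " + accuracy)
```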

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/mnist_softmax-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_softmax-predict.dml b/scripts/staging/SystemML-NN/examples/mnist_softmax-predict.dml
deleted file mode 100644
index 4101e27..0000000
--- a/scripts/staging/SystemML-NN/examples/mnist_softmax-predict.dml
+++ /dev/null
@@ -1,74 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# MNIST Softmax - Predict
-#
-# This script computes the class probability predictions of a
-# trained softmax classifier on images of handwritten digits.
-#
-# Inputs:
-#  - X: File containing training images.
-#     The format is "pixel_1, pixel_2, ..., pixel_n".
-#  - model_dir: Directory containing the trained weights and biases
-#     of the model.
-#  - out_dir: Directory to store class probability predictions for
-#     each image.
-#  - fmt: [DEFAULT: "csv"] File format of `X` and output predictions.
-#     Options include: "csv", "mm", "text", and "binary".
-#
-# Outputs:
-#  - probs: File containing class probability predictions for each
-#     image.
-#
-# Data:
-# The X file should contain images of handwritten digits,
-# where each example is a 28x28 pixel image of grayscale values in
-# the range [0,255] stretched out as 784 pixels.
-#
-# Sample Invocation:
-# Execute using Spark
-#   ```
-#   spark-submit --master local[*] --driver-memory 5G
-#   --conf spark.driver.maxResultSize=0 --conf spark.akka.frameSize=128
-#   $SYSTEMML_HOME/target/SystemML.jar -f mnist_softmax-predict.dml
-#   -nvargs X=data/mnist/images.csv model_dir=model/mnist_softmax
-#   out_dir=data/mnist
-#   ```
-#
-source("mnist_softmax.dml") as mnist_softmax
-
-# Read training data
-fmt = ifdef($fmt, "csv")
-X = read($X, format=fmt)
-
-# Scale images to [0,1]
-X = X / 255.0
-
-# Read model coefficients
-W = read($model_dir+"/W")
-b = read($model_dir+"/b")
-
-# Predict classes
-probs = mnist_softmax::predict(X, W, b)
-
-# Output results
-write(probs, $out_dir+"/probs."+fmt, format=fmt)
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/mnist_softmax-train.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_softmax-train.dml b/scripts/staging/SystemML-NN/examples/mnist_softmax-train.dml
deleted file mode 100644
index 2941dfa..0000000
--- a/scripts/staging/SystemML-NN/examples/mnist_softmax-train.dml
+++ /dev/null
@@ -1,108 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# MNIST Softmax - Train
-#
-# This script trains a softmax classifier on images of handwritten
-# digits.
-#
-# Inputs:
-#  - train: File containing labeled MNIST training images.
-#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
-#  - test: File containing labeled MNIST test images.
-#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
-#  - out_dir: Directory to store weights and bias matrices of
-#     trained model, as well as final test accuracy.
-#  - fmt: [DEFAULT: "csv"] File format of `train` and `test` data.
-#     Options include: "csv", "mm", "text", and "binary".
-#
-# Outputs:
-#  - W: File containing the trained weights of the model.
-#  - b: File containing the trained biases of the model.
-#  - accuracy: File containing the final accuracy on the test data.
-#
-# Data:
-# The MNIST dataset contains labeled images of handwritten digits,
-# where each example is a 28x28 pixel image of grayscale values in
-# the range [0,255] stretched out as 784 pixels, and each label is
-# one of 10 possible digits in [0,9].
-#
-# Sample Invocation (running from within the `examples` folder):
-# 1. Download data (60,000 training examples, and 10,000 test examples)
-#   ```
-#   get_mnist_data.sh
-#   ```
-#
-# 2. Execute using Spark
-#   ```
-#   spark-submit --master local[*] --driver-memory 5G
-#   --conf spark.driver.maxResultSize=0 --conf spark.akka.frameSize=128
-#   $SYSTEMML_HOME/target/SystemML.jar -f mnist_softmax-train.dml
-#   -nvargs train=data/mnist/mnist_train.csv test=data/mnist/mnist_test.csv
-#   out_dir=model/mnist_softmax
-#   ```
-#
-source("mnist_softmax.dml") as mnist_softmax
-
-# Read training data
-fmt = ifdef($fmt, "csv")
-train = read($train, format=fmt)
-test = read($test, format=fmt)
-
-# Extract images and labels
-images = train[,2:ncol(train)]
-labels = train[,1]
-X_test = test[,2:ncol(test)]
-y_test = test[,1]
-
-# Scale images to [0,1], and one-hot encode the labels
-n = nrow(train)
-n_test = nrow(test)
-classes = 10
-images = images / 255.0
-labels = table(seq(1, n), labels+1, n, classes)
-X_test = X_test / 255.0
-y_test = table(seq(1, n_test), y_test+1, n_test, classes)
-
-# Split into training (55,000 examples) and validation (5,000 examples)
-X = images[5001:nrow(images),]
-X_val = images[1:5000,]
-y = labels[5001:nrow(images),]
-y_val = labels[1:5000,]
-
-# Train
-[W, b] = mnist_softmax::train(X, y, X_val, y_val)
-
-# Write model out
-write(W, $out_dir+"/W")
-write(b, $out_dir+"/b")
-
-# Eval on test set
-probs = mnist_softmax::predict(X_test, W, b)
-[loss, accuracy] = mnist_softmax::eval(probs, y_test)
-
-# Output results
-print("Test Accuracy: " + accuracy)
-write(accuracy, $out_dir+"/accuracy")
-
-print("")
-print("")
-
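
The one-hot encoding used in the training scripts relies on DML's `table` built-in, which builds a contingency table from row indices and (1-based) class indices. A minimal sketch of the idiom, using an illustrative four-example label vector rather than real MNIST data:

```
# Hypothetical label vector with digit classes in [0, 9]
labels = matrix("3 0 9 3", rows=4, cols=1)
n = nrow(labels)
classes = 10

# table(row_idx, col_idx, out_rows, out_cols) places a 1 at (i, labels[i]+1),
# yielding an n x classes one-hot matrix
Y = table(seq(1, n), labels+1, n, classes)
print(toString(Y))
```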

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/mnist_softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_softmax.dml b/scripts/staging/SystemML-NN/examples/mnist_softmax.dml
deleted file mode 100644
index dc712f6..0000000
--- a/scripts/staging/SystemML-NN/examples/mnist_softmax.dml
+++ /dev/null
@@ -1,177 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * MNIST Softmax Example
- */
-# Imports
-source("nn/layers/affine.dml") as affine
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/softmax.dml") as softmax
-source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
-
-train = function(matrix[double] X, matrix[double] y,
-                 matrix[double] X_val, matrix[double] y_val)
-    return (matrix[double] W, matrix[double] b) {
-  /*
-   * Trains a softmax classifier.
-   *
-   * The input matrix, X, has N examples, each with D features.
-   * The targets, y, have K classes, and are one-hot encoded.
-   *
-   * Inputs:
-   *  - X: Input data matrix, of shape (N, D).
-   *  - y: Target matrix, of shape (N, K).
-   *  - X_val: Input validation data matrix, of shape (N, D).
-   *  - y_val: Target validation matrix, of shape (N, K).
-   *
-   * Outputs:
-   *  - W: Weights (parameters) matrix, of shape (D, M).
-   *  - b: Biases vector, of shape (1, M).
-   */
-  N = nrow(X)  # num examples
-  D = ncol(X)  # num features
-  K = ncol(y)  # num classes
-
-  # Create softmax classifier:
-  # affine -> softmax
-  [W, b] = affine::init(D, K)
-  W = W / sqrt(2.0/(D)) * sqrt(1/(D))
-
-  # Initialize SGD w/ Nesterov momentum optimizer
-  lr = 0.2  # learning rate
-  mu = 0  # momentum
-  decay = 0.99  # learning rate decay constant
-  vW = sgd_nesterov::init(W)  # optimizer momentum state for W
-  vb = sgd_nesterov::init(b)  # optimizer momentum state for b
-
-  # Optimize
-  print("Starting optimization")
-  batch_size = 50
-  epochs = 1
-  iters = 1000 #ceil(N / batch_size)
-  for (e in 1:epochs) {
-    for(i in 1:iters) {
-      # Get next batch
-      beg = ((i-1) * batch_size) %% N + 1
-      end = min(N, beg + batch_size - 1)
-      X_batch = X[beg:end,]
-      y_batch = y[beg:end,]
-
-      # Compute forward pass
-      ## affine & softmax:
-      out = affine::forward(X_batch, W, b)
-      probs = softmax::forward(out)
-
-      # Compute loss & accuracy for training & validation data
-      loss = cross_entropy_loss::forward(probs, y_batch)
-      accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
-      probs_val = predict(X_val, W, b)
-      loss_val = cross_entropy_loss::forward(probs_val, y_val)
-      accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
-      print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: " +
-            accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
-
-      # Compute backward pass
-      ## loss:
-      dprobs = cross_entropy_loss::backward(probs, y_batch)
-      ## affine & softmax:
-      dout = softmax::backward(dprobs, out)
-      [dX_batch, dW, db] = affine::backward(dout, X_batch, W, b)
-
-      # Optimize with SGD w/ Nesterov momentum
-      [W, vW] = sgd_nesterov::update(W, dW, lr, mu, vW)
-      [b, vb] = sgd_nesterov::update(b, db, lr, mu, vb)
-    }
-    # Anneal momentum towards 0.999
-    mu = mu + (0.999 - mu)/(1+epochs-e)
-    # Decay learning rate
-    lr = lr * decay
-  }
-}
-
-predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
-    return (matrix[double] probs) {
-  /*
-   * Computes the class probability predictions of a softmax classifier.
-   *
-   * The input matrix, X, has N examples, each with D features.
-   *
-   * Inputs:
-   *  - X: Input data matrix, of shape (N, D).
-   *  - W: Weights (parameters) matrix, of shape (D, M).
-   *  - b: Biases vector, of shape (1, M).
-   *
-   * Outputs:
-   *  - probs: Class probabilities, of shape (N, K).
-   */
-  # Compute forward pass
-  ## affine & softmax:
-  out = affine::forward(X, W, b)
-  probs = softmax::forward(out)
-}
-
-eval = function(matrix[double] probs, matrix[double] y)
-    return (double loss, double accuracy) {
-  /*
-   * Evaluates a softmax classifier.
-   *
-   * The probs matrix contains the class probability predictions
-   * of K classes over N examples.  The targets, y, have K classes,
-   * and are one-hot encoded.
-   *
-   * Inputs:
-   *  - probs: Class probabilities, of shape (N, K).
-   *  - y: Target matrix, of shape (N, K).
-   *
-   * Outputs:
-   *  - loss: Scalar loss, of shape (1).
-   *  - accuracy: Scalar accuracy, of shape (1).
-   */
-  # Compute loss & accuracy
-  loss = cross_entropy_loss::forward(probs, y)
-  correct_pred = rowIndexMax(probs) == rowIndexMax(y)
-  accuracy = mean(correct_pred)
-}
-
-generate_dummy_data = function()
-    return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
-  /*
-   * Generate a dummy dataset similar to the MNIST dataset.
-   *
-   * Outputs:
-   *  - X: Input data matrix, of shape (N, D).
-   *  - y: Target matrix, of shape (N, K).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   */
-  # Generate dummy input data
-  N = 1024  # num examples
-  C = 1  # num input channels
-  Hin = 28  # input height
-  Win = 28  # input width
-  T = 10  # num targets
-  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
-  classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))
-  y = table(seq(1, N), classes)  # one-hot encoding
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/nn
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/nn b/scripts/staging/SystemML-NN/examples/nn
deleted file mode 120000
index cfe2905..0000000
--- a/scripts/staging/SystemML-NN/examples/nn
+++ /dev/null
@@ -1 +0,0 @@
-../nn
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/Example - MNIST LeNet.ipynb
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/Example - MNIST LeNet.ipynb b/scripts/staging/SystemML-NN/nn/examples/Example - MNIST LeNet.ipynb
new file mode 100644
index 0000000..0423269
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/Example - MNIST LeNet.ipynb	
@@ -0,0 +1,189 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Quick Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a SystemML MLContext object\n",
+    "from systemml import MLContext, dml\n",
+    "ml = MLContext(sc)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Download Data - MNIST"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9].  Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%sh\n",
+    "mkdir -p data/mnist/\n",
+    "cd data/mnist/\n",
+    "curl -O https://pjreddie.com/media/files/mnist_train.csv\n",
+    "curl -O https://pjreddie.com/media/files/mnist_test.csv"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## SystemML \"LeNet\" Neural Network"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1. Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "script_string = \"\"\"\n",
+    "source(\"nn/examples/mnist_lenet.dml\") as mnist_lenet\n",
+    "\n",
+    "# Read training data\n",
+    "data = read($data, format=\"csv\")\n",
+    "n = nrow(data)\n",
+    "\n",
+    "# Extract images and labels\n",
+    "images = data[,2:ncol(data)]\n",
+    "labels = data[,1]\n",
+    "\n",
+    "# Scale images to [-1,1], and one-hot encode the labels\n",
+    "images = (images / 255.0) * 2 - 1\n",
+    "labels = table(seq(1, n), labels+1, n, 10)\n",
+    "\n",
+    "# Split into training (55,000 examples) and validation (5,000 examples)\n",
+    "X = images[5001:nrow(images),]\n",
+    "X_val = images[1:5000,]\n",
+    "y = labels[5001:nrow(images),]\n",
+    "y_val = labels[1:5000,]\n",
+    "\n",
+    "# Train\n",
+    "epochs = 10\n",
+    "[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, epochs)\n",
+    "\"\"\"\n",
+    "script = (dml(script_string).input(\"$data\", \"data/mnist/mnist_train.csv\")\n",
+    "                            .input(C=1, Hin=28, Win=28)\n",
+    "                            .output(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))\n",
+    "W1, b1, W2, b2, W3, b3, W4, b4 = (ml.execute(script)\n",
+    "                                    .get(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2. Compute Test Accuracy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "script_string = \"\"\"\n",
+    "source(\"nn/examples/mnist_lenet.dml\") as mnist_lenet\n",
+    "\n",
+    "# Read test data\n",
+    "data = read($data, format=\"csv\")\n",
+    "n = nrow(data)\n",
+    "\n",
+    "# Extract images and labels\n",
+    "X_test = data[,2:ncol(data)]\n",
+    "y_test = data[,1]\n",
+    "\n",
+    "# Scale images to [-1,1], and one-hot encode the labels\n",
+    "X_test = (X_test / 255.0) * 2 - 1\n",
+    "y_test = table(seq(1, n), y_test+1, n, 10)\n",
+    "\n",
+    "# Eval on test set\n",
+    "probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)\n",
+    "[loss, accuracy] = mnist_lenet::eval(probs, y_test)\n",
+    "\n",
+    "print(\"Test Accuracy: \" + accuracy)\n",
+    "\"\"\"\n",
+    "script = dml(script_string).input(**{\"$data\": \"data/mnist/mnist_train.csv\",\n",
+    "                                     \"C\": 1, \"Hin\": 28, \"Win\": 28,\n",
+    "                                     \"W1\": W1, \"b1\": b1,\n",
+    "                                     \"W2\": W2, \"b2\": b2,\n",
+    "                                     \"W3\": W3, \"b3\": b3,\n",
+    "                                     \"W4\": W4, \"b4\": b4})\n",
+    "ml.execute(script)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3. Extract Model Into Spark DataFrames For Future Use"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "W1_df = W1.toDF()\n",
+    "b1_df = b1.toDF()\n",
+    "W2_df = W2.toDF()\n",
+    "b2_df = b2.toDF()\n",
+    "W3_df = W3.toDF()\n",
+    "b3_df = b3.toDF()\n",
+    "W4_df = W4.toDF()\n",
+    "b4_df = b4.toDF()\n",
+    "W1_df, b1_df, W2_df, b2_df, W3_df, b3_df, W4_df, b4_df"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 + Spark 2.x + SystemML",
+   "language": "python",
+   "name": "pyspark3_2.x"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/Example - MNIST Softmax Classifier.ipynb
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/Example - MNIST Softmax Classifier.ipynb b/scripts/staging/SystemML-NN/nn/examples/Example - MNIST Softmax Classifier.ipynb
new file mode 100644
index 0000000..5e7182a
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/Example - MNIST Softmax Classifier.ipynb	
@@ -0,0 +1,179 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Quick Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "# Create a SystemML MLContext object\n",
+    "from systemml import MLContext, dml\n",
+    "ml = MLContext(sc)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Download Data - MNIST"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9].  Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "%%sh\n",
+    "mkdir -p data/mnist/\n",
+    "cd data/mnist/\n",
+    "curl -O https://pjreddie.com/media/files/mnist_train.csv\n",
+    "curl -O https://pjreddie.com/media/files/mnist_test.csv"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## SystemML Softmax Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1. Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training = \"\"\"\n",
+    "source(\"nn/examples/mnist_softmax.dml\") as mnist_softmax\n",
+    "\n",
+    "# Read training data\n",
+    "data = read($data, format=\"csv\")\n",
+    "n = nrow(data)\n",
+    "\n",
+    "# Extract images and labels\n",
+    "images = data[,2:ncol(data)]\n",
+    "labels = data[,1]\n",
+    "\n",
+    "# Scale images to [0,1], and one-hot encode the labels\n",
+    "images = images / 255.0\n",
+    "labels = table(seq(1, n), labels+1, n, 10)\n",
+    "\n",
+    "# Split into training (55,000 examples) and validation (5,000 examples)\n",
+    "X = images[5001:nrow(images),]\n",
+    "X_val = images[1:5000,]\n",
+    "y = labels[5001:nrow(images),]\n",
+    "y_val = labels[1:5000,]\n",
+    "\n",
+    "# Train\n",
+    "epochs = 1\n",
+    "[W, b] = mnist_softmax::train(X, y, X_val, y_val, epochs)\n",
+    "\"\"\"\n",
+    "script = dml(training).input(\"$data\", \"data/mnist/mnist_train.csv\").output(\"W\", \"b\")\n",
+    "W, b = ml.execute(script).get(\"W\", \"b\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2. Compute Test Accuracy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "testing = \"\"\"\n",
+    "source(\"nn/examples/mnist_softmax.dml\") as mnist_softmax\n",
+    "\n",
+    "# Read test data\n",
+    "data = read($data, format=\"csv\")\n",
+    "n = nrow(data)\n",
+    "\n",
+    "# Extract images and labels\n",
+    "X_test = data[,2:ncol(data)]\n",
+    "y_test = data[,1]\n",
+    "\n",
+    "# Scale images to [0,1], and one-hot encode the labels\n",
+    "X_test = X_test / 255.0\n",
+    "y_test = table(seq(1, n), y_test+1, n, 10)\n",
+    "\n",
+    "# Eval on test set\n",
+    "probs = mnist_softmax::predict(X_test, W, b)\n",
+    "[loss, accuracy] = mnist_softmax::eval(probs, y_test)\n",
+    "\n",
+    "print(\"Test Accuracy: \" + accuracy)\n",
+    "\"\"\"\n",
+    "script = dml(testing).input(\"$data\", \"data/mnist/mnist_test.csv\", W=W, b=b)\n",
+    "ml.execute(script)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3. Extract Model Into Spark DataFrames For Future Use"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "W_df = W.toDF()\n",
+    "b_df = b.toDF()\n",
+    "W_df, b_df"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/README.md
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/README.md b/scripts/staging/SystemML-NN/nn/examples/README.md
new file mode 100644
index 0000000..d5e9d04
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/README.md
@@ -0,0 +1,74 @@
+<!--
+{% comment %}
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+
+# SystemML-NN Examples
+
+#### This folder contains scripts and PySpark Jupyter notebooks serving as examples of using the *SystemML-NN* (`nn`) deep learning library.
+
+---
+
+# Examples
+### MNIST Softmax Classifier
+
+* This example trains a softmax classifier, which is essentially a multi-class logistic regression model, on the MNIST data.  The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images.
+* Notebook: `Example - MNIST Softmax Classifier.ipynb`.
+* DML Functions: `mnist_softmax.dml`
+* Training script: `mnist_softmax-train.dml`
+* Prediction script: `mnist_softmax-predict.dml`
+
+### MNIST "LeNet" Neural Net
+
+* This example trains a neural network on the MNIST data using a ["LeNet" architecture](http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf). The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images.
+* Notebook: `Example - MNIST LeNet.ipynb`.
+* DML Functions: `mnist_lenet.dml`
+* Training script: `mnist_lenet-train.dml`
+* Prediction script: `mnist_lenet-predict.dml`
+
+---
+
+# Setup
+## Code
+* To run the examples, please first download and unzip the project via GitHub using the "Clone or download" button on the [homepage of the project](https://github.com/dusenberrymw/systemml-nn), *or* via the following commands:
+
+  ```
+  git clone https://github.com/dusenberrymw/systemml-nn.git
+  ```
+
+* Then, move into the `systemml-nn` folder via:
+  ```
+  cd systemml-nn
+  ```
+
+## Data
+* These examples use the classic [MNIST](http://yann.lecun.com/exdb/mnist/) dataset, which contains labeled 28x28 pixel images of handwritten digits in the range of 0-9.  There are 60,000 training images, and 10,000 testing images.  Of the 60,000 training images, 5,000 will be used as validation images.
+* **Download**:
+  * **Notebooks**: The data will be automatically downloaded as a step in either of the example notebooks.
+  * **Training scripts**: Please run `get_mnist_data.sh` to download the data separately.
+
+## Execution
+* These examples contain scripts written in SystemML's R-like language (`*.dml`), as well as PySpark Jupyter notebooks (`*.ipynb`).  The scripts contain the math for the algorithms, enclosed in functions, and the notebooks serve as full, end-to-end examples of reading in data, training models using the functions within the scripts, and evaluating final performance.
+* **Notebooks**: To run the notebook examples, please install the SystemML Python package with `pip install systemml`, and then start up Jupyter in the following manner from this directory (or for more information, please see [this great blog post](http://spark.tc/0-to-life-changing-application-with-apache-systemml/)):
+
+  ```
+  PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook" pyspark --master local[*] --driver-memory 3G --driver-class-path SystemML.jar --jars SystemML.jar
+  ```
+
+  Note that all printed output, such as training statistics, from the SystemML scripts will be sent to the terminal in which Jupyter was started (for now...).
+
+* **Scripts**: To run the scripts from the command line using `spark-submit`, please see the comments located at the top of the `-train` and `-predict` scripts.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/get_mnist_data.sh
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/get_mnist_data.sh b/scripts/staging/SystemML-NN/nn/examples/get_mnist_data.sh
new file mode 100755
index 0000000..deb0c40
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/get_mnist_data.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+DIR="$(cd "$(dirname "$0")" && pwd)"
+mkdir -p $DIR/data/mnist/
+cd $DIR/data/mnist/
+curl -O https://pjreddie.com/media/files/mnist_train.csv
+curl -O https://pjreddie.com/media/files/mnist_test.csv
+
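
After the script finishes, the download can be sanity-checked from DML. This is an illustrative sketch, assuming the default `nn/examples/data/mnist/` location used above:

```
# Read the training CSV and confirm its dimensions:
# 60,000 rows, 785 columns (1 label + 28*28 = 784 pixels)
train = read("nn/examples/data/mnist/mnist_train.csv", format="csv")
print("rows: " + nrow(train))
print("cols: " + ncol(train))
```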

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-predict.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-predict.dml
new file mode 100644
index 0000000..85a5307
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-predict.dml
@@ -0,0 +1,91 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# MNIST LeNet - Predict
+#
+# This script computes the class probability predictions of a
+# trained convolutional net using the "LeNet" architecture on
+# images of handwritten digits.
+#
+# Inputs:
+#  - X: File containing input images.
+#     The format is "pixel_1, pixel_2, ..., pixel_n".
+#  - C: Number of color channels in the images.
+#  - Hin: Input image height.
+#  - Win: Input image width.
+#  - model_dir: Directory containing the trained weights and biases
+#     of the model.
+#  - out_dir: Directory to store class probability predictions for
+#     each image.
+#  - fmt: [DEFAULT: "csv"] File format of `X` and output predictions.
+#     Options include: "csv", "mm", "text", and "binary".
+#
+# Outputs:
+#  - probs: File containing class probability predictions for each
+#     image.
+#
+# Data:
+# The X file should contain images of handwritten digits,
+# where each example is a 28x28 pixel image of grayscale values in
+# the range [0,255] stretched out as 784 pixels.
+#
+# Sample Invocation (running from outside the `nn` folder):
+# 1. Download images.
+#
+#   For example, save images to `nn/examples/data/mnist/images.csv`.
+#
+# 2. Execute using Spark
+#   ```
+#   spark-submit --master local[*] --driver-memory 5G
+#   --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
+#   $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_lenet-predict.dml
+#   -nvargs X=nn/examples/data/mnist/images.csv C=1 Hin=28 Win=28
+#   model_dir=nn/examples/model/mnist_lenet out_dir=nn/examples/data/mnist
+#   ```
+#
+source("nn/examples/mnist_lenet.dml") as mnist_lenet
+
+# Read input data
+fmt = ifdef($fmt, "csv")
+X = read($X, format=fmt)
+C = $C
+Hin = $Hin
+Win = $Win
+
+# Scale images to [-1,1]
+X = (X / 255.0) * 2 - 1
+
+# Read model coefficients
+W1 = read($model_dir+"/W1")
+b1 = read($model_dir+"/b1")
+W2 = read($model_dir+"/W2")
+b2 = read($model_dir+"/b2")
+W3 = read($model_dir+"/W3")
+b3 = read($model_dir+"/b3")
+W4 = read($model_dir+"/W4")
+b4 = read($model_dir+"/b4")
+
+# Predict classes
+probs = mnist_lenet::predict(X, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
+
+# Output results
+write(probs, $out_dir+"/probs."+fmt, format=fmt)
+
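
The predict script above writes raw class probabilities. If hard label predictions are wanted as well, a short follow-up sketch (not part of this commit) can map each row's arg-max back to a digit in [0, 9], reusing the `probs`, `$out_dir`, and `fmt` values from the script:

```
# probs has one row per image and 10 columns (classes 0-9);
# rowIndexMax returns the 1-based column of the maximum, so subtract 1
pred_labels = rowIndexMax(probs) - 1
write(pred_labels, $out_dir+"/predictions."+fmt, format=fmt)
```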

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-train.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-train.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-train.dml
new file mode 100644
index 0000000..0fc733e
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-train.dml
@@ -0,0 +1,123 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# MNIST LeNet - Train
+#
+# This script trains a convolutional net using the "LeNet" architecture
+# on images of handwritten digits.
+#
+# Inputs:
+#  - train: File containing labeled MNIST training images.
+#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
+#  - test: File containing labeled MNIST test images.
+#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
+#  - C: Number of color channels in the images.
+#  - Hin: Input image height.
+#  - Win: Input image width.
+#  - epochs: [DEFAULT: 10] Total number of full training loops over
+#     the full data set.
+#  - out_dir: [DEFAULT: "."] Directory to store weights and bias
+#     matrices of trained model, as well as final test accuracy.
+#  - fmt: [DEFAULT: "csv"] File format of `train` and `test` data.
+#     Options include: "csv", "mm", "text", and "binary".
+#
+# Outputs:
+#  - W1, W2, W3, W4: Files containing the trained weights of the model.
+#  - b1, b2, b3, b4: Files containing the trained biases of the model.
+#  - accuracy: File containing the final accuracy on the test data.
+#
+# Data:
+# The MNIST dataset contains labeled images of handwritten digits,
+# where each example is a 28x28 pixel image of grayscale values in
+# the range [0,255] stretched out as 784 pixels, and each label is
+# one of 10 possible digits in [0,9].
+#
+# Sample Invocation (running from outside the `nn` folder):
+# 1. Download data (60,000 training examples, and 10,000 test examples)
+#   ```
+#   nn/examples/get_mnist_data.sh
+#   ```
+#
+# 2. Execute using Spark
+#   ```
+#   spark-submit --master local[*] --driver-memory 10G
+#   --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
+#   $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_lenet-train.dml
+#   -nvargs train=nn/examples/data/mnist/mnist_train.csv test=nn/examples/data/mnist/mnist_test.csv
+#   C=1 Hin=28 Win=28 epochs=10 out_dir=nn/examples/model/mnist_lenet
+#   ```
+#
+source("nn/examples/mnist_lenet.dml") as mnist_lenet
+
+# Read training data & settings
+fmt = ifdef($fmt, "csv")
+train = read($train, format=fmt)
+test = read($test, format=fmt)
+C = $C
+Hin = $Hin
+Win = $Win
+epochs = ifdef($epochs, 10)
+out_dir = ifdef($out_dir, ".")
+
+# Extract images and labels
+images = train[,2:ncol(train)]
+labels = train[,1]
+X_test = test[,2:ncol(test)]
+y_test = test[,1]
+
+# Scale images to [-1,1], and one-hot encode the labels
+n = nrow(train)
+n_test = nrow(test)
+images = (images / 255.0) * 2 - 1
+labels = table(seq(1, n), labels+1, n, 10)
+X_test = (X_test / 255.0) * 2 - 1
+y_test = table(seq(1, n_test), y_test+1, n_test, 10)
+
+# Split into training (55,000 examples) and validation (5,000 examples)
+X = images[5001:nrow(images),]
+X_val = images[1:5000,]
+y = labels[5001:nrow(images),]
+y_val = labels[1:5000,]
+
+# Train
+[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, epochs)
+
+# Write model out
+write(W1, out_dir+"/W1")
+write(b1, out_dir+"/b1")
+write(W2, out_dir+"/W2")
+write(b2, out_dir+"/b2")
+write(W3, out_dir+"/W3")
+write(b3, out_dir+"/b3")
+write(W4, out_dir+"/W4")
+write(b4, out_dir+"/b4")
+
+# Eval on test set
+probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
+[loss, accuracy] = mnist_lenet::eval(probs, y_test)
+
+# Output results
+print("Test Accuracy: " + accuracy)
+write(accuracy, out_dir+"/accuracy")
+
+print("")
+print("")
+


[06/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn` library to `scripts/nn`

Posted by du...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
deleted file mode 100644
index a529a12..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
+++ /dev/null
@@ -1,178 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * MNIST Softmax Example
- */
-# Imports
-source("nn/layers/affine.dml") as affine
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/softmax.dml") as softmax
-source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
-
-train = function(matrix[double] X, matrix[double] y,
-                 matrix[double] X_val, matrix[double] y_val,
-                 int epochs)
-    return (matrix[double] W, matrix[double] b) {
-  /*
-   * Trains a softmax classifier.
-   *
-   * The input matrix, X, has N examples, each with D features.
-   * The targets, y, have K classes, and are one-hot encoded.
-   *
-   * Inputs:
-   *  - X: Input data matrix, of shape (N, D).
-   *  - y: Target matrix, of shape (N, K).
-   *  - X_val: Input validation data matrix, of shape (N, D).
-   *  - y_val: Target validation matrix, of shape (N, K).
-   *  - epochs: Total number of full training loops over the full data set.
-   *
-   * Outputs:
-   *  - W: Weights (parameters) matrix, of shape (D, M).
-   *  - b: Biases vector, of shape (1, M).
-   */
-  N = nrow(X)  # num examples
-  D = ncol(X)  # num features
-  K = ncol(y)  # num classes
-
-  # Create softmax classifier:
-  # affine -> softmax
-  [W, b] = affine::init(D, K)
-  W = W / sqrt(2.0/(D)) * sqrt(1/(D))
-
-  # Initialize SGD w/ Nesterov momentum optimizer
-  lr = 0.2  # learning rate
-  mu = 0  # momentum
-  decay = 0.99  # learning rate decay constant
-  vW = sgd_nesterov::init(W)  # optimizer momentum state for W
-  vb = sgd_nesterov::init(b)  # optimizer momentum state for b
-
-  # Optimize
-  print("Starting optimization")
-  batch_size = 50
-  iters = 1000 #ceil(N / batch_size)
-  for (e in 1:epochs) {
-    for(i in 1:iters) {
-      # Get next batch
-      beg = ((i-1) * batch_size) %% N + 1
-      end = min(N, beg + batch_size - 1)
-      X_batch = X[beg:end,]
-      y_batch = y[beg:end,]
-
-      # Compute forward pass
-      ## affine & softmax:
-      out = affine::forward(X_batch, W, b)
-      probs = softmax::forward(out)
-
-      # Compute loss & accuracy for training & validation data
-      loss = cross_entropy_loss::forward(probs, y_batch)
-      accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
-      probs_val = predict(X_val, W, b)
-      loss_val = cross_entropy_loss::forward(probs_val, y_val)
-      accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
-      print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: " +
-            accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
-
-      # Compute backward pass
-      ## loss:
-      dprobs = cross_entropy_loss::backward(probs, y_batch)
-      ## affine & softmax:
-      dout = softmax::backward(dprobs, out)
-      [dX_batch, dW, db] = affine::backward(dout, X_batch, W, b)
-
-      # Optimize with SGD w/ Nesterov momentum
-      [W, vW] = sgd_nesterov::update(W, dW, lr, mu, vW)
-      [b, vb] = sgd_nesterov::update(b, db, lr, mu, vb)
-    }
-    # Anneal momentum towards 0.999
-    mu = mu + (0.999 - mu)/(1+epochs-e)
-    # Decay learning rate
-    lr = lr * decay
-  }
-}
-
-predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
-    return (matrix[double] probs) {
-  /*
-   * Computes the class probability predictions of a softmax classifier.
-   *
-   * The input matrix, X, has N examples, each with D features.
-   *
-   * Inputs:
-   *  - X: Input data matrix, of shape (N, D).
-   *  - W: Weights (parameters) matrix, of shape (D, M).
-   *  - b: Biases vector, of shape (1, M).
-   *
-   * Outputs:
-   *  - probs: Class probabilities, of shape (N, K).
-   */
-  # Compute forward pass
-  ## affine & softmax:
-  out = affine::forward(X, W, b)
-  probs = softmax::forward(out)
-}
-
-eval = function(matrix[double] probs, matrix[double] y)
-    return (double loss, double accuracy) {
-  /*
-   * Evaluates a softmax classifier.
-   *
-   * The probs matrix contains the class probability predictions
-   * of K classes over N examples.  The targets, y, have K classes,
-   * and are one-hot encoded.
-   *
-   * Inputs:
-   *  - probs: Class probabilities, of shape (N, K).
-   *  - y: Target matrix, of shape (N, K).
-   *
-   * Outputs:
-   *  - loss: Scalar loss, of shape (1).
-   *  - accuracy: Scalar accuracy, of shape (1).
-   */
-  # Compute loss & accuracy
-  loss = cross_entropy_loss::forward(probs, y)
-  correct_pred = rowIndexMax(probs) == rowIndexMax(y)
-  accuracy = mean(correct_pred)
-}
-
-generate_dummy_data = function()
-    return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
-  /*
-   * Generate a dummy dataset similar to the MNIST dataset.
-   *
-   * Outputs:
-   *  - X: Input data matrix, of shape (N, D).
-   *  - y: Target matrix, of shape (N, K).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   */
-  # Generate dummy input data
-  N = 1024  # num examples
-  C = 1  # num input channels
-  Hin = 28  # input height
-  Win = 28  # input width
-  T = 10  # num targets
-  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
-  classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))
-  y = table(seq(1, N), classes)  # one-hot encoding
-}
-
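
For orientation, a minimal sketch of how this example's `generate_dummy_data`, `train`, `predict`, and `eval` functions compose end-to-end; the source path assumes execution from a directory containing `nn/` and is illustrative only:

```
source("nn/examples/mnist_softmax.dml") as mnist_softmax

# Generate a small MNIST-like dummy dataset (1024 examples)
[X, y, C, Hin, Win] = mnist_softmax::generate_dummy_data()
n = nrow(X)

# Hold out the first 128 examples for validation
X_val = X[1:128,]
y_val = y[1:128,]
X_train = X[129:n,]
y_train = y[129:n,]

# Train for one epoch, then predict and evaluate on the training split
epochs = 1
[W, b] = mnist_softmax::train(X_train, y_train, X_val, y_val, epochs)
probs = mnist_softmax::predict(X_train, W, b)
[loss, accuracy] = mnist_softmax::eval(probs, y_train)
print("Dummy-data train accuracy: " + accuracy)
```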

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/affine.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/affine.dml b/scripts/staging/SystemML-NN/nn/layers/affine.dml
deleted file mode 100644
index c9a740b..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/affine.dml
+++ /dev/null
@@ -1,92 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Affine (fully-connected) layer.
- */
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b)
-    return (matrix[double] out) {
-  /*
-   * Computes the forward pass for an affine (fully-connected) layer
-   * with M neurons.  The input data has N examples, each with D
-   * features.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, D).
-   *  - W: Weights, of shape (D, M).
-   *  - b: Biases, of shape (1, M).
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, M).
-   */
-  out = X %*% W + b
-}
-
-backward = function(matrix[double] dout, matrix[double] X,
-                    matrix[double] W, matrix[double] b)
-    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
-  /*
-   * Computes the backward pass for a fully-connected (affine) layer
-   * with M neurons.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of shape (N, M).
-   *  - X: Inputs, of shape (N, D).
-   *  - W: Weights, of shape (D, M).
-   *  - b: Biases, of shape (1, M).
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, D).
-   *  - dW: Gradient wrt `W`, of shape (D, M).
-   *  - db: Gradient wrt `b`, of shape (1, M).
-   */
-  dX = dout %*% t(W)
-  dW = t(X) %*% dout
-  db = colSums(dout)
-}
-
-init = function(int D, int M)
-    return (matrix[double] W, matrix[double] b) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * We use the heuristic by He et al., which limits the magnification
-   * of inputs/gradients during forward/backward passes by scaling
-   * unit-Gaussian weights by a factor of sqrt(2/n), under the
-   * assumption of relu neurons.
-   *  - http://arxiv.org/abs/1502.01852
-   *
-   * Inputs:
-   *  - D: Dimensionality of the input features (number of features).
-   *  - M: Number of neurons in this layer.
-   *
-   * Outputs:
-   *  - W: Weights, of shape (D, M).
-   *  - b: Biases, of shape (1, M).
-   */
-  W = rand(rows=D, cols=M, pdf="normal") * sqrt(2.0/D)
-  b = matrix(0, rows=1, cols=M)
-}
-
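
A minimal usage sketch for the layer above, with illustrative sizes (the source path assumes the `nn/layers/` layout):

```
source("nn/layers/affine.dml") as affine

# Toy batch: 4 examples with 8 features, feeding a layer of 3 neurons
N = 4
D = 8
M = 3
X = rand(rows=N, cols=D, pdf="normal")

# Initialize, run the forward pass, then backprop a dummy upstream gradient
[W, b] = affine::init(D, M)  # W: (D, M), b: (1, M)
out = affine::forward(X, W, b)  # out: (N, M)
dout = rand(rows=N, cols=M, pdf="normal")
[dX, dW, db] = affine::backward(dout, X, W, b)  # dX: (N, D), dW: (D, M), db: (1, M)
```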

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml
deleted file mode 100644
index 2ccffdb..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml
+++ /dev/null
@@ -1,210 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 1D Batch Normalization layer.
- */
-
-forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
-                   string mode, matrix[double] ema_mean, matrix[double] ema_var,
-                   double mu, double epsilon)
-    return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
-            matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
-  /*
-   * Computes the forward pass for a 1D batch normalization layer.
-   * The input data has N examples, each with D features.
-   *
-   * A batch normalization layer uses the per-feature sample mean and
-   * per-feature uncorrected sample variance during training to
-   * normalize each feature of the input data.  Additionally, it
-   * introduces learnable parameters (gamma, beta) to control the
-   * amount of normalization.
-   *
-   *   `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
-   *
-   * This implementation maintains exponential moving averages of the
-   * mean and variance during training for use during testing.
-   *
-   * Reference:
-   *  - Batch Normalization: Accelerating Deep Network Training by
-   *    Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
-   *    - https://arxiv.org/abs/1502.03167
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, D).
-   *  - gamma: Scale parameters, of shape (1, D).
-   *  - beta: Shift parameters, of shape (1, D).
-   *  - mode: 'train' or 'test' to indicate if the model is currently
-   *      being trained or tested.  During training, the current batch
-   *      mean and variance will be used to normalize the inputs, while
-   *      during testing, the exponential average of the mean and
-   *      variance over all previous batches will be used.
-   *  - ema_mean: Exponential moving average of the mean, of
-   *      shape (1, D).
-   *  - ema_var: Exponential moving average of the variance, of
-   *      shape (1, D).
-   *  - mu: Momentum value for moving averages.
-   *      Typical values are in the range of [0.9, 0.999].
-   *  - epsilon: Smoothing term to avoid divide by zero errors.
-   *      Typical values are in the range of [1e-5, 1e-3].
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, D).
-   *  - ema_mean_upd: Updated exponential moving average of the mean,
-   *      of shape (1, D).
-   *  - ema_var_upd: Updated exponential moving average of the variance,
-   *      of shape (1, D).
-   *  - cache_mean: Cache of the batch mean, of shape (1, D).
-   *      Note: This is used for performance during training.
-   *  - cache_var: Cache of the batch variance, of shape (1, D).
-   *      Note: This is used for performance during training.
-   *  - cache_norm: Cache of the normalized inputs, of shape (N, D).
-   *      Note: This is used for performance during training.
-   */
-  N = nrow(X)
-
-  if (mode == 'train') {
-    # Compute feature-wise mean and variance
-    mean = colMeans(X)  # shape (1, D)
-    # var = (1/N) * colSums((X-mean)^2)
-    var = colVars(X) * ((N-1)/N)  # compute uncorrected variance, of shape (1, D)
-    # Update moving averages
-    ema_mean_upd = mu*ema_mean + (1-mu)*mean
-    ema_var_upd = mu*ema_var + (1-mu)*var
-  }
-  else {
-    # Use moving averages of mean and variance during testing
-    mean = ema_mean
-    var = ema_var
-    ema_mean_upd = ema_mean
-    ema_var_upd = ema_var
-  }
-
-  # Normalize, shift, and scale
-  # norm = (X-mean)*(var+epsilon)^(-1/2)
-  norm = (X-mean) / sqrt(var+epsilon)  # shape (N, D)
-  out = norm*gamma + beta  # shape (N, D)
-
-  # Save variable for backward pass
-  cache_mean = mean
-  cache_var = var
-  cache_norm = norm
-}
-
-backward = function(matrix[double] dout, matrix[double] out,
-                    matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
-                    matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
-                    matrix[double] X, matrix[double] gamma, matrix[double] beta,
-                    string mode, matrix[double] ema_mean, matrix[double] ema_var,
-                    double mu, double epsilon)
-      return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
-  /*
-   * Computes the backward pass for a 1D batch normalization layer.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of shape (N, D).
-   *  - out: Outputs from the forward pass, of shape (N, D).
-   *  - ema_mean_upd: Updated exponential moving average of the mean
-   *      from the forward pass, of shape (1, D).
-   *  - ema_var_upd: Updated exponential moving average of the variance
-   *      from the forward pass, of shape (1, D).
-   *  - cache_mean: Cache of the batch mean from the forward pass, of
-   *      shape (1, D).  Note: This is used for performance during
-   *      training.
-   *  - cache_var: Cache of the batch variance from the forward pass,
-   *      of shape (1, D).  Note: This is used for performance during
-   *      training.
-   *  - cache_norm: Cache of the normalized inputs from the forward
-   *      pass, of shape (N, D).  Note: This is used for performance
-   *      during training.
-   *  - X: Inputs, of shape (N, D).
-   *  - gamma: Scale parameters, of shape (1, D).
-   *  - beta: Shift parameters, of shape (1, D).
-   *  - mode: 'train' or 'test' to indicate if the model is currently
-   *      being trained or tested.  During training, the current batch
-   *      mean and variance will be used to normalize the inputs, while
-   *      during testing, the exponential average of the mean and
-   *      variance over all previous batches will be used.
-   *  - ema_mean: Exponential moving average of the mean, of
-   *      shape (1, D).
-   *  - ema_var: Exponential moving average of the variance, of
-   *      shape (1, D).
-   *  - mu: Momentum value for moving averages.
-   *      Typical values are in the range of [0.9, 0.999].
-   *  - epsilon: Smoothing term to avoid divide by zero errors.
-   *      Typical values are in the range of [1e-5, 1e-3].
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, D).
-   *  - dgamma: Gradient wrt `gamma`, of shape (1, D).
-   *  - dbeta: Gradient wrt `beta`, of shape (1, D).
-   *
-   */
-  N = nrow(X)
-  mean = cache_mean
-  var = cache_var
-  norm = cache_norm
-  centered = X-mean
-
-  if (mode == 'train') {
-    # Compute gradients during training
-    dgamma = colSums(dout*norm)  # shape (1, D)
-    dbeta = colSums(dout)  # shape (1, D)
-    dnorm = dout * gamma  # shape (N, D)
-    dvar = (-1/2) * colSums(centered * (var+epsilon)^(-3/2) * dnorm)  # shape (1, D)
-    dmean = colSums((-dnorm/sqrt(var+epsilon)) + ((-2/N)*centered*dvar))  # shape (1, D)
-    dX = (dnorm/sqrt(var+epsilon)) + ((2/N)*centered*dvar) + ((1/N)*dmean)  # shape (N, D)
-  }
-  else {
-    # Compute gradients during testing
-    dgamma = colSums(dout*norm)  # shape (1, D)
-    dbeta = colSums(dout)  # shape (1, D)
-    dnorm = dout * gamma  # shape (N, D)
-    dX = dnorm / sqrt(var+epsilon)  # shape (N, D)
-  }
-}
-
-init = function(int D)
-    return (matrix[double] gamma, matrix[double] beta,
-            matrix[double] ema_mean, matrix[double] ema_var) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - D: Dimensionality of the input features (number of features).
-   *
-   * Outputs:
-   *  - gamma: Scale parameters, of shape (1, D).
-   *  - beta: Shift parameters, of shape (1, D).
-   *  - ema_mean: Exponential moving average of the mean, of
-   *      shape (1, D).
-   *  - ema_var: Exponential moving average of the variance, of
-   *      shape (1, D).
-   */
-   gamma = matrix(1, rows=1, cols=D)
-   beta = matrix(0, rows=1, cols=D)
-   ema_mean = matrix(0, rows=1, cols=D)
-   ema_var = matrix(1, rows=1, cols=D)
-}
-
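
A minimal sketch of driving this layer in 'train' versus 'test' mode, with illustrative sizes and with `mu` and `epsilon` chosen from the typical ranges noted in the docstring:

```
source("nn/layers/batch_norm1d.dml") as batch_norm1d

# Toy batch: 16 examples with 8 features
N = 16
D = 8
X = rand(rows=N, cols=D, pdf="normal")

# Initialize learnable parameters and the exponential moving averages
[gamma, beta, ema_mean, ema_var] = batch_norm1d::init(D)
mu = 0.9  # momentum for the moving averages
epsilon = 1e-5  # numerical-stability term

# Training step: batch statistics are used and the moving averages are updated
[out, ema_mean, ema_var, cache_mean, cache_var, cache_norm] = batch_norm1d::forward(X, gamma, beta, 'train', ema_mean, ema_var, mu, epsilon)

# Test-time step: the accumulated moving averages are used instead
[out_test, ema_mean, ema_var, cache_mean, cache_var, cache_norm] = batch_norm1d::forward(X, gamma, beta, 'test', ema_mean, ema_var, mu, epsilon)
```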

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml
deleted file mode 100644
index 49c6746..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml
+++ /dev/null
@@ -1,238 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D (Spatial) Batch Normalization layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
-                   int C, int Hin, int Win, string mode,
-                   matrix[double] ema_mean, matrix[double] ema_var,
-                   double mu, double epsilon)
-    return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
-            matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
-  /*
-   * Computes the forward pass for a 2D (spatial) batch normalization
-   * layer.  The input data has N examples, each represented as a 3D
-   * volume unrolled into a single vector.
-   *
-   * A spatial batch normalization layer uses the per-channel sample
-   * mean and per-channel uncorrected sample variance during training
-   * to normalize each channel of the input data.  Additionally, it
-   * introduces learnable parameters (gamma, beta) to control the
-   * amount of normalization.
-   *
-   *   `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
-   *
-   * This implementation maintains exponential moving averages of the
-   * mean and variance during training for use during testing.
-   *
-   * Reference:
-   *  - Batch Normalization: Accelerating Deep Network Training by
-   *    Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
-   *    - https://arxiv.org/abs/1502.03167
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - gamma: Scale parameters, of shape (C, 1).
-   *  - beta: Shift parameters, of shape (C, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - mode: 'train' or 'test' to indicate if the model is currently
-   *      being trained or tested.  During training, the current batch
-   *      mean and variance will be used to normalize the inputs, while
-   *      during testing, the exponential average of the mean and
-   *      variance over all previous batches will be used.
-   *  - ema_mean: Exponential moving average of the mean, of
-   *      shape (C, 1).
-   *  - ema_var: Exponential moving average of the variance, of
-   *      shape (C, 1).
-   *  - mu: Momentum value for moving averages.
-   *      Typical values are in the range of [0.9, 0.999].
-   *  - epsilon: Smoothing term to avoid divide by zero errors.
-   *      Typical values are in the range of [1e-5, 1e-3].
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, C*Hin*Win).
-   *  - ema_mean_upd: Updated exponential moving average of the mean,
-   *      of shape (C, 1).
-   *  - ema_var_upd: Updated exponential moving average of the variance,
-   *      of shape (C, 1).
-   *  - cache_mean: Cache of the batch mean, of shape (C, 1).
-   *      Note: This is used for performance during training.
-   *  - cache_var: Cache of the batch variance, of shape (C, 1).
-   *      Note: This is used for performance during training.
-   *  - cache_norm: Cache of the normalized inputs, of
-   *      shape (N, C*Hin*Win). Note: This is used for performance
-   *      during training.
-   */
-  N = nrow(X)
-
-  if (mode == 'train') {
-    # Compute channel-wise mean and variance
-    # Since we don't have tensors, we will compute the means and variances in a piece-wise fashion.
-    #  - mean of total group is mean of subgroup means
-    #  - variance is the mean of the subgroup variances + the variance of the subgroup means
-    subgrp_means = matrix(colMeans(X), rows=C, cols=Hin*Win)
-    subgrp_vars = matrix(colVars(X) * ((N-1)/N), rows=C, cols=Hin*Win)  # uncorrected variances
-    mean = rowMeans(subgrp_means)  # shape (C, 1)
-    var = rowMeans(subgrp_vars) + rowVars(subgrp_means)*(((Hin*Win)-1)/(Hin*Win))  # shape (C, 1)
-    # Update moving averages
-    ema_mean_upd = mu*ema_mean + (1-mu)*mean
-    ema_var_upd = mu*ema_var + (1-mu)*var
-  }
-  else {
-    # Use moving averages of mean and variance during testing
-    mean = ema_mean
-    var = ema_var
-    ema_mean_upd = ema_mean
-    ema_var_upd = ema_var
-  }
-
-  # Normalize, shift, and scale
-  # norm = (X-mean)*(var+epsilon)^(-1/2)
-  #      = (X-mean) / sqrt(var+epsilon)
-  centered = bias_add(X, -mean)  # shape (N, C*Hin*Win)
-  norm = bias_multiply(centered, 1/sqrt(var+epsilon))  # shape (N, C*Hin*Win)
-  # out = norm*gamma + beta
-  scaled = bias_multiply(norm, gamma)  # shape (N, C*Hin*Win)
-  out = bias_add(scaled, beta)  # shape (N, C*Hin*Win)
-
-  # Save variables for backward pass
-  cache_mean = mean
-  cache_var = var
-  cache_norm = norm
-}
-
-backward = function(matrix[double] dout, matrix[double] out,
-                    matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
-                    matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
-                    matrix[double] X, matrix[double] gamma, matrix[double] beta,
-                    int C, int Hin, int Win, string mode,
-                    matrix[double] ema_mean, matrix[double] ema_var,
-                    double mu, double epsilon)
-      return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
-  /*
-   * Computes the backward pass for a 2D (spatial) batch normalization
-   * layer.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
-   *  - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
-   *  - ema_mean_upd: Updated exponential moving average of the mean
-   *      from the forward pass, of shape (C, 1).
-   *  - ema_var_upd: Updated exponential moving average of the variance
-   *      from the forward pass, of shape (C, 1).
-   *  - cache_mean: Cache of the batch mean from the forward pass, of
-   *      shape (C, 1).  Note: This is used for performance during
-   *      training.
-   *  - cache_var: Cache of the batch variance from the forward pass,
-   *      of shape (C, 1).  Note: This is used for performance during
-   *      training.
-   *  - cache_norm: Cache of the normalized inputs from the forward
-   *      pass, of shape (N, C*Hin*Win).  Note: This is used for
-   *      performance during training.
-   *  - X: Input data matrix to the forward pass, of
-   *      shape (N, C*Hin*Win).
-   *  - gamma: Scale parameters, of shape (C, 1).
-   *  - beta: Shift parameters, of shape (C, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - mode: 'train' or 'test' to indicate if the model is currently
-   *      being trained or tested.  During training, the current batch
-   *      mean and variance will be used to normalize the inputs, while
-   *      during testing, the exponential average of the mean and
-   *      variance over all previous batches will be used.
-   *  - ema_mean: Exponential moving average of the mean, of
-   *      shape (C, 1).
-   *  - ema_var: Exponential moving average of the variance, of
-   *      shape (C, 1).
-   *  - mu: Momentum value for moving averages.
-   *      Typical values are in the range of [0.9, 0.999].
-   *  - epsilon: Smoothing term to avoid divide by zero errors.
-   *      Typical values are in the range of [1e-5, 1e-3].
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   *  - dgamma: Gradient wrt `gamma`, of shape (C, 1).
-   *  - dbeta: Gradient wrt `beta`, of shape (C, 1).
-   *
-   */
-  N = nrow(X)
-  mean = cache_mean
-  var = cache_var
-  norm = cache_norm
-  centered = bias_add(X, -mean)  # shape (N, C*Hin*Win)
-
-  if (mode == 'train') {
-    # Compute gradients during training
-    dgamma = util::channel_sums(dout*norm, C, Hin, Win)  # shape (C, 1)
-    dbeta = util::channel_sums(dout, C, Hin, Win)  # shape (C, 1)
-    dnorm = bias_multiply(dout, gamma)  # shape (N, C*Hin*Win)
-    dvar = util::channel_sums((-1/2) * bias_multiply(centered, (var+epsilon)^(-3/2)) * dnorm,
-                              C, Hin, Win)  # shape (C, 1)
-    dmean_norm_branch = util::channel_sums(bias_multiply(dnorm, -1/sqrt(var+epsilon)), C, Hin, Win)
-    dmean_var_branch =  util::channel_sums((-2/(N*Hin*Win)) * centered, C, Hin, Win)
-    dmean_var_branch = dmean_var_branch * dvar  # we can't use a function within an expression yet
-    dmean = dmean_norm_branch + dmean_var_branch  # shape (C, 1)
-    dX_norm_branch = bias_multiply(dnorm, 1/sqrt(var+epsilon))
-    dX_mean_branch = (1/(N*Hin*Win)) * bias_add(matrix(0, rows=1, cols=C*Hin*Win), dmean)
-    dX_var_branch = (2/(N*Hin*Win)) * bias_multiply(centered, dvar)
-    dX = dX_norm_branch + dX_mean_branch + dX_var_branch  # shape (N, C*Hin*Win)
-  }
-  else {
-    # Compute gradients during testing
-    dgamma = util::channel_sums(dout*norm, C, Hin, Win)  # shape (C, 1)
-    dbeta = util::channel_sums(dout, C, Hin, Win)  # shape (C, 1)
-    dnorm = bias_multiply(dout, gamma)  # shape (N, C*Hin*Win)
-    dX = bias_multiply(dnorm, 1/sqrt(var+epsilon))  # shape (N, C*Hin*Win)
-  }
-}
-
-init = function(int C)
-    return (matrix[double] gamma, matrix[double] beta,
-            matrix[double] ema_mean, matrix[double] ema_var) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - C: Number of input channels (dimensionality of input depth).
-   *
-   * Outputs:
-   *  - gamma: Scale parameters, of shape (C, 1).
-   *  - beta: Shift parameters, of shape (C, 1).
-   *  - ema_mean: Exponential moving average of the mean, of
-   *      shape (C, 1).
-   *  - ema_var: Exponential moving average of the variance, of
-   *      shape (C, 1).
-   */
-   gamma = matrix(1, rows=C, cols=1)
-   beta = matrix(0, rows=C, cols=1)
-   ema_mean = matrix(0, rows=C, cols=1)
-   ema_var = matrix(1, rows=C, cols=1)
-}
-
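For orientation, here is a minimal usage sketch of the layer API documented above; the toy dimensions, variable names, and hyperparameter values are illustrative only, not part of the library.

source("nn/layers/batch_norm2d.dml") as batch_norm2d

# Toy dimensions (illustrative)
N = 4
C = 3
Hin = 8
Win = 8
X = rand(rows=N, cols=C*Hin*Win)

# Initialize gamma, beta, and the exponential moving averages
[gamma, beta, ema_mean, ema_var] = batch_norm2d::init(C)

# Training-mode forward pass: batch statistics are used and the EMAs are updated
[out, ema_mean, ema_var, cache_mean, cache_var, cache_norm] = batch_norm2d::forward(X, gamma, beta,
    C, Hin, Win, "train", ema_mean, ema_var, 0.9, 1e-5)

# Test-mode forward pass: the accumulated EMAs are used instead of batch statistics
[out_test, tmp1, tmp2, tmp3, tmp4, tmp5] = batch_norm2d::forward(X, gamma, beta,
    C, Hin, Win, "test", ema_mean, ema_var, 0.9, 1e-5)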

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv2d.dml b/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
deleted file mode 100644
index 9d03568..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
+++ /dev/null
@@ -1,194 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Convolutional layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
-                   int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial convolutional layer with
-   * F filters.  The input data has N examples, each represented as a 3D
-   * volume unrolled into a single vector.
-   *
-   * This implementation uses `im2col` internally for each image to
-   * extract local image regions (patches) into columns, and then
-   * performs a matrix multiplication with the filters to compute the
-   * output maps.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      For same output height as input, set `padh = (Hf - 1) / 2`,
-   *      assuming `strideh = 1`.
-   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
-   *      preserves the spatial dimensions of the input.
-   *  - padw: Padding for left and right sides.
-   *      For same output width as input, set `padw = (Wf - 1) / 2`,
-   *      assuming `stridew = 1`.
-   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
-   *      preserves the spatial dimensions of the input.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  F = nrow(W)
-  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
-  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
-  # Create output volume
-  out = matrix(0, rows=N, cols=F*Hout*Wout)
-
-  # Convolution - im2col implementation
-  parfor (n in 1:N) {  # all examples
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
-
-    # Pad image
-    Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0)  # shape (C, (Hin+2*padh)*(Win+2*padw))
-
-    # Extract local image patches into columns with im2col, of shape (C*Hf*Wf, Hout*Wout)
-    Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
-
-    # Convolve patches with filters
-    outn = W %*% Xn_padded_cols + b  # shape (F, Hout*Wout)
-    out[n,] = matrix(outn, rows=1, cols=F*Hout*Wout)  # reshape
-  }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout,
-                    matrix[double] X, matrix[double] W, matrix[double] b,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
-  /*
-   * Computes the backward pass for a 2D spatial convolutional layer
-   * with F filters.
-   *
-   * This implementation uses `im2col` and `col2im` internally.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *  - padw: Padding for left and right sides.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
-   *  - db: Gradient wrt `b`, of shape (F, 1).
-   */
-  N = nrow(X)
-  F = nrow(W)
-
-  # Create gradient volumes
-  # Note: Create convenience gradient volumes for dW and db that will
-  # allow for one gradient to be stored per example, allowing for
-  # parallel computation at the expense of memory.  We will reduce at
-  # the end.
-  dX = matrix(0, rows=N, cols=C*Hin*Win)
-  dWN = matrix(0, rows=N, cols=F*C*Hf*Wf)  # dW = matrix(0, rows=F, cols=C*Hf*Wf)
-  dbN = matrix(0, rows=N, cols=F)  # db = matrix(0, rows=F, cols=1)
-
-  # Partial derivatives for convolution - im2col implementation
-  parfor (n in 1:N) {  # all examples
-    doutn = matrix(dout[n,], rows=F, cols=Hout*Wout)
-
-    # Compute dW
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
-    Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0)  # shape (C, (Hin+2*padh)*(Win+2*padw))
-    Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
-    # dW = dW + doutn %*% t(Xn_padded_cols)
-    dWN[n,] = matrix(doutn %*% t(Xn_padded_cols), rows=1, cols=F*C*Hf*Wf)
-
-    # Compute db
-    # db = db + rowSums(doutn)
-    dbN[n,] = matrix(rowSums(doutn), rows=1, cols=F)
-
-    # Compute dX
-    dXn_padded_cols = t(W) %*% doutn  # shape (C*Hf*Wf, Hout*Wout)
-    dXn_padded = util::col2im(dXn_padded_cols, C, Hin+2*padh, Win+2*padw, Hf, Wf,
-                              strideh, stridew, "add")
-    dXn = util::unpad_image(dXn_padded, Hin, Win, padh, padw)
-    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)  # reshape
-  }
-
-  # Reduce convenience gradient volumes with one gradient per example
-  # into single gradients for W and b.
-  dW = matrix(colSums(dWN), rows=F, cols=C*Hf*Wf)
-  db = matrix(colSums(dbN), rows=F, cols=1)
-}
-
-init = function(int F, int C, int Hf, int Wf)
-    return (matrix[double] W, matrix[double] b) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * We use the heuristic by He et al., which limits the magnification
-   * of inputs/gradients during forward/backward passes by scaling
-   * unit-Gaussian weights by a factor of sqrt(2/n), under the
-   * assumption of relu neurons.
-   *  - http://arxiv.org/abs/1502.01852
-   *
-   * Inputs:
-   *  - F: Number of filters.
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *
-   * Outputs:
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   */
-  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
-  b = matrix(0, rows=F, cols=1)
-}
-
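A small usage sketch of the forward/backward/init functions above; the toy sizes and names are illustrative, and the upstream gradient is random purely to exercise the backward pass.

source("nn/layers/conv2d.dml") as conv2d

# Toy dimensions (illustrative)
N = 2
C = 3
Hin = 8
Win = 8
F = 4
Hf = 3
Wf = 3
X = rand(rows=N, cols=C*Hin*Win)

# He-initialized filters and zero biases
[W, b] = conv2d::init(F, C, Hf, Wf)

# With stride 1 and pad = (Hf-1)/2 = 1, the 8x8 spatial dimensions are preserved
[out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)

# Backward pass given an upstream gradient of shape (N, F*Hout*Wout)
dout = rand(rows=N, cols=F*Hout*Wout)
[dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)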

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
deleted file mode 100644
index bda7a9c..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
+++ /dev/null
@@ -1,160 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Convolutional layer.
- *
- * This implementation uses a built-in operator for higher performance.
- */
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
-                   int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial convolutional layer with
-   * F filters.  The input data has N examples, each represented as a 3D
-   * volume unrolled into a single vector.
-   *
-   * This implementation uses a built-in operator for higher
-   * performance.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      For same output height as input, set `padh = (Hf - 1) / 2`,
-   *      assuming `strideh = 1`.
-   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
-   *      preserves the spatial dimensions of the input.
-   *  - padw: Padding for left and right sides.
-   *      For same output width as input, set `padw = (Wf - 1) / 2`,
-   *      assuming `stridew = 1`.
-   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
-   *      preserves the spatial dimensions of the input.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  F = nrow(W)
-  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
-  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
-  # Convolution - built-in implementation
-  out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
-               stride=[strideh,stridew], padding=[padh,padw])
-
-  # Add bias term to each output filter
-  out = bias_add(out, b)
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout,
-                    matrix[double] X, matrix[double] W, matrix[double] b,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
-  /*
-   * Computes the backward pass for a 2D spatial convolutional layer
-   * with F filters.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      For same output height as input, set `padh = (Hf - 1) / 2`,
-   *      assuming `strideh = 1`.
-   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
-   *      preserves the spatial dimensions of the input.
-   *  - padw: Padding for left and right sides.
-   *      For same output width as input, set `padw = (Wf - 1) / 2`,
-   *      assuming `stridew = 1`.
-   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
-   *      preserves the spatial dimensions of the input.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
-   *  - db: Gradient wrt `b`, of shape (F, 1).
-   */
-  N = nrow(X)
-  F = nrow(W)
-
-  # Partial derivatives for convolution - built-in implementation
-  dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw],
-                              input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
-  dX = conv2d_backward_data(W, dout, stride=[strideh, stridew], padding=[padh,padw],
-                            input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
-
-  # Partial derivatives for bias vector
-  db = rowSums(matrix(colSums(dout), rows=F, cols=Hout*Wout))
-}
-
-init = function(int F, int C, int Hf, int Wf)
-    return (matrix[double] W, matrix[double] b) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * We use the heuristic by He et al., which limits the magnification
-   * of inputs/gradients during forward/backward passes by scaling
-   * unit-Gaussian weights by a factor of sqrt(2/n), under the
-   * assumption of relu neurons.
-   *  - http://arxiv.org/abs/1502.01852
-   *
-   * Inputs:
-   *  - F: Number of filters.
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *
-   * Outputs:
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   */
-  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
-  b = matrix(0, rows=F, cols=1)
-}
-
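Because this layer exposes the same forward/backward/init interface as the `im2col`-based `nn/layers/conv2d.dml` above, switching to the built-in operator only requires changing the source line; a sketch with illustrative sizes:

#source("nn/layers/conv2d.dml") as conv2d          # im2col-based implementation
source("nn/layers/conv2d_builtin.dml") as conv2d   # built-in operator, same interface

# Toy setup (illustrative): 5x5 filters over 28x28 single-channel inputs
N = 2
C = 1
Hin = 28
Win = 28
F = 32
Hf = 5
Wf = 5
X = rand(rows=N, cols=C*Hin*Win)
[W, b] = conv2d::init(F, C, Hf, Wf)

# padh = (Hf-1)/2 = 2 and padw = (Wf-1)/2 = 2 keep the 28x28 spatial size at stride 1
[out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 2, 2)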

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml b/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
deleted file mode 100644
index 63db502..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
+++ /dev/null
@@ -1,78 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Cross-Entropy loss function.
- */
-
-forward = function(matrix[double] pred, matrix[double] y)
-    return (double loss) {
-  /*
-   * Computes the forward pass for a cross-entropy loss function.  The
-   * inputs consist of N examples, each with K dimensions corresponding
-   * to normalized probabilities of K classes.
-   *
-   *   ```
-   *   L_i = -y_i^T * log(pred_i)
-   *   L = (1/N) sum(L_i) for i=1 to N
-   *   ```
-   *
-   * In these equations, `L` is the total loss, `L_i` is the loss for
-   * example `i`, `y_i` is the K-dimensional vector of target class
-   * probabilities, `pred_i` is the K-dimensional vector of predicted
-   * class probabilities, and `N` is the number of examples.
-   *
-   * This can be interpreted as the negative log-likelihood assuming
-   * a Bernoulli distribution generalized to K dimensions, or a
-   * Multinomial with one observation.
-   *
-   * Inputs:
-   *  - pred: Predictions, of shape (N, K).
-   *  - y: Targets, of shape (N, K).
-   *
-   * Outputs:
-   *  - loss: Average loss.
-   */
-  N = nrow(y)
-  eps = 1e-10  # numerical stability to avoid log(0)
-  losses = rowSums(-y * log(pred+eps))
-  loss = sum(losses) / N
-}
-
-backward = function(matrix[double] pred, matrix[double] y)
-    return (matrix[double] dpred) {
-  /*
-   * Computes the backward pass of a cross-entropy loss function.  The
-   * inputs consist of N examples, each with K dimensions corresponding
-   * to normalized probabilities of K classes.
-   *
-   * Inputs:
-   *  - pred: Predictions, of shape (N, K).
-   *  - y: Targets, of shape (N, K).
-   *
-   * Outputs:
-   *  - dpred: Gradient wrt `pred`, of shape (N, K).
-   */
-  N = nrow(y)
-  eps = 1e-10  # numerical stability to avoid divide-by-zero
-  dpred = (1/N) * -y * (1/(pred+eps))
-}
-
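A tiny worked example of the loss above with made-up numbers; in practice `pred` would typically come from a softmax layer.

source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss

# Toy setup (illustrative): N=2 examples, K=3 classes
pred = matrix("0.7 0.2 0.1 0.1 0.1 0.8", rows=2, cols=3)  # row-normalized probabilities
y    = matrix("1 0 0 0 0 1", rows=2, cols=3)              # one-hot targets

loss = cross_entropy_loss::forward(pred, y)     # (-log(0.7) - log(0.8)) / 2 ~= 0.29
dpred = cross_entropy_loss::backward(pred, y)   # approximately -(1/N) * y / pred
print("cross-entropy loss: " + loss)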

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/dropout.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/dropout.dml b/scripts/staging/SystemML-NN/nn/layers/dropout.dml
deleted file mode 100644
index a36878b..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/dropout.dml
+++ /dev/null
@@ -1,76 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Dropout layer.
- */
-
-forward = function(matrix[double] X, double p, int seed)
-    return (matrix[double] out, matrix[double] mask) {
-  /*
-   * Computes the forward pass for an inverted dropout layer.
-   *
-   * Drops the inputs element-wise with probability 1-p (keeping each
-   * input with probability p), and divides by p to maintain the expected
-   * values of those inputs (which are the outputs of neurons) at test time.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (any, any).
-   *  - p: Probability of keeping a neuron output.
-   *  - seed: [Optional: -1] Random number generator seed to allow for
-   *      deterministic evaluation.  Set to -1 for a random seed.
-   *
-   * Outputs:
-   *  - out: Outputs, of same shape as `X`.
-   *  - mask: Dropout mask used to compute the output.
-   */
-  # Normally, we might use something like
-  #    `mask = rand(rows=nrow(X), cols=ncol(X), min=0, max=1, seed=seed) <= p`
-  # to create a dropout mask.  Fortunately, SystemML has a `sparsity` parameter on
-  # the `rand` function that allows us to create a mask directly.
-  if (seed == -1) {
-    mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p)
-  } else {
-    mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p, seed=seed)
-  }
-  out = X * mask / p
-}
-
-backward = function(matrix[double] dout, matrix[double] X, double p, matrix[double] mask)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for an inverted dropout layer.
-   *
-   * Applies the mask to the upstream gradient, and divides by p to
-   * maintain the expected values at test time.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out`, of same shape as `X`.
-   *  - X: Inputs, of shape (any, any).
-   *  - p: Probability of keeping a neuron output.
-   *  - mask: Dropout mask used to compute the output.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of same shape as `X`.
-   */
-  dX = mask / p * dout
-}
-
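A short usage sketch of inverted dropout as implemented above; the toy sizes are illustrative, and the seed of -1 requests a random mask.

source("nn/layers/dropout.dml") as dropout

# Toy activations (illustrative)
X = rand(rows=4, cols=10)
p = 0.5  # probability of keeping each activation

# Training: drop activations and rescale the survivors by 1/p
[out, mask] = dropout::forward(X, p, -1)

# Backward: route the upstream gradient through the same mask
dout = rand(rows=4, cols=10)
dX = dropout::backward(dout, X, p, mask)

# Testing: the layer is simply skipped (use X unchanged), since the 1/p scaling
# during training already preserves the expected activations.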

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
deleted file mode 100644
index b74566d..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
+++ /dev/null
@@ -1,72 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * L1 loss function.
- */
-
-forward = function(matrix[double] pred, matrix[double] y)
-    return (double loss) {
-  /*
-   * Computes the forward pass for an L1 loss function.  The inputs
-   * consist of N examples, each with M dimensions to predict.
-   *
-   *   ```
-   *   L_i = sum_j(abs((pred_i)_j - (y_i)_j)) for all j.
-   *   L = (1/N) sum(L_i) for i=1 to N
-   *   ```
-   *
-   * In these equations, `L` is the total loss, `L_i` is the loss for
-   * example `i`, `y_i` is the vector of targets, `pred_i` is the
-   * vector of predictions, and `N` is the number of examples.
-   *
-   * This can be interpreted as the negative log-likelihood assuming
-   * a Laplace distribution.
-   *
-   * Inputs:
-   *  - pred: Predictions, of shape (N, M).
-   *  - y: Targets, of shape (N, M).
-   *
-   * Outputs:
-   *  - loss: Average loss.
-   */
-  N = nrow(y)
-  losses = rowSums(abs(pred-y))
-  loss = sum(losses) / N
-}
-
-backward = function(matrix[double] pred, matrix[double] y)
-    return (matrix[double] dpred) {
-  /*
-   * Computes the backward pass for an L1 loss function.  The inputs
-   * consist of N examples, each with M dimensions to predict.
-   *
-   * Inputs:
-   *  - pred: Predictions, of shape (N, M).
-   *  - y: Targets, of shape (N, M).
-   *
-   * Outputs:
-   *  - dpred: Gradient wrt `pred`, of shape (N, M).
-   */
-  N = nrow(y)
-  dpred = sign(pred-y) / N
-}
-
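A tiny worked example of the L1 loss formulas above, with made-up numbers:

source("nn/layers/l1_loss.dml") as l1_loss

# Toy predictions and targets: N=2 examples, M=2 dimensions each
pred = matrix("1.0 2.0 3.0 4.0", rows=2, cols=2)
y    = matrix("1.5 2.0 2.0 6.0", rows=2, cols=2)

loss = l1_loss::forward(pred, y)    # ((0.5+0.0) + (1.0+2.0)) / 2 = 1.75
dpred = l1_loss::backward(pred, y)  # sign(pred-y)/N = [[-0.5, 0.0], [0.5, -0.5]]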

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
deleted file mode 100644
index 2b81c0b..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
+++ /dev/null
@@ -1,56 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * L1 regularization.
- */
-
-forward = function(matrix[double] X, double lambda)
-    return (double reg_loss) {
-  /*
-   * Computes the forward pass for an L1 regularization function.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (any, any).
-   *  - lambda: Regularization strength.
-   *      A typical value is 0.01.
-   *
-   * Outputs:
-   *  - reg_loss: Total regularization loss.
-   */
-  reg_loss = lambda * sum(abs(X))
-}
-
-backward = function(matrix[double] X, double lambda)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for an L1 regularization function.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (any, any).
-   *  - lambda: Regularization strength.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of same shape as `X`.
-   */
-  dX = lambda * sign(X)
-}
-
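A sketch of how this term is typically folded into a training objective; `W`, `dW`, and the value of `lambda` are illustrative:

source("nn/layers/l1_reg.dml") as l1_reg

# Toy weights and data-loss gradient (illustrative)
W = rand(rows=10, cols=5)
dW = rand(rows=10, cols=5)
lambda = 0.01

# Add the regularization term to the data loss ...
reg_loss = l1_reg::forward(W, lambda)   # lambda * sum(abs(W)); add to the data loss
# ... and its gradient to the data-loss gradient before the parameter update
dW = dW + l1_reg::backward(W, lambda)   # adds lambda * sign(W)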

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
deleted file mode 100644
index 0482f25..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
+++ /dev/null
@@ -1,72 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * L2 loss function.
- */
-
-forward = function(matrix[double] pred, matrix[double] y)
-    return (double loss) {
-  /*
-   * Computes the forward pass for an L2 loss function.  The inputs
-   * consist of N examples, each with M dimensions to predict.
-   *
-   *   ```
-   *   L_i = (1/2) norm(pred_i - y_i)^2
-   *   L = (1/N) sum(L_i) for i=1 to N
-   *   ```
-   *
-   * In these equations, `L` is the total loss, `L_i` is the loss for
-   * example `i`, `y_i` is the vector of targets, `pred_i` is the
-   * vector of predictions, and `N` is the number of examples.
-   *
-   * This can be interpreted as the negative log-likelihood assuming
-   * a Gaussian distribution.
-   *
-   * Inputs:
-   *  - pred: Predictions, of shape (N, M).
-   *  - y: Targets, of shape (N, M).
-   *
-   * Outputs:
-   *  - loss: Average loss.
-   */
-  N = nrow(y)
-  losses = 0.5 * rowSums((pred-y)^2)
-  loss = sum(losses) / N
-}
-
-backward = function(matrix[double] pred, matrix[double] y)
-    return (matrix[double] dpred) {
-  /*
-   * Computes the backward pass for an L2 loss function.  The inputs
-   * consist of N examples, each with M dimensions to predict.
-   *
-   * Inputs:
-   *  - pred: Predictions, of shape (N, M).
-   *  - y: Targets, of shape (N, M).
-   *
-   * Outputs:
-   *  - dpred: Gradient wrt `pred`, of shape (N, M).
-   */
-  N = nrow(y)
-  dpred = (pred-y) / N
-}
-
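A tiny worked example mirroring the L1 case, with made-up numbers:

source("nn/layers/l2_loss.dml") as l2_loss

# Toy predictions and targets: N=2 examples, M=2 dimensions each
pred = matrix("1.0 2.0 3.0 4.0", rows=2, cols=2)
y    = matrix("1.0 2.0 2.0 6.0", rows=2, cols=2)

loss = l2_loss::forward(pred, y)    # (0.5*(0+0) + 0.5*(1+4)) / 2 = 1.25
dpred = l2_loss::backward(pred, y)  # (pred-y)/N = [[0.0, 0.0], [0.5, -1.0]]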

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
deleted file mode 100644
index 7255efe..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
+++ /dev/null
@@ -1,56 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * L2 regularization.
- */
-
-forward = function(matrix[double] X, double lambda)
-    return (double reg_loss) {
-  /*
-   * Computes the forward pass for an L2 regularization function.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (any, any).
-   *  - lambda: Regularization strength.
-   *      A typical value is 0.01.
-   *
-   * Outputs:
-   *  - reg_loss: Total regularization loss.
-   */
-  reg_loss = 0.5 * lambda * sum(X^2)
-}
-
-backward = function(matrix[double] X, double lambda)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for an L2 regularization function.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (any, any).
-   *  - lambda: Regularization strength.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of same shape as `X`.
-   */
-  dX = lambda * X
-}
-
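A sketch of the usual weight-decay pattern built on this layer; the regularization strength and matrices are illustrative:

source("nn/layers/l2_reg.dml") as l2_reg

# Toy weights and data-loss gradient (illustrative)
W = rand(rows=10, cols=5)
dW = rand(rows=10, cols=5)
lambda = 5e-04

reg_loss = l2_reg::forward(W, lambda)   # 0.5 * lambda * sum(W^2); add to the data loss
dW = dW + l2_reg::backward(W, lambda)   # adds lambda * W to the data-loss gradient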

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml b/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
deleted file mode 100644
index 15914f7..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
+++ /dev/null
@@ -1,76 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Log loss function.
- */
-
-forward = function(matrix[double] pred, matrix[double] y)
-    return (double loss) {
-  /*
-   * Computes the forward pass for a log loss function.
-   *
-   *   ```
-   *   L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i)
-   *   L = (1/N) sum(L_i) for i=1 to N
-   *   ```
-   *
-   * In these equations, `L` is the total loss, `L_i` is the loss for
-   * example `i`, `y_i` is the binary target, `pred_i` is probability
-   * of the true class (i.e. `y=1`), and `N` is the number of examples.
-   *
-   * This can be interpreted as the negative log-likelihood assuming
-   * a Bernoulli distribution.
-   *
-   * Inputs:
-   *  - pred: Predictions, of shape (N, 1).
-   *      Predictions should be probabilities of the true
-   *      class (i.e. probability of `y=1`).
-   *  - y: Targets, of shape (N, 1).
-   *      Targets should be binary in the set {0, 1}.
-   *
-   * Outputs:
-   *  - loss: Average loss.
-   */
-  N = nrow(y)
-  losses = -y*log(pred) - (1-y)*log(1-pred)
-  loss = sum(losses) / N
-}
-
-backward = function(matrix[double] pred, matrix[double] y)
-    return (matrix[double] dpred) {
-  /*
-   * Computes the backward pass for a log loss function.
-   *
-   * Inputs:
-   *  - pred: Predictions, of shape (N, 1).
-   *      Predictions should be probabilities of the true
-   *      class (i.e. probability of `y=1`).
-   *  - y: Targets, of shape (N, 1).
-   *      Targets should be binary in the set {0, 1}.
-   *
-   * Outputs:
-   *  - dpred: Gradient wrt `pred`, of shape (N, 1).
-   */
-  N = nrow(y)
-  dpred = (1/N) * (pred-y) / (pred*(1-pred))
-}
-
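A tiny worked example of the binary log loss above, with made-up probabilities:

source("nn/layers/log_loss.dml") as log_loss

# Toy binary targets and predicted probabilities of y=1 (illustrative)
y    = matrix("1 0 1 0", rows=4, cols=1)
pred = matrix("0.9 0.2 0.6 0.4", rows=4, cols=1)

loss = log_loss::forward(pred, y)    # mean of -log(0.9), -log(0.8), -log(0.6), -log(0.6)
dpred = log_loss::backward(pred, y)  # (1/N) * (pred-y) / (pred*(1-pred))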

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/lstm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/lstm.dml b/scripts/staging/SystemML-NN/nn/layers/lstm.dml
deleted file mode 100644
index a75add4..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/lstm.dml
+++ /dev/null
@@ -1,260 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * LSTM layer.
- */
-source("nn/layers/sigmoid.dml") as sigmoid
-source("nn/layers/tanh.dml") as tanh
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
-                   boolean return_sequences, matrix[double] out0, matrix[double] c0)
-    return (matrix[double] out, matrix[double] c,
-            matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog) {
-  /*
-   * Computes the forward pass for an LSTM layer with M neurons.
-   * The input data has N sequences of T examples, each with D features.
-   *
-   * In an LSTM, an internal cell state is maintained, additive
-   * interactions operate over the cell state at each timestep, and
-   * some amount of this cell state is exposed as output at each
-   * timestep.  Additionally, the output of the previous timestep is fed
-   * back in as an additional input at the current timestep.
-   *
-   * Reference:
-   *  - Long Short-Term Memory, Hochreiter, 1997
-   *    - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, T*D).
-   *  - W: Weights, of shape (D+M, 4M).
-   *  - b: Biases, of shape (1, 4M).
-   *  - T: Length of example sequences (number of timesteps).
-   *  - D: Dimensionality of the input features (number of features).
-   *  - return_sequences: Whether to return `out` at all timesteps,
-   *      or just for the final timestep.
-   *  - out0: Outputs from previous timestep, of shape (N, M).
-   *      Note: This is *optional* and could just be an empty matrix.
-   *  - c0: Initial cell state, of shape (N, M).
-   *      Note: This is *optional* and could just be an empty matrix.
-   *
-   * Outputs:
-   *  - out: If `return_sequences` is True, outputs for all timesteps,
-   *      of shape (N, T*M).  Else, outputs for the final timestep, of
-   *      shape (N, M).
-   *  - c: Cell state for final timestep, of shape (N, M).
-   *  - cache_out: Cache of outputs, of shape (T, N*M).
-   *      Note: This is used for performance during training.
-   *  - cache_c: Cache of cell state, of shape (T, N*M).
-   *      Note: This is used for performance during training.
-   *  - cache_ifog: Cache of intermediate values, of shape (T, N*4M).
-   *      Note: This is used for performance during training.
-   */
-  N = nrow(X)
-  M = as.integer(ncol(W)/4)
-  out_prev = out0
-  c_prev = c0
-  c = c_prev
-  if (return_sequences) {
-    out = matrix(0, rows=N, cols=T*M)
-  }
-  else {
-    out = matrix(0, rows=N, cols=M)
-  }
-  # caches to be used during the backward pass for performance
-  cache_out = matrix(0, rows=T, cols=N*M)
-  cache_c = matrix(0, rows=T, cols=N*M)
-  cache_ifog = matrix(0, rows=T, cols=N*4*M)
-
-  for (t in 1:T) {  # each timestep
-    X_t = X[,(t-1)*D+1:t*D]  # shape (N, D)
-    input = cbind(X_t, out_prev)  # shape (N, D+M)
-    ifog = input %*% W + b  # input, forget, output, and g gates; shape (N, 4M)
-    tmp = sigmoid::forward(ifog[,1:3*M])  # i,f,o gates squashed with sigmoid
-    ifog[,1:3*M] = tmp
-    tmp = tanh::forward(ifog[,3*M+1:4*M])  # g gate squashed with tanh
-    ifog[,3*M+1:4*M] = tmp
-    # c_t = f*prev_c + i*g
-    c = ifog[,M+1:2*M]*c_prev + ifog[,1:M]*ifog[,3*M+1:4*M]  # shape (N, M)
-    # out_t = o*tanh(c)
-    tmp = tanh::forward(c)
-    out_t = ifog[,2*M+1:3*M] * tmp  # shape (N, M)
-
-    # store
-    if (return_sequences) {
-      out[,(t-1)*M+1:t*M] = out_t
-    }
-    else {
-      out = out_t
-    }
-    out_prev = out_t
-    c_prev = c
-    cache_out[t,] = matrix(out_t, rows=1, cols=N*M)  # reshape
-    cache_c[t,] = matrix(c, rows=1, cols=N*M)  # reshape
-    cache_ifog[t,] = matrix(ifog, rows=1, cols=N*4*M)  # reshape
-  }
-}
-
-backward = function(matrix[double] dout, matrix[double] dc,
-                    matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
-                    boolean given_sequences, matrix[double] out0, matrix[double] c0,
-                    matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog)
-    return (matrix[double] dX, matrix[double] dW, matrix[double] db,
-            matrix[double] dout0, matrix[double] dc0) {
-  /*
-   * Computes the backward pass for an LSTM layer with M neurons.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out`.  If `given_sequences` is `True`,
-   *      contains gradients on outputs for all timesteps, of
-   *      shape (N, T*M). Else, contains the gradient on the output
-   *      for the final timestep, of shape (N, M).
-   *  - dc: Gradient wrt `c` (from later in time), of shape (N, M).
-   *      This would come from later in time if the cell state was used
-   *      downstream as the initial cell state for another LSTM layer.
-   *      Typically, this would be used when a sequence was cut at
-   *      timestep `T` and then continued in the next batch.  If `c`
-   *      was not used downstream, then `dc` would be an empty matrix.
-   *  - X: Inputs, of shape (N, T*D).
-   *  - W: Weights, of shape (D+M, 4M).
-   *  - b: Biases, of shape (1, 4M).
-   *  - T: Length of example sequences (number of timesteps).
-   *  - D: Dimensionality of the input features.
-   *  - given_sequences: Whether `dout` is for all timesteps,
-   *      or just for the final timestep.  This is based on whether
-   *      `return_sequences` was true in the forward pass.
-   *  - out0: Outputs from previous timestep, of shape (N, M).
-   *      Note: This is *optional* and could just be an empty matrix.
-   *  - c0: Initial cell state, of shape (N, M).
-   *      Note: This is *optional* and could just be an empty matrix.
-   *  - cache_out: Cache of outputs, of shape (T, N*M).
-   *      Note: This is used for performance during training.
-   *  - cache_c: Cache of cell state, of shape (T, N*M).
-   *      Note: This is used for performance during training.
-   *  - cache_ifog: Cache of intermediate values, of shape (T, N*4*M).
-   *      Note: This is used for performance during training.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, T*D).
-   *  - dW: Gradient wrt `W`, of shape (D+M, 4M).
-   *  - db: Gradient wrt `b`, of shape (1, 4M).
-   *  - dout0: Gradient wrt `out0`, of shape (N, M).
-   *  - dc0: Gradient wrt `c0`, of shape (N, M).
-   */
-  N = nrow(X)
-  M = as.integer(ncol(W)/4)
-  dX = matrix(0, rows=N, cols=T*D)
-  dW = matrix(0, rows=D+M, cols=4*M)
-  db = matrix(0, rows=1, cols=4*M)
-  dout0 = matrix(0, rows=N, cols=M)
-  dc0 = matrix(0, rows=N, cols=M)
-  dct = dc
-  if (!given_sequences) {
-    # only given dout for output at final timestep, so prepend empty douts for all other timesteps
-    dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout)  # shape (N, T*M)
-  }
-
-  t = T
-  for (iter in 1:T) {  # each timestep in reverse order
-    X_t = X[,(t-1)*D+1:t*D]  # shape (N, D)
-    dout_t = dout[,(t-1)*M+1:t*M]  # shape (N, M)
-    out_t = matrix(cache_out[t,], rows=N, cols=M)  # shape (N, M)
-    ct = matrix(cache_c[t,], rows=N, cols=M)  # shape (N, M)
-    if (t == 1) {
-      out_prev = out0  # shape (N, M)
-      c_prev = c0  # shape (N, M)
-    }
-    else {
-      out_prev = matrix(cache_out[t-1,], rows=N, cols=M)  # shape (N, M)
-      c_prev = matrix(cache_c[t-1,], rows=N, cols=M)  # shape (N, M)
-    }
-    input = cbind(X_t, out_prev)  # shape (N, D+M)
-    ifog = matrix(cache_ifog[t,], rows=N, cols=4*M)
-    i = ifog[,1:M]  # input gate, shape (N, M)
-    f = ifog[,M+1:2*M]  # forget gate, shape (N, M)
-    o = ifog[,2*M+1:3*M]  # output gate, shape (N, M)
-    g = ifog[,3*M+1:4*M]  # g gate, shape (N, M)
-
-    tmp = tanh::backward(dout_t, ct)
-    dct = dct + o*tmp  # shape (N, M)
-    tmp = tanh::forward(ct)
-    do = tmp * dout_t  # output gate, shape (N, M)
-    df = c_prev * dct  # forget gate, shape (N, M)
-    dc_prev = f * dct  # shape (N, M)
-    di = g * dct  # input gate, shape (N, M)
-    dg = i * dct  # g gate, shape (N, M)
-
-    di_raw = i * (1-i) * di
-    df_raw = f * (1-f) * df
-    do_raw = o * (1-o) * do
-    dg_raw = (1-g^2) * dg
-    difog_raw = cbind(di_raw, cbind(df_raw, cbind(do_raw, dg_raw)))  # shape (N, 4M)
-
-    dW = dW + t(input) %*% difog_raw  # shape (D+M, 4M)
-    db = db + colSums(difog_raw)  # shape (1, 4M)
-    dinput = difog_raw %*% t(W)  # shape (N, D+M)
-    dX[,(t-1)*D+1:t*D] = dinput[,1:D]
-    dout_prev = dinput[,D+1:D+M]  # shape (N, M)
-    if (t == 1) {
-      dout0 = dout_prev  # shape (N, M)
-      dc0 = dc_prev  # shape (N, M)
-    }
-    else {
-      dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev  # shape (N, M)
-      dct = dc_prev  # shape (N, M)
-    }
-    t = t - 1
-  }
-}
-
-init = function(int N, int D, int M)
-    return (matrix[double] W, matrix[double] b, matrix[double] out0, matrix[double] c0) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * We use the Glorot uniform heuristic which limits the magnification
-   * of inputs/gradients during forward/backward passes by scaling
-   * uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
-   *  - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
-   *
-   * Inputs:
-   *  - N: Number of examples in batch.
-   *  - D: Dimensionality of the input features (number of features).
-   *  - M: Number of neurons in this layer.
-   *
-   * Outputs:
-   *  - W: Weights, of shape (D+M, 4M).
-   *  - b: Biases, of shape (1, 4M).
-   *  - out0: Empty previous timestep output matrix, of shape (N, M).
-   *  - c0: Empty initial cell state matrix, of shape (N, M).
-   */
-  fan_in = D+M
-  fan_out = 4*M
-  scale = sqrt(6/(fan_in+fan_out))
-  W = rand(rows=D+M, cols=4*M, min=-scale, max=scale, pdf="uniform")
-  b = matrix(0, rows=1, cols=4*M)
-  out0 = matrix(0, rows=N, cols=M)
-  c0 = matrix(0, rows=N, cols=M)
-}
-
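A minimal usage sketch of the LSTM API above; all dimensions are toy values and the upstream gradients are random, purely to show the expected shapes.

source("nn/layers/lstm.dml") as lstm

# Toy dimensions (illustrative): N sequences of T steps with D features, M neurons
N = 2
T = 5
D = 8
M = 16
X = rand(rows=N, cols=T*D)

[W, b, out0, c0] = lstm::init(N, D, M)

# Forward pass returning the outputs at every timestep, of shape (N, T*M)
[out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, TRUE, out0, c0)

# Backward pass given upstream gradients on all timestep outputs; dc is all zeros
# here since the final cell state is not consumed downstream.
dout = rand(rows=N, cols=T*M)
dc = matrix(0, rows=N, cols=M)
[dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, TRUE, out0, c0,
                                          cache_out, cache_c, cache_ifog)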

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
deleted file mode 100644
index fba1a4c..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
+++ /dev/null
@@ -1,159 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Max Pooling layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * This implementation uses `im2col` internally for each image to
-   * extract local image regions (patches) of each channel slice into
-   * columns, and then performs max pooling over the patches to compute
-   * the output maps.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
-  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-  pad_value = -1/0  # in max pooling we pad with -infinity
-
-  # Create output volume
-  out = matrix(0, rows=N, cols=C*Hout*Wout)
-
-  # Max pooling - im2col implementation
-  parfor (n in 1:N) {  # all examples
-    img = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
-
-    if (padh > 0 | padw > 0) {
-      # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
-      img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
-    }
-
-    img_maxes = matrix(0, rows=C, cols=Hout*Wout)  # zeros
-    parfor (c in 1:C) {  # all channels
-      # Extract local image slice patches into columns with im2col, of shape (Hf*Wf, Hout*Wout)
-      img_slice_cols = util::im2col(img[c,], Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
-
-      # Max pooling on patches
-      img_maxes[c,] = colMaxs(img_slice_cols)
-    }
-
-    out[n,] = matrix(img_maxes, rows=1, cols=C*Hout*Wout)
-  }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   */
-  N = nrow(X)
-  pad_value = -1/0  # in max pooling we pad with -infinity
-
-  # Create gradient volume
-  dX = matrix(0, rows=N, cols=C*Hin*Win)
-
-  # Gradient of max pooling
-  parfor (n in 1:N, check=0) {  # all examples
-    img = matrix(X[n,], rows=C, cols=Hin*Win)
-    if (padh > 0 | padw > 0) {
-      # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
-      img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
-    }
-
-    dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
-    parfor (c in 1:C, check=0) {  # all channels
-      img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
-      for (hout in 1:Hout, check=0) {  # all output rows
-        hin = (hout-1)*strideh + 1
-        for (wout in 1:Wout) {  # all output columns
-          win = (wout-1)*stridew + 1
-          img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
-          max_val_ind = img_slice_patch == max(img_slice_patch)  # max value indicator matrix
-          # gradient passes through only for the max value(s) in this patch
-          dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
-          dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
-                                                   + dimg_slice_patch
-        }
-      }
-      dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
-    }
-
-    if (padh > 0 | padw > 0) {
-      # Unpad image gradient
-      dimg = util::unpad_image(dimg, Hin, Win, padh, padw)  # shape (C, Hin*Win)
-    }
-    dX[n,] = matrix(dimg, rows=1, cols=C*Hin*Win)
-  }
-}
-
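As a quick sanity check of the output-size formula above, Hout = floor((Hin + 2*padh - Hf)/strideh + 1): pooling a 4x4 input with a 2x2 filter, stride 2, and no padding gives Hout = Wout = floor((4 + 0 - 2)/2 + 1) = 2, so each channel's 4x4 map is reduced to a 2x2 output map. This is the same configuration exercised by the max pooling gradient checks further below.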

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
deleted file mode 100644
index 880f818..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
+++ /dev/null
@@ -1,103 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Max Pooling layer.
- *
- * This implementation uses a built-in operator for higher performance.
- */
-
-forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * This implementation uses a built-in operator for higher
-   * performance.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
-  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
-  # Max pooling - built-in implementation
-  out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
-                 stride=[strideh,stridew], padding=[padh,padw])
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   */
-  N = nrow(X)
-
-  # Gradient of max pooling
-  dX = max_pool_backward(X, dout, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
-                         stride=[strideh,stridew], padding=[padh,padw])
-}
-
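For readers unfamiliar with the calling convention, a minimal usage sketch of this built-in layer might look as follows (a hypothetical snippet, assuming the script is run from a directory where the `nn/` package resolves, as in the tests below):

  source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin

  # 2 examples, 2 channels, 4x4 inputs; 2x2 pooling with stride 2, no padding
  N = 2
  C = 2
  Hin = 4
  Win = 4
  X = rand(rows=N, cols=C*Hin*Win)
  [out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
  dout = rand(rows=N, cols=C*Hout*Wout)  # stand-in for an upstream gradient
  dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, 2, 2, 2, 2, 0, 0)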

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/relu.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/relu.dml b/scripts/staging/SystemML-NN/nn/layers/relu.dml
deleted file mode 100644
index 93a6e90..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/relu.dml
+++ /dev/null
@@ -1,59 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Rectified Linear Unit (ReLU) nonlinearity layer.
- */
-
-forward = function(matrix[double] X)
-    return (matrix[double] out) {
-  /*
-   * Computes the forward pass for a ReLU nonlinearity layer.
-   *
-   * Performs an element-wise evaluation of `f(input) = max(0, input)`.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (any, any).
-   *
-   * Outputs:
-   *  - out: Outputs, of same shape as `X`.
-   */
-  out = max(X, 0)
-}
-
-backward = function(matrix[double] dout, matrix[double] X)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for a ReLU nonlinearity layer.
-   *
-   * Essentially performs a pass-through of the upstream gradient
-   * for cells > 0.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
-   *  - X: Previous input data matrix, of shape (any, any).
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of same shape as `X`.
-   */
-   dX = (X > 0) * dout
-}
-
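As a concrete illustration of the two functions above: for an input row X = [-1, 2], the forward pass gives out = max(X, 0) = [0, 2], and with an upstream gradient dout = [3, 4] the backward pass gives dX = (X > 0) * dout = [0, 4], i.e. the gradient is passed through only where the input was positive.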



[08/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn` library to `scripts/nn`

Posted by du...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/grad_check.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/grad_check.dml b/scripts/nn/test/grad_check.dml
new file mode 100644
index 0000000..f3bc9a7
--- /dev/null
+++ b/scripts/nn/test/grad_check.dml
@@ -0,0 +1,1769 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Gradient checks for various architectures.
+ */
+source("nn/layers/affine.dml") as affine
+source("nn/layers/batch_norm1d.dml") as batch_norm1d
+source("nn/layers/batch_norm2d.dml") as batch_norm2d
+source("nn/layers/conv2d.dml") as conv2d
+source("nn/layers/conv2d_builtin.dml") as conv2d_builtin
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/dropout.dml") as dropout
+source("nn/layers/l1_loss.dml") as l1_loss
+source("nn/layers/l1_reg.dml") as l1_reg
+source("nn/layers/l2_loss.dml") as l2_loss
+source("nn/layers/l2_reg.dml") as l2_reg
+source("nn/layers/log_loss.dml") as log_loss
+source("nn/layers/lstm.dml") as lstm
+source("nn/layers/max_pool2d.dml") as max_pool2d
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
+source("nn/layers/relu.dml") as relu
+source("nn/layers/rnn.dml") as rnn
+source("nn/layers/scale_shift1d.dml") as scale_shift1d
+source("nn/layers/scale_shift2d.dml") as scale_shift2d
+source("nn/layers/sigmoid.dml") as sigmoid
+source("nn/layers/softmax.dml") as softmax
+source("nn/layers/tanh.dml") as tanh
+source("nn/test/conv2d_simple.dml") as conv2d_simple
+source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
+source("nn/test/util.dml") as test_util
+
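+# Each check below follows the same pattern: perturb a single entry by +/- h,
+# recompute the loss, form the centered difference (lossph - lossmh) / (2*h) as
+# the numerical gradient, and compare it against the analytical gradient via
+# test_util::check_rel_grad_error.
+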
+affine = function() {
+  /*
+   * Gradient check for the affine layer.
+   */
+  print("Grad checking the affine layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  D = 100 # num features
+  M = 10 # num neurons
+  X = rand(rows=N, cols=D)
+  y = rand(rows=N, cols=M)
+  [W, b] = affine::init(D, M)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = affine::forward(X, W, b)
+  dout = l2_loss::backward(out, y)
+  [dX, dW, db] = affine::backward(dout, X, W, b)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = affine::forward(X, W, b)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = affine::forward(X, W, b)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W.")
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      outmh = affine::forward(X, W, b)
+      lossmh = l2_loss::forward(outmh, y)
+      W[i,j] = old + h
+      outph = affine::forward(X, W, b)
+      lossph = l2_loss::forward(outph, y)
+      W[i,j] = old  # reset
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b.")
+  for (i in 1:nrow(b)) {
+    for (j in 1:ncol(b)) {
+      # Compute numerical derivative
+      old = as.scalar(b[i,j])
+      b[i,j] = old - h
+      outmh = affine::forward(X, W, b)
+      lossmh = l2_loss::forward(outmh, y)
+      b[i,j] = old + h
+      outph = affine::forward(X, W, b)
+      lossph = l2_loss::forward(outph, y)
+      b[i,j] = old  # reset
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+    }
+  }
+}
+
+batch_norm1d = function() {
+  /*
+   * Gradient check for the 1D batch normalization layer.
+   */
+  print("Grad checking the 1D batch normalization layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  D = 100 # num features
+  mu = 0.9  # momentum
+  eps = 1e-5  # epsilon
+  X = rand(rows=N, cols=D)
+  y = rand(rows=N, cols=D)
+  gamma = rand(rows=1, cols=D)
+  beta = rand(rows=1, cols=D)
+  ema_mean = rand(rows=1, cols=D)
+  ema_var = rand(rows=1, cols=D)
+  #[dummy, dummy, ema_mean, ema_var] = batch_norm1d::init(D)
+
+  # Check training & testing modes
+  for (i in 1:2) {
+    if (i == 1)
+      mode = 'train'
+    else
+      mode = 'test'
+    print(" - Grad checking the '"+mode+"' mode.")
+
+    # Compute analytical gradients of loss wrt parameters
+    [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+        batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+    dout = l2_loss::backward(out, y)
+    [dX, dgamma, dbeta] = batch_norm1d::backward(dout, out, ema_mean_upd, ema_var_upd,
+                                                 cache_mean, cache_var, cache_norm,
+                                                 X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+
+    # Grad check
+    h = 1e-5
+    print("   - Grad checking X.")
+    for (i in 1:nrow(X)) {
+      for (j in 1:ncol(X)) {
+        # Compute numerical derivative
+        old = as.scalar(X[i,j])
+        X[i,j] = old - h
+        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+        lossmh = l2_loss::forward(outmh, y)
+        X[i,j] = old + h
+        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+        lossph = l2_loss::forward(outph, y)
+        X[i,j] = old  # reset
+        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      }
+    }
+
+    print("   - Grad checking gamma.")
+    for (i in 1:nrow(gamma)) {
+      for (j in 1:ncol(gamma)) {
+        # Compute numerical derivative
+        old = as.scalar(gamma[i,j])
+        gamma[i,j] = old - h
+        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+        lossmh = l2_loss::forward(outmh, y)
+        gamma[i,j] = old + h
+        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+        lossph = l2_loss::forward(outph, y)
+        gamma[i,j] = old  # reset
+        dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+                                                    lossph, lossmh)
+      }
+    }
+
+    print("   - Grad checking beta.")
+    for (i in 1:nrow(beta)) {
+      for (j in 1:ncol(beta)) {
+        # Compute numerical derivative
+        old = as.scalar(beta[i,j])
+        beta[i,j] = old - h
+        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+        lossmh = l2_loss::forward(outmh, y)
+        beta[i,j] = old + h
+        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+        lossph = l2_loss::forward(outph, y)
+        beta[i,j] = old  # reset
+        dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+                                                    lossph, lossmh)
+      }
+    }
+  }
+}
+
+batch_norm2d = function() {
+  /*
+   * Gradient check for the 2D (spatial) batch normalization layer.
+   */
+  print("Grad checking the 2D (spatial) batch normalization layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  C = 2  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  mu = 0.9  # momentum
+  eps = 1e-5  # epsilon
+  X = rand(rows=N, cols=C*Hin*Win)
+  y = rand(rows=N, cols=C*Hin*Win)
+  gamma = rand(rows=C, cols=1)
+  beta = rand(rows=C, cols=1)
+  ema_mean = rand(rows=C, cols=1)
+  ema_var = rand(rows=C, cols=1)
+  #[dummy, dummy, ema_mean, ema_var] = batch_norm2d::init(C)
+
+  # Check training & testing modes
+  for (i in 1:2) {
+    if (i == 1)
+      mode = 'train'
+    else
+      mode = 'test'
+    print(" - Grad checking the '"+mode+"' mode.")
+
+    # Compute analytical gradients of loss wrt parameters
+    [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+        batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+    dout = l2_loss::backward(out, y)
+    [dX, dgamma, dbeta] = batch_norm2d::backward(dout, out, ema_mean_upd, ema_var_upd,
+                                                 cache_mean, cache_var, cache_norm,
+                                                 X, gamma, beta, C, Hin, Win, mode,
+                                                 ema_mean, ema_var, mu, eps)
+
+    # Grad check
+    h = 1e-5
+    print("   - Grad checking X.")
+    for (i in 1:nrow(X)) {
+      for (j in 1:ncol(X)) {
+        # Compute numerical derivative
+        old = as.scalar(X[i,j])
+        X[i,j] = old - h
+        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+        lossmh = l2_loss::forward(outmh, y)
+        X[i,j] = old + h
+        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+        lossph = l2_loss::forward(outph, y)
+        X[i,j] = old  # reset
+        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      }
+    }
+
+    print("   - Grad checking gamma.")
+    for (i in 1:nrow(gamma)) {
+      for (j in 1:ncol(gamma)) {
+        # Compute numerical derivative
+        old = as.scalar(gamma[i,j])
+        gamma[i,j] = old - h
+        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+        lossmh = l2_loss::forward(outmh, y)
+        gamma[i,j] = old + h
+        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+        lossph = l2_loss::forward(outph, y)
+        gamma[i,j] = old  # reset
+        dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+                                                    lossph, lossmh)
+      }
+    }
+
+    print("   - Grad checking beta.")
+    for (i in 1:nrow(beta)) {
+      for (j in 1:ncol(beta)) {
+        # Compute numerical derivative
+        old = as.scalar(beta[i,j])
+        beta[i,j] = old - h
+        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+        lossmh = l2_loss::forward(outmh, y)
+        beta[i,j] = old + h
+        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+        lossph = l2_loss::forward(outph, y)
+        beta[i,j] = old  # reset
+        dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+                                                    lossph, lossmh)
+      }
+    }
+  }
+}
+
+conv2d = function() {
+  /*
+   * Gradient check for the 2D convolutional layer using `im2col`.
+   */
+  print("Grad checking the `im2col` 2D convolutional layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  F = 2  # num filters
+  Hf = 3  # filter height
+  Wf = 3  # filter width
+  stride = 1
+  pad = 1
+  X = rand(rows=N, cols=C*Hin*Win)
+  y = rand(rows=N, cols=F*Hin*Win)
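+  # With Hf = Wf = 3, stride = 1, and pad = 1, the output spatial size equals
+  # the input size (Hout = Hin, Wout = Win), hence targets of shape (N, F*Hin*Win).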
+
+  # Create layers
+  [W, b] = conv2d::init(F, C, Hf, Wf)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  dout = l2_loss::backward(out, y)
+  [dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                  pad, pad)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W.")
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      W[i,j] = old + h
+      [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      W[i,j] = old  # reset
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b.")
+  for (i in 1:nrow(b)) {
+    for (j in 1:ncol(b)) {
+      # Compute numerical derivative
+      old = as.scalar(b[i,j])
+      b[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      b[i,j] = old + h
+      [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      b[i,j] = old  # reset
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+    }
+  }
+}
+
+conv2d_builtin = function() {
+  /*
+   * Gradient check for the 2D convolutional layer using built-in
+   * functions.
+   */
+  print("Grad checking the built-in 2D convolutional layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  F = 2  # num filters
+  Hf = 3  # filter height
+  Wf = 3  # filter width
+  stride = 1
+  pad = 1
+  X = rand(rows=N, cols=C*Hin*Win)
+  y = rand(rows=N, cols=F*Hin*Win)
+
+  # Create layers
+  [W, b] = conv2d_builtin::init(F, C, Hf, Wf)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                              pad, pad)
+  dout = l2_loss::backward(out, y)
+  [dX, dW, db] = conv2d_builtin::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
+                                          stride, stride, pad, pad)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W.")
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      W[i,j] = old + h
+      [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      W[i,j] = old  # reset
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b.")
+  for (i in 1:nrow(b)) {
+    for (j in 1:ncol(b)) {
+      # Compute numerical derivative
+      old = as.scalar(b[i,j])
+      b[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      b[i,j] = old + h
+      [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      b[i,j] = old  # reset
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+    }
+  }
+}
+
+conv2d_simple = function() {
+  /*
+   * Gradient check for the simple reference 2D convolutional layer.
+   */
+  print("Grad checking the simple reference 2D convolutional layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  F = 2  # num filters
+  Hf = 3  # filter height
+  Wf = 3  # filter width
+  stride = 1
+  pad = 1
+  X = rand(rows=N, cols=C*Hin*Win)
+  y = rand(rows=N, cols=F*Hin*Win)
+
+  # Create layers
+  [W, b] = conv2d_simple::init(F, C, Hf, Wf)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  dout = l2_loss::backward(out, y)
+  [dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
+                                         stride, stride, pad, pad)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W.")
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      W[i,j] = old + h
+      [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      W[i,j] = old  # reset
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b.")
+  for (i in 1:nrow(b)) {
+    for (j in 1:ncol(b)) {
+      # Compute numerical derivative
+      old = as.scalar(b[i,j])
+      b[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      b[i,j] = old + h
+      [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      b[i,j] = old  # reset
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+    }
+  }
+}
+
+cross_entropy_loss = function() {
+  /*
+   * Gradient check for the cross-entropy loss function.
+   */
+  print("Grad checking the cross-entropy loss function.")
+
+  # Generate data
+  N = 3 # num examples
+  K = 10 # num targets
+  pred = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
+  pred = pred / rowSums(pred)  # normalized probs
+  y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
+  y = y / rowSums(y)  # normalized probs
+
+  # Compute analytical gradient
+  dpred = cross_entropy_loss::backward(pred, y)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(pred)) {
+    for (j in 1:ncol(pred)) {
+      # Compute numerical derivative
+      old = as.scalar(pred[i,j])
+      pred[i,j] = old - h
+      lossmh = cross_entropy_loss::forward(pred, y)
+      pred[i,j] = old + h
+      lossph = cross_entropy_loss::forward(pred, y)
+      pred[i,j] = old  # reset
+      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+    }
+  }
+}
+
+dropout = function() {
+  /*
+   * Gradient check for the (inverted) dropout layer.
+   */
+  print("Grad checking the (inverted) dropout layer with L2 loss.")
+
+  # Generate data
+  N = 3  # num examples
+  M = 100  # num neurons
+  p = 0.5  # probability of dropping neuron output
+  seed = as.integer(floor(as.scalar(rand(rows=1, cols=1, min=1, max=100000))))  # random seed
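+  # Reusing this fixed seed for every forward call below keeps the dropout mask
+  # identical across the analytical and numerical evaluations.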
+  X = rand(rows=N, cols=M)
+  y = rand(rows=N, cols=M)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, mask] = dropout::forward(X, p, seed)
+  dout = l2_loss::backward(out, y)
+  dX = dropout::backward(dout, X, p, mask)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, mask] = dropout::forward(X, p, seed)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      [outph, mask] = dropout::forward(X, p, seed)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
+l1_loss = function() {
+  /*
+   * Gradient check for the L1 loss function.
+   */
+  print("Grad checking the L1 loss function.")
+
+  # Generate data
+  N = 3 # num examples
+  D = 2 # num targets
+  pred = rand(rows=N, cols=D)
+  y = rand(rows=N, cols=D)
+
+  # Compute analytical gradient
+  dpred = l1_loss::backward(pred, y)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(pred)) {
+    for (j in 1:ncol(pred)) {
+      # Compute numerical derivative
+      old = as.scalar(pred[i,j])
+      pred[i,j] = old - h
+      lossmh = l1_loss::forward(pred, y)
+      pred[i,j] = old + h
+      lossph = l1_loss::forward(pred, y)
+      pred[i,j] = old  # reset
+      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+    }
+  }
+}
+
+l1_reg = function() {
+  /*
+   * Gradient check for the L1 regularization function.
+   */
+  print("Grad checking the L1 regularization function.")
+
+  # Generate data
+  D = 5 # num features
+  M = 3 # num neurons
+  lambda = 0.01
+  W = rand(rows=D, cols=M)
+
+  # Compute analytical gradient
+  dW = l1_reg::backward(W, lambda)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      reg_lossmh = l1_reg::forward(W, lambda)
+      W[i,j] = old + h
+      reg_lossph = l1_reg::forward(W, lambda)
+      W[i,j] = old  # reset W[i,j]
+      dW_num = (reg_lossph-reg_lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
+                                                  reg_lossph, reg_lossmh)
+    }
+  }
+}
+
+l2_loss = function() {
+  /*
+   * Gradient check for the L2 loss function.
+   */
+  print("Grad checking the L2 loss function.")
+
+  # Generate data
+  N = 3 # num examples
+  D = 2 # num targets
+  pred = rand(rows=N, cols=D)
+  y = rand(rows=N, cols=D)
+
+  # Compute analytical gradient
+  dpred = l2_loss::backward(pred, y)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(pred)) {
+    for (j in 1:ncol(pred)) {
+      # Compute numerical derivative
+      old = as.scalar(pred[i,j])
+      pred[i,j] = old - h
+      lossmh = l2_loss::forward(pred, y)
+      pred[i,j] = old + h
+      lossph = l2_loss::forward(pred, y)
+      pred[i,j] = old  # reset
+      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+    }
+  }
+}
+
+l2_reg = function() {
+  /*
+   * Gradient check for the L2 regularization function.
+   */
+  print("Grad checking the L2 regularization function.")
+
+  # Generate data
+  D = 5 # num features
+  M = 3 # num neurons
+  lambda = 0.01
+  W = rand(rows=D, cols=M)
+
+  # Compute analytical gradient
+  dW = l2_reg::backward(W, lambda)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      reg_lossmh = l2_reg::forward(W, lambda)
+      W[i,j] = old + h
+      reg_lossph = l2_reg::forward(W, lambda)
+      W[i,j] = old  # reset W[i,j]
+      dW_num = (reg_lossph-reg_lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
+                                                  reg_lossph, reg_lossmh)
+    }
+  }
+}
+
+log_loss = function() {
+  /*
+   * Gradient check for the log loss function.
+   */
+  print("Grad checking the log loss function.")
+
+  # Generate data
+  N = 20 # num examples
+  D = 1 # num targets
+  pred = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
+  y = round(rand(rows=N, cols=D, min=0, max=1, pdf="uniform"))
+
+  # Compute analytical gradient
+  dpred = log_loss::backward(pred, y)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(pred)) {
+    for (j in 1:ncol(pred)) {
+      # Compute numerical derivative
+      old = as.scalar(pred[i,j])
+      pred[i,j] = old - h
+      lossmh = log_loss::forward(pred, y)
+      pred[i,j] = old + h
+      lossph = log_loss::forward(pred, y)
+      pred[i,j] = old  # reset
+      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+    }
+  }
+}
+
+lstm = function() {
+  /*
+   * Gradient check for the LSTM layer.
+   */
+  print("Grad checking the LSTM layer with L2 loss.")
+
+  # Generate data
+  N = 3  # num examples
+  D = 10  # num features
+  T = 15  # num timesteps (sequence length)
+  M = 5 # num neurons
+  return_seq = TRUE
+  X = rand(rows=N, cols=T*D)
+  y = rand(rows=N, cols=T*M)
+  yc = rand(rows=N, cols=M)
+  out0 = rand(rows=N, cols=M)
+  c0 = rand(rows=N, cols=M)
+  [W, b, dummy, dummy2] = lstm::init(N, D, M)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+  dout = l2_loss::backward(out, y)
+  dc = l2_loss::backward(c, yc)
+  [dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, return_seq, out0, c0,
+                                            cache_out, cache_c, cache_ifog)
+
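+  # The numerical checks below use the combined loss
+  # l2_loss::forward(out, y) + l2_loss::forward(c, yc), matching the two
+  # upstream gradients (dout, dc) passed to lstm::backward above.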
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outmh = l2_loss::forward(outmh, y)
+      loss_cmh = l2_loss::forward(cmh, yc)
+      lossmh = loss_outmh + loss_cmh
+      X[i,j] = old + h
+      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outph = l2_loss::forward(outph, y)
+      loss_cph = l2_loss::forward(cph, yc)
+      lossph = loss_outph + loss_cph
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W.")
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outmh = l2_loss::forward(outmh, y)
+      loss_cmh = l2_loss::forward(cmh, yc)
+      lossmh = loss_outmh + loss_cmh
+      W[i,j] = old + h
+      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outph = l2_loss::forward(outph, y)
+      loss_cph = l2_loss::forward(cph, yc)
+      lossph = loss_outph + loss_cph
+      W[i,j] = old  # reset
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b.")
+  for (i in 1:nrow(b)) {
+    for (j in 1:ncol(b)) {
+      # Compute numerical derivative
+      old = as.scalar(b[i,j])
+      b[i,j] = old - h
+      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outmh = l2_loss::forward(outmh, y)
+      loss_cmh = l2_loss::forward(cmh, yc)
+      lossmh = loss_outmh + loss_cmh
+      b[i,j] = old + h
+      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outph = l2_loss::forward(outph, y)
+      loss_cph = l2_loss::forward(cph, yc)
+      lossph = loss_outph + loss_cph
+      b[i,j] = old  # reset
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking out0.")
+  for (i in 1:nrow(out0)) {
+    for (j in 1:ncol(out0)) {
+      # Compute numerical derivative
+      old = as.scalar(out0[i,j])
+      out0[i,j] = old - h
+      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outmh = l2_loss::forward(outmh, y)
+      loss_cmh = l2_loss::forward(cmh, yc)
+      lossmh = loss_outmh + loss_cmh
+      out0[i,j] = old + h
+      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outph = l2_loss::forward(outph, y)
+      loss_cph = l2_loss::forward(cph, yc)
+      lossph = loss_outph + loss_cph
+      out0[i,j] = old  # reset
+      dout0_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking c0.")
+  for (i in 1:nrow(c0)) {
+    for (j in 1:ncol(c0)) {
+      # Compute numerical derivative
+      old = as.scalar(c0[i,j])
+      c0[i,j] = old - h
+      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outmh = l2_loss::forward(outmh, y)
+      loss_cmh = l2_loss::forward(cmh, yc)
+      lossmh = loss_outmh + loss_cmh
+      c0[i,j] = old + h
+      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outph = l2_loss::forward(outph, y)
+      loss_cph = l2_loss::forward(cph, yc)
+      lossph = loss_outph + loss_cph
+      c0[i,j] = old  # reset
+      dc0_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh)
+    }
+  }
+}
+
+max_pool2d = function() {
+  /*
+   * Gradient check for the 2D max pooling layer.
+   */
+  print("Grad checking the 2D max pooling layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 4  # input height
+  Win = 4  # input width
+  Hf = 2  # pool filter height
+  Wf = 2  # pool filter width
+  stride = 2
+  X = rand(rows=N, cols=C*Hin*Win)
+
+  for (pad in 0:1) {
+    print(" - Grad checking w/ pad="+pad+".")
+    Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
+    Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
+    y = rand(rows=N, cols=C*Hout*Wout)
+
+    # Compute analytical gradients of loss wrt parameters
+    [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+    dout = l2_loss::backward(out, y)
+    dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+
+    # Grad check
+    h = 1e-5
+    for (i in 1:nrow(X)) {
+      for (j in 1:ncol(X)) {
+        # Compute numerical derivative
+        old = as.scalar(X[i,j])
+        X[i,j] = old - h
+        [outmh, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+        lossmh = l2_loss::forward(outmh, y)
+        X[i,j] = old + h
+        [outph, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+        lossph = l2_loss::forward(outph, y)
+        X[i,j] = old  # reset
+        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      }
+    }
+  }
+}
+
+max_pool2d_builtin = function() {
+  /*
+   * Gradient check for the 2D max pooling layer.
+   */
+  print("Grad checking the built-in 2D max pooling layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 4  # input height
+  Win = 4  # input width
+  Hf = 2  # pool filter height
+  Wf = 2  # pool filter width
+  stride = 2
+  X = rand(rows=N, cols=C*Hin*Win)
+
+  for (pad in 0:1) {
+    print(" - Grad checking w/ pad="+pad+".")
+    Hout = as.integer(floor((Hin + 2 * pad - Hf) / stride + 1))
+    Wout = as.integer(floor((Win + 2 * pad - Wf) / stride + 1))
+    y = rand(rows=N, cols=C*Hout*Wout)
+
+    # Compute analytical gradients of loss wrt parameters
+    [out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
+    dout = l2_loss::backward(out, y)
+    dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
+                                      pad, pad)
+
+    # Grad check
+    h = 1e-5
+    for (i in 1:nrow(X)) {
+      for (j in 1:ncol(X)) {
+        # Compute numerical derivative
+        old = as.scalar(X[i,j])
+        X[i,j] = old - h
+        [outmh, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+                                                          pad, pad)
+        lossmh = l2_loss::forward(outmh, y)
+        X[i,j] = old + h
+        [outph, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+                                                          pad, pad)
+        lossph = l2_loss::forward(outph, y)
+        X[i,j] = old  # reset
+        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      }
+    }
+  }
+}
+
+max_pool2d_simple = function() {
+  /*
+   * Gradient check for the simple reference 2D max pooling layer.
+   */
+  print("Grad checking the simple reference 2D max pooling layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 4  # input height
+  Win = 4  # input width
+  Hf = 2  # pool filter height
+  Wf = 2  # pool filter width
+  stride = 2
+  X = rand(rows=N, cols=C*Hin*Win)
+
+  for (pad in 0:1) {
+    print(" - Grad checking w/ pad="+pad+".")
+    Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
+    Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
+    y = rand(rows=N, cols=C*Hout*Wout)
+
+    # Compute analytical gradients of loss wrt parameters
+    [out, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+    dout = l2_loss::backward(out, y)
+    dX = max_pool2d_simple::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
+                                     pad, pad)
+
+    # Grad check
+    h = 1e-5
+    for (i in 1:nrow(X)) {
+      for (j in 1:ncol(X)) {
+        # Compute numerical derivative
+        old = as.scalar(X[i,j])
+        X[i,j] = old - h
+        [outmh, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+                                                         pad, pad)
+        lossmh = l2_loss::forward(outmh, y)
+        X[i,j] = old + h
+        [outph, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+                                                         pad, pad)
+        lossph = l2_loss::forward(outph, y)
+        X[i,j] = old  # reset
+        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      }
+    }
+  }
+}
+
+relu = function() {
+  /*
+   * Gradient check for the ReLU nonlinearity layer.
+   *
+   * NOTE: This could result in a false-negative in which the test
+   * fails due to a kink being crossed in the nonlinearity.  This
+   * occurs when the tests, f(x-h) and f(x+h), end up on opposite
+   * sides of the zero threshold of max(0, fx).  For now, just run
+   * the tests again.  In the future, we can explicitly check for
+   * this and rerun the test automatically.
+   */
+  print("Grad checking the ReLU nonlinearity layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  M = 10 # num neurons
+  X = rand(rows=N, cols=M, min=-5, max=5)
+  y = rand(rows=N, cols=M)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = relu::forward(X)
+  dout = l2_loss::backward(out, y)
+  dX = relu::backward(dout, X)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = relu::forward(X)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = relu::forward(X)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
+rnn = function() {
+  /*
+   * Gradient check for the simple RNN layer.
+   */
+  print("Grad checking the simple RNN layer with L2 loss.")
+
+  # Generate data
+  N = 3  # num examples
+  D = 10  # num features
+  T = 15  # num timesteps (sequence length)
+  M = 5 # num neurons
+  return_seq = TRUE
+  X = rand(rows=N, cols=T*D)
+  y = rand(rows=N, cols=T*M)
+  out0 = rand(rows=N, cols=M)
+  [W, b, dummy] = rnn::init(N, D, M)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+  dout = l2_loss::backward(out, y)
+  [dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, return_seq, out0, cache_out)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W.")
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+      lossmh = l2_loss::forward(outmh, y)
+      W[i,j] = old + h
+      [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+      lossph = l2_loss::forward(outph, y)
+      W[i,j] = old  # reset
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b.")
+  for (i in 1:nrow(b)) {
+    for (j in 1:ncol(b)) {
+      # Compute numerical derivative
+      old = as.scalar(b[i,j])
+      b[i,j] = old - h
+      [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+      lossmh = l2_loss::forward(outmh, y)
+      b[i,j] = old + h
+      [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+      lossph = l2_loss::forward(outph, y)
+      b[i,j] = old  # reset
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking out0.")
+  for (i in 1:nrow(out0)) {
+    for (j in 1:ncol(out0)) {
+      # Compute numerical derivative
+      old = as.scalar(out0[i,j])
+      out0[i,j] = old - h
+      [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+      lossmh = l2_loss::forward(outmh, y)
+      out0[i,j] = old + h
+      [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+      lossph = l2_loss::forward(outph, y)
+      out0[i,j] = old  # reset
+      dout0_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
+    }
+  }
+}
+
+scale_shift1d = function() {
+  /*
+   * Gradient check for the 1D scale & shift layer.
+   */
+  print("Grad checking the 1D scale & shift layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  D = 100 # num features
+  X = rand(rows=N, cols=D)
+  y = rand(rows=N, cols=D)
+  [gamma, beta] = scale_shift1d::init(D)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = scale_shift1d::forward(X, gamma, beta)
+  dout = l2_loss::backward(out, y)
+  [dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = scale_shift1d::forward(X, gamma, beta)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = scale_shift1d::forward(X, gamma, beta)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking gamma.")
+  for (i in 1:nrow(gamma)) {
+    for (j in 1:ncol(gamma)) {
+      # Compute numerical derivative
+      old = as.scalar(gamma[i,j])
+      gamma[i,j] = old - h
+      outmh = scale_shift1d::forward(X, gamma, beta)
+      lossmh = l2_loss::forward(outmh, y)
+      gamma[i,j] = old + h
+      outph = scale_shift1d::forward(X, gamma, beta)
+      lossph = l2_loss::forward(outph, y)
+      gamma[i,j] = old  # reset
+      dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+                                                  lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking beta.")
+  for (i in 1:nrow(beta)) {
+    for (j in 1:ncol(beta)) {
+      # Compute numerical derivative
+      old = as.scalar(beta[i,j])
+      beta[i,j] = old - h
+      outmh = scale_shift1d::forward(X, gamma, beta)
+      lossmh = l2_loss::forward(outmh, y)
+      beta[i,j] = old + h
+      outph = scale_shift1d::forward(X, gamma, beta)
+      lossph = l2_loss::forward(outph, y)
+      beta[i,j] = old  # reset
+      dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+                                                  lossph, lossmh)
+    }
+  }
+}
+
+scale_shift2d = function() {
+  /*
+   * Gradient check for the 2D scale & shift layer.
+   */
+  print("Grad checking the 2D scale & shift layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  C = 2  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  X = rand(rows=N, cols=C*Hin*Win)
+  y = rand(rows=N, cols=C*Hin*Win)
+  [gamma, beta] = scale_shift2d::init(C)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+  dout = l2_loss::backward(out, y)
+  [dX, dgamma, dbeta] = scale_shift2d::backward(dout, out, X, gamma, beta, C, Hin, Win)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking gamma.")
+  for (i in 1:nrow(gamma)) {
+    for (j in 1:ncol(gamma)) {
+      # Compute numerical derivative
+      old = as.scalar(gamma[i,j])
+      gamma[i,j] = old - h
+      outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+      lossmh = l2_loss::forward(outmh, y)
+      gamma[i,j] = old + h
+      outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+      lossph = l2_loss::forward(outph, y)
+      gamma[i,j] = old  # reset
+      dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+                                                  lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking beta.")
+  for (i in 1:nrow(beta)) {
+    for (j in 1:ncol(beta)) {
+      # Compute numerical derivative
+      old = as.scalar(beta[i,j])
+      beta[i,j] = old - h
+      outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+      lossmh = l2_loss::forward(outmh, y)
+      beta[i,j] = old + h
+      outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+      lossph = l2_loss::forward(outph, y)
+      beta[i,j] = old  # reset
+      dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+                                                  lossph, lossmh)
+    }
+  }
+}
+
+sigmoid = function() {
+  /*
+   * Gradient check for the sigmoid nonlinearity layer.
+   */
+  print("Grad checking the sigmoid nonlinearity layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  M = 10 # num neurons
+  X = rand(rows=N, cols=M)
+  y = rand(rows=N, cols=M)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = sigmoid::forward(X)
+  dout = l2_loss::backward(out, y)
+  dX = sigmoid::backward(dout, X)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = sigmoid::forward(X)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = sigmoid::forward(X)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
+softmax = function() {
+  /*
+   * Gradient check for the softmax layer.
+   */
+  print("Grad checking the softmax layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  D = 10 # num classes
+  X = rand(rows=N, cols=D)
+  y = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
+  y = y / rowSums(y)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = softmax::forward(X)
+  dout = l2_loss::backward(out, y)
+  dX = softmax::backward(dout, X)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = softmax::forward(X)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = softmax::forward(X)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
+tanh = function() {
+  /*
+   * Gradient check for the hyperbolic tangent (tanh) nonlinearity
+   * layer.
+   */
+  print("Grad checking the tanh nonlinearity layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  M = 10 # num neurons
+  X = rand(rows=N, cols=M)
+  y = rand(rows=N, cols=M)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = tanh::forward(X)
+  dout = l2_loss::backward(out, y)
+  dX = tanh::backward(dout, X)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = tanh::forward(X)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = tanh::forward(X)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
+two_layer_affine_l2_net = function() {
+  /*
+   * Gradient check for a two-layer, fully-connected, feed-forward
+   * network with ReLU nonlinearity and L2 loss.
+   *
+   * NOTE: This could result in a false-negative in which the test
+   * fails due to a kink being crossed in the ReLU nonlinearity.  This
+   * occurs when the tests, f(x-h) and f(x+h), end up on opposite
+   * sides of the zero threshold of max(0, f(x)).  For now, just run
+   * the tests again.  In the future, we can explicitly check for
+   * this and rerun the test automatically.
+   */
+  print("Grad checking a two-layer, fully-connected, feed-forward network with a ReLU " +
+        "nonlinearity, and an L2 loss function.")
+
+  # Generate input data
+  N = 1000 # num examples
+  D = 100 # num features
+  yD = 5 # num targets
+  X = rand(rows=N, cols=D, pdf="normal")
+  y = rand(rows=N, cols=yD)
+
+  # Create 2-layer, fully-connected network
+  M = 10 # number of hidden neurons
+  [W1, b1] = affine::init(D, M)
+  [W2, b2] = affine::init(M, yD)
+
+  # Optimize for a short "burn-in" period so that the parameters move into
+  # a characteristic operating regime, which can unmask any real issues.
+  print(" - Burn-in:")
+  lr = 0.0001
+  decay = 0.99
+  for(i in 1:5) {
+    # Compute forward and backward passes of net
+    [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
+    print("   - L2 loss: " + loss)
+
+    # Optimize with basic SGD
+    W1 = W1 - lr * dW1
+    b1 = b1 - lr * db1
+    W2 = W2 - lr * dW2
+    b2 = b2 - lr * db2
+    lr = lr * decay
+  }
+
+  # Compute analytical gradients
+  [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:2) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old_x = as.scalar(X[i,j])
+      X[i,j] = old_x - h
+      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      X[i,j] = old_x + h
+      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      X[i,j] = old_x  # reset X[i,j]
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W1.")
+  for (i in 1:nrow(W1)) {
+    for (j in 1:ncol(W1)) {
+      # Compute numerical derivative
+      old_w = as.scalar(W1[i,j])
+      W1[i,j] = old_w - h
+      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      W1[i,j] = old_w + h
+      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      W1[i,j] = old_w  # reset W1[i,j]
+      dWij_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW1[i,j]), dWij_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W2.")
+  for (i in 1:nrow(W2)) {
+    for (j in 1:ncol(W2)) {
+      # Compute numerical derivative
+      old_w = as.scalar(W2[i,j])
+      W2[i,j] = old_w - h
+      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      W2[i,j] = old_w + h
+      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      W2[i,j] = old_w  # reset W2[i,j]
+      dWij_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW2[i,j]), dWij_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b1.")
+  for (i in 1:nrow(b1)) {
+    for (j in 1:ncol(b1)) {
+      # Compute numerical derivative
+      old_b = as.scalar(b1[i,j])
+      b1[i,j] = old_b - h
+      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      b1[i,j] = old_b + h
+      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      b1[i,j] = old_b  # reset b1[i,j]
+      dbij_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(db1[i,j]), dbij_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b2.")
+  for (i in 1:nrow(b2)) {
+    for (j in 1:ncol(b2)) {
+      # Compute numerical derivative
+      old_b = as.scalar(b2[i,j])
+      b2[i,j] = old_b - h
+      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      b2[i,j] = old_b + h
+      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      b2[i,j] = old_b  # reset b2[i,j]
+      dbij_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(db2[i,j]), dbij_num, lossph, lossmh)
+    }
+  }
+}
+
+/*
+ * Test network with forward/backward functions.
+ */
+two_layer_affine_l2_net_run = function(matrix[double] X, matrix[double] y,
+                                       matrix[double] W1, matrix[double] b1,
+                                       matrix[double] W2, matrix[double] b2)
+    return (matrix[double] pred, double loss,
+            matrix[double] dX,
+            matrix[double] dW1, matrix[double] db1,
+            matrix[double] dW2, matrix[double] db2) {
+  # Compute forward pass
+  [loss, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+
+  # Compute backward pass
+  [dX, dpred, daout, dhout, dW1, db1, dW2, db2] =
+      two_layer_affine_l2_net_backward(X, y, pred, aout, hout, W1, b1, W2, b2)
+}
+
+two_layer_affine_l2_net_forward = function(matrix[double] X, matrix[double] y,
+                                           matrix[double] W1, matrix[double] b1,
+                                           matrix[double] W2, matrix[double] b2)
+    return (double loss, matrix[double] pred, matrix[double] aout, matrix[double] hout) {
+  # Compute forward pass
+  hout = affine::forward(X, W1, b1)
+  aout = relu::forward(hout)
+  pred = affine::forward(aout, W2, b2)
+
+  # Compute loss
+  loss = l2_loss::forward(pred, y)
+}
+
+two_layer_affine_l2_net_backward = function(matrix[double] X, matrix[double] y, matrix[double] pred,
+                                            matrix[double] aout, matrix[double] hout,
+                                            matrix[double] W1, matrix[double] b1,
+                                            matrix[double] W2, matrix[double] b2)
+    return (matrix[double] dX, matrix[double] dpred,
+            matrix[double] daout, matrix[double] dhout,
+            matrix[double] dW1, matrix[double] db1, matrix[double] dW2, matrix[double] db2) {
+  # Compute backward pass
+  dpred = l2_loss::backward(pred, y)
+  [daout, dW2, db2] = affine::backward(dpred, aout, W2, b2)
+  dhout = relu::backward(daout, hout)
+  [dX, dW1, db1] = affine::backward(dhout, X, W1, b1)
+}
+
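
Every check in grad_check.dml follows the same centered-difference recipe: perturb one element
by -h and +h, recompute the loss, and compare (loss(x+h) - loss(x-h)) / (2h) against the
analytical gradient via a relative error.  The following stand-alone sketch condenses that
recipe to a single sigmoid unit under an L2 loss; the values are illustrative and it does not
depend on the test_util helpers in this patch.

  # centered-difference check of d/dx [ 0.5*(sigmoid(x) - y)^2 ]
  x = 0.3
  y = 0.8
  h = 1e-5
  out = 1 / (1+exp(-x))
  dx = out * (1-out) * (out-y)                 # analytical gradient
  lossph = 0.5 * (1/(1+exp(-(x+h))) - y)^2     # loss at x+h
  lossmh = 0.5 * (1/(1+exp(-(x-h))) - y)^2     # loss at x-h
  dx_num = (lossph-lossmh) / (2*h)             # numerical gradient
  rel_error = abs(dx-dx_num) / max(abs(dx), abs(dx_num))
  print("Relative error: " + rel_error)        # very small: centered differences are O(h^2)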

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/max_pool2d_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/max_pool2d_simple.dml b/scripts/nn/test/max_pool2d_simple.dml
new file mode 100644
index 0000000..188bd6e
--- /dev/null
+++ b/scripts/nn/test/max_pool2d_simple.dml
@@ -0,0 +1,172 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Max Pooling layer.
+ *
+ * This implementation is intended to be a simple, reference version.
+ */
+
+forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * This implementation is intended to be a simple, reference version.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+
+  # Create output volume
+  out = matrix(0, rows=N, cols=C*Hout*Wout)
+
+  # Max pooling
+  parfor (n in 1:N, check=0) {  # all examples
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+
+    # Pad image
+    pad_value = -1/0  # -infinity
+    Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # -inf padding
+    parfor (c in 1:C) {
+      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
+      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+    }
+    img = Xn_padded  # shape (C, (Hin+2*padh)*(Win+2*padw))
+
+    parfor (c in 1:C, check=0) {  # all channels
+      img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      parfor (hout in 1:Hout, check=0) {  # all output rows
+        hin = (hout-1) * strideh + 1
+        parfor (wout in 1:Wout, check=0) {  # all output columns
+          win = (wout-1) * stridew + 1
+          out[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout] = max(img_slice[hin:hin+Hf-1,
+                                                               win:win+Wf-1])
+        }
+      }
+    }
+  }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   */
+  N = nrow(X)
+
+  # Create gradient volume
+  dX = matrix(0, rows=N, cols=C*Hin*Win)
+
+  # Gradient of max pooling
+  for (n in 1:N) {  # all examples
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+
+    # Pad image
+    pad_value = -1/0  # -infinity
+    Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # -inf padding
+    parfor (c in 1:C) {
+      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
+      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+    }
+    img = Xn_padded
+
+    dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
+    for (c in 1:C) {  # all channels
+      img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
+      for (hout in 1:Hout, check=0) {  # all output rows
+        hin = (hout-1) * strideh + 1
+        for (wout in 1:Wout) {  # all output columns
+          win = (wout-1) * stridew + 1
+          img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
+          max_val_ind = img_slice_patch == max(img_slice_patch)  # max value indicator matrix
+          # gradient passes through only for the max value(s) in this patch
+          dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
+          dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
+                                                   + dimg_slice_patch
+        }
+      }
+      dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
+    }
+
+    # Unpad derivs on input
+    dXn = matrix(0, rows=C, cols=Hin*Win)
+    parfor (c in 1:C, check=0) {
+      dXn_padded_slice = matrix(dimg[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
+      dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
+      dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
+    }
+    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
+  }
+}
+
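
As a quick sanity check of the reference implementation above, a 4x4 single-channel input pooled
with a 2x2 window, stride 2, and no padding should yield a 2x2 output holding the maximum of each
block.  A minimal driver script (values are illustrative; the source path follows the new file
location in this diff):

  source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
  N = 1
  C = 1
  Hin = 4
  Win = 4
  X = matrix(seq(1, N*C*Hin*Win), rows=N, cols=C*Hin*Win)  # 1..16, i.e. a row-major 4x4 image
  [out, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
  print("Hout=" + Hout + ", Wout=" + Wout)  # 2, 2
  print(toString(out))                      # 6 8 14 16: the max of each 2x2 block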

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/run_tests.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/run_tests.dml b/scripts/nn/test/run_tests.dml
new file mode 100644
index 0000000..d8173a9
--- /dev/null
+++ b/scripts/nn/test/run_tests.dml
@@ -0,0 +1,90 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Script to run tests.
+ */
+source("nn/test/grad_check.dml") as grad_check
+source("nn/test/test.dml") as test
+
+print("")
+print("Starting grad checks.")
+print("---")
+
+# Loss & loss-related functions
+grad_check::cross_entropy_loss()
+grad_check::l1_loss()
+grad_check::l1_reg()
+grad_check::l2_loss()
+grad_check::l2_reg()
+grad_check::log_loss()
+print("")
+
+# Core layers
+grad_check::affine()
+grad_check::batch_norm1d()
+grad_check::batch_norm2d()
+grad_check::conv2d()
+grad_check::conv2d_builtin()
+grad_check::conv2d_simple()
+grad_check::dropout()
+grad_check::lstm()
+grad_check::max_pool2d()
+grad_check::max_pool2d_builtin()
+grad_check::max_pool2d_simple()
+grad_check::relu()
+grad_check::rnn()
+grad_check::scale_shift1d()
+grad_check::scale_shift2d()
+grad_check::sigmoid()
+grad_check::softmax()
+grad_check::tanh()
+print("")
+
+# Example model
+grad_check::two_layer_affine_l2_net()
+print("")
+
+print("---")
+print("Grad checks complete -- look for any ERRORs or WARNINGs.")
+print("If any tests involving ReLUs failed, try a few times " +
+      "to ensure that they were not false negatives due to " +
+      "kinks being crossed.")
+print("")
+
+print("")
+print("Starting other tests.")
+print("---")
+
+test::batch_norm1d()
+test::batch_norm2d()
+test::conv2d()
+test::cross_entropy_loss()
+test::im2col()
+test::max_pool2d()
+test::padding()
+test::tanh()
+
+print("---")
+print("Other tests complete -- look for any ERRORs or WARNINGs.")
+print("")
+print("")
+
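
The script above runs the entire suite.  While iterating on a single layer it can be handier to
source the test modules directly and call only the check of interest, e.g.:

  source("nn/test/grad_check.dml") as grad_check
  source("nn/test/test.dml") as test
  grad_check::max_pool2d_simple()  # gradient check for one layer
  test::max_pool2d()               # correctness test for the pooling layers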


[05/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn` library to `scripts/nn`

Posted by du...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/rnn.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/rnn.dml b/scripts/staging/SystemML-NN/nn/layers/rnn.dml
deleted file mode 100644
index 3c6faae..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/rnn.dml
+++ /dev/null
@@ -1,183 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Simple (Vanilla) RNN layer.
- */
-source("nn/layers/tanh.dml") as tanh
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
-                   boolean return_sequences, matrix[double] out0)
-    return (matrix[double] out, matrix[double] cache_out) {
-  /*
-   * Computes the forward pass for a simple RNN layer with M neurons.
-   * The input data has N sequences of T examples, each with D features.
-   *
-   * In a simple RNN, the output of the previous timestep is fed back
-   * in as an additional input at the current timestep.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, T*D).
-   *  - W: Weights, of shape (D+M, M).
-   *  - b: Biases, of shape (1, M).
-   *  - T: Length of example sequences (number of timesteps).
-   *  - D: Dimensionality of the input features (number of features).
-   *  - return_sequences: Whether to return `out` at all timesteps,
-   *      or just for the final timestep.
-   *  - out0: Output matrix from previous timestep, of shape (N, M).
-   *      Note: This is *optional* and could just be an empty matrix.
-   *
-   * Outputs:
-   *  - out: If `return_sequences` is True, outputs for all timesteps,
-   *      of shape (N, T*M).  Else, outputs for the final timestep, of
-   *      shape (N, M).
-   *  - cache_out: Cache of outputs, of shape (T, N*M).
-   *      Note: This is used for performance during training.
-   */
-  N = nrow(X)
-  M = ncol(W)
-  out_prev = out0
-  if (return_sequences) {
-    out = matrix(0, rows=N, cols=T*M)
-  }
-  else {
-    out = matrix(0, rows=N, cols=M)
-  }
-  # caches to be used during the backward pass for performance
-  cache_out = matrix(0, rows=T, cols=N*M)
-
-  for (t in 1:T) {  # each timestep
-    X_t = X[,(t-1)*D+1:t*D]  # shape (N, D)
-    input = cbind(X_t, out_prev)  # shape (N, D+M)
-    out_t = tanh::forward(input %*% W + b)  # shape (N, M)
-    # store
-    if (return_sequences) {
-      out[,(t-1)*M+1:t*M] = out_t
-    }
-    else {
-      out = out_t
-    }
-    out_prev = out_t
-    cache_out[t,] = matrix(out_t, rows=1, cols=N*M)  # reshape
-  }
-}
-
-backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, matrix[double] b,
-                    int T, int D, boolean given_sequences, matrix[double] out0,
-                    matrix[double] cache_out)
-    return (matrix[double] dX, matrix[double] dW, matrix[double] db, matrix[double] dout0) {
-  /*
-   * Computes the backward pass for a simple RNN layer with M neurons.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream.  If `given_sequences`
-   *      is True, contains gradients on outputs for all timesteps,
-   *      of shape (N, T*M).  Else, contains gradient on output for
-   *      the final timestep, of shape (N, M).
-   *  - X: Inputs, of shape (N, T*D).
-   *  - W: Weights, of shape (D+M, M).
-   *  - b: Biases, of shape (1, M).
-   *  - T: Length of example sequences (number of timesteps).
-   *  - D: Dimensionality of the input features (number of features).
-   *  - given_sequences: Whether `dout` is for all timesteps,
-   *      or just for the final timestep.  This is based on whether
-   *      `return_sequences` was true in the forward pass.
-   *  - out0: Output matrix from previous timestep, of shape (N, M).
-   *      Note: This is *optional* and could just be an empty matrix.
-   *  - cache_out: Cache of outputs, of shape (T, N*M).
-   *      Note: This is used for performance during training.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, T*D).
- *  - dW: Gradient wrt `W`, of shape (D+M, M).
- *  - db: Gradient wrt `b`, of shape (1, M).
-   *  - dout0: Gradient wrt `out0`, of shape (N, M).
-   */
-  N = nrow(X)
-  M = ncol(W)
-  dX = matrix(0, rows=N, cols=T*D)
-  dW = matrix(0, rows=D+M, cols=M)
-  db = matrix(0, rows=1, cols=M)
-  dout0 = matrix(0, rows=N, cols=M)
-  if (!given_sequences) {
-    # only given dout for output at final timestep, so prepend empty douts for all other timesteps
-    dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout)  # shape (N, T*M)
-  }
-
-  t = T
-  for (iter in 1:T) {  # each timestep in reverse order
-    X_t = X[,(t-1)*D+1:t*D]  # shape (N, D)
-    dout_t = dout[,(t-1)*M+1:t*M]  # shape (N, M)
-    out_t = matrix(cache_out[t,], rows=N, cols=M)  # shape (N, M)
-    if (t == 1) {
-      out_prev = out0  # shape (N, M)
-    }
-    else {
-      out_prev = matrix(cache_out[t-1,], rows=N, cols=M)  # shape (N, M)
-    }
-    input = cbind(X_t, out_prev)  # shape (N, D+M)
-    dout_t_raw = (1-out_t^2) * dout_t  # into tanh, shape (N, M)
-    dW = dW + t(input) %*% dout_t_raw  # shape (D+M, M)
-    db = db + colSums(dout_t_raw)  # shape (1, M)
-    dinput = dout_t_raw %*% t(W)  # shape (N, D+M)
-    dX[,(t-1)*D+1:t*D] = dinput[,1:D]
-    dout_prev = dinput[,D+1:D+M]  # shape (N, M)
-    if (t == 1) {
-      dout0 = dout_prev  # shape (N, M)
-    }
-    else {
-      dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev  # shape (N, M)
-    }
-    t = t - 1
-  }
-}
-
-init = function(int N, int D, int M)
-    return (matrix[double] W, matrix[double] b, matrix[double] out0) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * We use the Glorot uniform heuristic which limits the magnification
-   * of inputs/gradients during forward/backward passes by scaling
-   * uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
-   *  - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
-   *
-   * Inputs:
-   *  - N: Number of examples in batch.
-   *  - D: Dimensionality of the input features (number of features).
-   *  - M: Number of neurons in this layer.
-   *
-   * Outputs:
-   *  - W: Weights, of shape (D+M, M).
-   *  - b: Biases, of shape (1, M).
-   *  - out0: Empty previous timestep output matrix, of shape (N, M).
-   */
-  fan_in = D+M
-  fan_out = M
-  scale = sqrt(6/(fan_in+fan_out))
-  W = rand(rows=D+M, cols=M, min=-scale, max=scale, pdf="uniform")
-  b = matrix(0, rows=1, cols=M)
-  out0 = matrix(0, rows=N, cols=M)
-}
-
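
The forward recurrence in the file removed above (relocated to scripts/nn by this commit) is
out_t = tanh(cbind(X_t, out_prev) %*% W + b).  One unrolled timestep with illustrative sizes,
written without the layer wrappers:

  D = 3  # input features
  M = 2  # neurons
  X_t = rand(rows=1, cols=D)                    # input at timestep t
  out_prev = matrix(0, rows=1, cols=M)          # previous output (out0 at t=1)
  W = rand(rows=D+M, cols=M, min=-0.1, max=0.1)
  b = matrix(0, rows=1, cols=M)
  raw = cbind(X_t, out_prev) %*% W + b          # shape (1, M)
  out_t = (exp(raw) - exp(-raw)) / (exp(raw) + exp(-raw))  # tanh, shape (1, M)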

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml b/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml
deleted file mode 100644
index 7e162a3..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml
+++ /dev/null
@@ -1,95 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 1D Scale & Shift layer.
- */
-
-forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta)
-    return (matrix[double] out) {
-  /*
-   * Computes the forward pass for a 1D scale & shift layer. The input
-   * data has N examples, each with D features.
-   *
-   * A 1D scale & shift layer introduces learnable parameters
-   * (gamma, beta) to scale and shift the input on a per-feature basis.
-   *
-   *   `y = x*gamma + beta`
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, D).
-   *  - gamma: Scale parameters, of shape (1, D).
-   *  - beta: Shift parameters, of shape (1, D).
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, D).
-   */
-  # Scale and shift
-  out = X*gamma + beta  # shape (N, D)
-}
-
-backward = function(matrix[double] dout, matrix[double] out,
-                    matrix[double] X, matrix[double] gamma, matrix[double] beta)
-      return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
-  /*
-   * Computes the backward pass for a 1D scale & shift layer.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of shape (N, D).
-   *  - out: Outputs from the forward pass, of shape (N, D).
-   *  - X: Inputs, of shape (N, D).
-   *  - gamma: Scale parameters, of shape (1, D).
-   *  - beta: Shift parameters, of shape (1, D).
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, D).
- *  - dgamma: Gradient wrt `gamma`, of shape (1, D).
- *  - dbeta: Gradient wrt `beta`, of shape (1, D).
-   *
-   */
-  # Compute gradients during training
-  dgamma = colSums(dout*X)  # shape (1, D)
-  dbeta = colSums(dout)  # shape (1, D)
-  dX = dout * gamma  # shape (N, D)
-}
-
-init = function(int D)
-    return (matrix[double] gamma, matrix[double] beta) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * By default, we initialize to an identity function, with a scale
-   * filler of `1`, and a shift filler of `0`.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - D: Dimensionality of the input features (number of features).
-   *
-   * Outputs:
-   *  - gamma: Scale parameters, of shape (1, D).
-   *  - beta: Shift parameters, of shape (1, D).
-   */
-   gamma = matrix(1, rows=1, cols=D)
-   beta = matrix(0, rows=1, cols=D)
-}
-
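
The per-feature transform y = x*gamma + beta relies on SystemML broadcasting the (1, D) row
vectors gamma and beta across the N rows of X, exactly as in the forward function above.  A small
illustration with the default identity initialization:

  N = 2
  D = 3
  X = rand(rows=N, cols=D)
  gamma = matrix(1, rows=1, cols=D)  # scale of 1
  beta = matrix(0, rows=1, cols=D)   # shift of 0
  out = X*gamma + beta               # identical to X under this initialization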

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml b/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml
deleted file mode 100644
index 79c884a..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml
+++ /dev/null
@@ -1,107 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Scale & Shift layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
-                   int C, int Hin, int Win)
-    return (matrix[double] out) {
-  /*
-   * Computes the forward pass for a 2D scale & shift layer.  The input
-   * data has N examples, each represented as a 3D volume unrolled into
-   * a single vector.
-   *
-   * A 2D scale & shift layer introduces learnable parameters
-   * (gamma, beta) to scale and shift the input on a per-channel basis.
-   *
-   *   `y = x*gamma + beta`
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - gamma: Scale parameters, of shape (C, 1).
-   *  - beta: Shift parameters, of shape (C, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, C*Hin*Win).
-   */
-  # Scale and shift
-  scaled = bias_multiply(X, gamma)  # shape (N, C*Hin*Win)
-  out = bias_add(scaled, beta)  # shape (N, C*Hin*Win)
-}
-
-backward = function(matrix[double] dout, matrix[double] out,
-                    matrix[double] X, matrix[double] gamma, matrix[double] beta,
-                    int C, int Hin, int Win)
-      return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
-  /*
-   * Computes the backward pass for a 2D scale & shift layer.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
-   *  - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
-   *  - X: Input data matrix to the forward pass, of
-   *      shape (N, C*Hin*Win).
-   *  - gamma: Scale parameters, of shape (C, 1).
-   *  - beta: Shift parameters, of shape (C, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
- *  - dgamma: Gradient wrt `gamma`, of shape (C, 1).
- *  - dbeta: Gradient wrt `beta`, of shape (C, 1).
-   *
-   */
-  # Compute gradients during training
-  dgamma = util::channel_sums(dout*X, C, Hin, Win)  # shape (C, 1)
-  dbeta = util::channel_sums(dout, C, Hin, Win)  # shape (C, 1)
-  dX = bias_multiply(dout, gamma)  # shape (N, C*Hin*Win)
-}
-
-init = function(int C)
-    return (matrix[double] gamma, matrix[double] beta) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * By default, we initialize to an identity function, with a scale
-   * filler of `1`, and a shift filler of `0`.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - C: Number of input channels (dimensionality of input depth).
-   *
-   * Outputs:
-   *  - gamma: Scale parameters, of shape (C, 1).
-   *  - beta: Shift parameters, of shape (C, 1).
-   */
-   gamma = matrix(1, rows=C, cols=1)
-   beta = matrix(0, rows=C, cols=1)
-}
-
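
In the 2D variant, gamma and beta have shape (C, 1) and are applied per channel through the
bias_multiply/bias_add built-ins used above.  A small worked example with made-up values:

  N = 1
  C = 2
  Hin = 2
  Win = 2
  X = matrix(seq(1, N*C*Hin*Win), rows=N, cols=C*Hin*Win)  # 1..8
  gamma = matrix("2 3", rows=C, cols=1)  # per-channel scales
  beta = matrix(0, rows=C, cols=1)
  out = bias_add(bias_multiply(X, gamma), beta)
  print(toString(out))  # first 4 entries doubled (channel 1), last 4 tripled (channel 2)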

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml b/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
deleted file mode 100644
index 2d85adc..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
+++ /dev/null
@@ -1,62 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Sigmoid nonlinearity layer.
- */
-
-forward = function(matrix[double] X)
-    return (matrix[double] out) {
-  /*
-   * Computes the forward pass for a sigmoid nonlinearity layer.
-   *
-   *   `sigmoid(x) = 1 / (1 + e^-x)`
-   *
-   * If `X` contains a single feature column, the output of a sigmoid
-   * layer can be interpreted as a predicted probability of a true
-   * class when paired with a log loss function in a binary
-   * classification problem.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (any, any).
-   *
-   * Outputs:
-   *  - out: Outputs, of same shape as `X`.
-   */
-  out = 1 / (1+exp(-X))
-}
-
-backward = function(matrix[double] dout, matrix[double] X)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for a sigmoid nonlinearity layer.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
-   *  - X: Inputs, of shape (any, any).
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of same shape as `X`.
-   */
-  out = 1 / (1+exp(-X))
-  dX = out * (1-out) * dout
-}
-
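
The backward pass above uses the standard identity sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)); a
quick numerical confirmation at a single illustrative point:

  x = 0.5
  h = 1e-5
  s = 1 / (1+exp(-x))
  d_analytic = s * (1-s)
  d_numeric = (1/(1+exp(-(x+h))) - 1/(1+exp(-(x-h)))) / (2*h)
  print("abs diff: " + abs(d_analytic - d_numeric))  # ~0, up to O(h^2)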

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/softmax.dml b/scripts/staging/SystemML-NN/nn/layers/softmax.dml
deleted file mode 100644
index 68a7bc7..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/softmax.dml
+++ /dev/null
@@ -1,87 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Softmax classifier layer.
- */
-
-forward = function(matrix[double] scores)
-    return (matrix[double] probs) {
-  /*
-   * Computes the forward pass for a softmax classifier.  The inputs
-   * are interpreted as unnormalized, log-probabilities for each of
-   * N examples, and the softmax function transforms them to normalized
-   * probabilities.
-   *
-   * This can be interpreted as a generalization of the sigmoid
-   * function to multiple classes.
-   *
-   *   `probs_ij = e^scores_ij / sum(e^scores_i)`
-   *
-   * Inputs:
-   *  - scores: Inputs, of shape (N, D).
-   *
-   * Outputs:
-   *  - probs: Outputs, of shape (N, D).
-   */
-  # For numerical stability, we subtract the max score of an example from all scores for that
-  # example.  This is equivalent to the original formulation:
-  # e^scores_i / sum(e^scores_i) == C*e^scores_i / C*sum(e^scores_i)
- #                              == e^(scores_i+log(C)) / sum(e^(scores_i+log(C)))
- # set log(C) = -max(scores_i):
- #                              == e^(scores_i-max(scores_i)) / sum(e^(scores_i-max(scores_i)))
-  scores = scores - rowMaxs(scores)  # numerical stability
-  unnorm_probs = exp(scores)  # unnormalized probabilities
-  probs = unnorm_probs / rowSums(unnorm_probs)  # normalized probabilities
-}
-
-backward = function(matrix[double] dprobs, matrix[double] scores)
-    return (matrix[double] dscores) {
-  /*
-   * Computes the backward pass for a softmax classifier.
-   *
-   * Note that dscores_ij has multiple source branches:
-   *
-   *   ```
-   *   dprobs_ij/dscores_ij = probs_ij * (1 - probs_ij)
-   *   dprobs_ik/dscores_ij = -probs_ik * probs_ij, for all k != j
-   *
-   *   dloss/dscores_ij =
-   *      (dloss/dprobs_ij * dprobs_ij/dscores_ij)
-   *      + sum_{k!=j}(dloss/dprobs_ik * dprobs_ik/dscores_ij)
-   *   ```
-   *
-   * Inputs:
-   *  - dprobs: Gradient wrt `probs` from upstream, of shape (N, D).
-   *  - scores: Inputs, of shape (N, D).
-   *
-   * Outputs:
-   *  - dscores: Gradient wrt `scores`, of shape (N, D).
-   */
-  scores = scores - rowMaxs(scores)  # numerical stability
-  unnorm_probs = exp(scores)  # unnormalized probabilities
-  probs = unnorm_probs / rowSums(unnorm_probs)  # normalized probabilities
-  # After some cancellation:
-  # dscores = dprobs*probs - probs*rowSums(dprobs*probs)
-  dtemp = dprobs * probs
-  dscores = dtemp - probs*rowSums(dtemp)
-}
-
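
The max-subtraction in both passes above changes nothing mathematically, since multiplying the
numerator and denominator by e^(-max) cancels; it only prevents overflow in exp() for large
scores.  A short check with made-up scores:

  scores = matrix("1 2 3", rows=1, cols=3)
  probs = exp(scores) / rowSums(exp(scores))           # naive
  shifted = scores - rowMaxs(scores)                   # subtract the row max
  probs_stable = exp(shifted) / rowSums(exp(shifted))  # stabilized
  print(toString(probs - probs_stable))                # all ~0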

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/tanh.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/tanh.dml b/scripts/staging/SystemML-NN/nn/layers/tanh.dml
deleted file mode 100644
index d849d70..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/tanh.dml
+++ /dev/null
@@ -1,65 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Tanh nonlinearity layer.
- */
-source("nn/layers/sigmoid.dml") as sigmoid
-
-forward = function(matrix[double] X)
-    return (matrix[double] out) {
-  /*
-   * Computes the forward pass for a tanh nonlinearity layer.
-   *
-   *   ```
-   *   tanh(x) = (e^x - e^-x) / (e^x + e^-x)
-   *           = 2 * sigmoid(2x) - 1
-   *   ```
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (any, any).
-   *
-   * Outputs:
-   *  - out: Outputs, of same shape as `X`.
-   */
-  # out = (exp(X) - exp(-X)) / (exp(X) + exp(-X))
-  # Simplification of the above formulation to use the sigmoid function:
-  sigma2X = sigmoid::forward(2*X)
-  out = 2*sigma2X - 1
-}
-
-backward = function(matrix[double] dout, matrix[double] X)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for a tanh nonlinearity layer.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
-   *  - X: Inputs, of shape (any, any).
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of same shape as `X`.
-   */
-  sigma2X = sigmoid::forward(2*X)
-  out = 2*sigma2X - 1
-  dX = (1-out^2) * dout
-}
-
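
The forward pass above exploits tanh(x) = 2*sigmoid(2x) - 1 to reuse the sigmoid layer.  The
direct and sigmoid-based forms agree, as a quick check shows:

  X = rand(rows=1, cols=5, min=-2, max=2)
  direct = (exp(X) - exp(-X)) / (exp(X) + exp(-X))
  via_sigmoid = 2 * (1/(1+exp(-2*X))) - 1
  print(toString(direct - via_sigmoid))  # all ~0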

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml b/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
deleted file mode 100644
index 85b1c41..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
+++ /dev/null
@@ -1,77 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Adagrad optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr, double epsilon,
-                  matrix[double] cache)
-    return (matrix[double] X, matrix[double] cache) {
-  /*
-   * Performs an Adagrad update.
-   *
-   * This is an adaptive learning rate optimizer that maintains the
-   * sum of squared gradients to automatically adjust the effective
-   * learning rate.
-   *
-   * Reference:
-   *  - Adaptive Subgradient Methods for Online Learning and Stochastic
-   *    Optimization, Duchi et al.
-   *      - http://jmlr.org/papers/v12/duchi11a.html
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient wrt `X` of a loss function being optimized, of
-   *      same shape as `X`.
-   *  - lr: Learning rate.
-   *  - epsilon: Smoothing term to avoid divide by zero errors.
-   *      Typical values are in the range of [1e-8, 1e-4].
-   *  - cache: State that maintains per-parameter sum of squared
-   *      gradients, of same shape as `X`.
-   *
-   * Outputs:
-   *  - X: Updated parameters `X`, of same shape as input `X`.
-   *  - cache: State that maintains per-parameter sum of squared
-   *      gradients, of same shape as `X`.
-   */
-  cache = cache + dX^2
-  X = X - (lr * dX / (sqrt(cache)+epsilon))
-}
-
-init = function(matrix[double] X)
-    return (matrix[double] cache) {
-  /*
-   * Initialize the state for this optimizer.
-   *
-   * Note: This is just a convenience function, and state
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *
-   * Outputs:
-   *  - cache: State that maintains per-parameter sum of squared
-   *      gradients, of same shape as `X`.
-   */
-  cache = matrix(0, rows=nrow(X), cols=ncol(X))
-}
-
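
Because the cache above is a running sum of squared gradients, the effective step size
lr / (sqrt(cache)+epsilon) can only shrink over time.  A toy run on the 1-D objective f(x) = x^2
(illustrative values, not part of the library):

  X = matrix(1, rows=1, cols=1)  # single parameter, x = 1
  cache = matrix(0, rows=1, cols=1)
  lr = 0.1
  epsilon = 1e-8
  for (i in 1:3) {
    dX = 2*X  # gradient of x^2
    cache = cache + dX^2
    X = X - (lr * dX / (sqrt(cache)+epsilon))
    print("step " + i + ": x = " + as.scalar(X))  # steps get smaller each iteration
  }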

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/adam.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/adam.dml b/scripts/staging/SystemML-NN/nn/optim/adam.dml
deleted file mode 100644
index 4b6fa2a..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/adam.dml
+++ /dev/null
@@ -1,97 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Adam optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr, double beta1, double beta2,
-                  double epsilon, int t, matrix[double] m, matrix[double] v)
-    return (matrix[double] X, matrix[double] m, matrix[double] v) {
-  /*
-   * Performs an Adam update.
-   *
-   * Reference:
-   *  - Adam: A Method for Stochastic Optimization, Kingma, Ba.
-   *    - http://arxiv.org/abs/1412.6980
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient wrt `X` of a loss function being optimized, of
-   *      same shape as `X`.
-   *  - lr: Learning rate.  Recommended value is 0.001.
-   *  - beta1: Exponential decay rate for the 1st moment estimates.
-   *      Recommended value is 0.9.
-   *  - beta2: Exponential decay rate for the 2nd moment estimates.
-   *      Recommended value is 0.999.
-   *  - epsilon: Smoothing term to avoid divide by zero errors.
-   *      Recommended value is 1e-8.
-   *  - t: Timestep, starting at 0.
-   *  - m: State containing the 1st moment (mean) estimate by
-   *      maintaining exponential moving averages of the gradients, of
-   *      same shape as `X`.
-   *  - v: State containing the 2nd raw moment (uncentered variance)
-   *      estimate by maintaining exponential moving averages of the
-   *      squared gradients, of same shape as `X`.
-   *
-   * Outputs:
-   *  - X: Updated parameters `X`, of same shape as input `X`.
-   *  - m: Updated state containing the 1st moment (mean) estimate by
-   *      maintaining exponential moving averages of the gradients, of
-   *      same shape as `X`.
-   *  - v: Updated state containing the 2nd raw moment (uncentered
-   *      variance) estimate by maintaining exponential moving averages
-   *      of the squared gradients, of same shape as `X`.
-   */
-  t = t + 1
-  m = beta1*m + (1-beta1)*dX  # update biased 1st moment estimate
-  v = beta2*v + (1-beta2)*dX^2  # update biased 2nd raw moment estimate
-  # m = m / (1-beta1^t)  # compute bias-corrected 1st moment estimate
-  # v = v / (1-beta2^t)  # compute bias-corrected 2nd raw moment estimate
-  # X = X - (lr * m / (sqrt(v)+epsilon))  # param update
-  # Simplified for computational efficiency:
-  lr = lr * sqrt(1-beta2^t) / (1-beta1^t)
-  X = X - (lr * m / (sqrt(v)+epsilon))
-}
-
-init = function(matrix[double] X)
-    return (matrix[double] m, matrix[double] v) {
-  /*
-   * Initialize the state for this optimizer.
-   *
-   * Note: This is just a convenience function, and state
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *
-   * Outputs:
-   *  - m: Initial state containing the 1st moment (mean) estimate by
-   *      maintaining exponential moving averages of the gradients, of
-   *      same shape as `X`.
-   *  - v: Initial state containing the 2nd raw moment (uncentered
-   *      variance) estimate by maintaining exponential moving averages
-   *      of the squared gradients, of same shape as `X`.
-   */
-  m = matrix(0, rows=nrow(X), cols=ncol(X))
-  v = matrix(0, rows=nrow(X), cols=ncol(X))
-}
-
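
The commented-out lines above show the textbook bias correction; folding it into the learning
rate, as update() does, produces the same step up to where epsilon is applied.  A scalar check at
t = 1 with made-up moment values:

  lr = 0.001
  beta1 = 0.9
  beta2 = 0.999
  eps = 1e-8
  t = 1
  m = 0.5   # example biased 1st moment
  v = 0.25  # example biased 2nd moment
  step_explicit = lr * (m/(1-beta1^t)) / (sqrt(v/(1-beta2^t)) + eps)
  lr_t = lr * sqrt(1-beta2^t) / (1-beta1^t)
  step_folded = lr_t * m / (sqrt(v)+eps)
  print("abs diff: " + abs(step_explicit - step_folded))  # ~0, differs only in epsilon placement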

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml b/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
deleted file mode 100644
index 1feccaf..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
+++ /dev/null
@@ -1,79 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * RMSprop optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr, double decay_rate,
-                  double epsilon, matrix[double] cache)
-    return (matrix[double] X, matrix[double] cache) {
-  /*
-   * Performs an RMSprop update.
-   *
-   * This is an adaptive learning rate optimizer that can be viewed
-   * as an adjustment of the Adagrad method to use a moving average
-   * of the sum of squared gradients in order to improve convergence.
-   *
-   * Reference:
-   *  - Neural Networks for Machine Learning, Lecture 6a, Hinton,
-   *    slide 29.
-   *    - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient wrt `X` of a loss function being optimized, of
-   *      same shape as `X`.
-   *  - lr: Learning rate.
-   *  - decay_rate: Term controlling the rate of the moving average.
-   *      Typical values are in the range of [0.9, 0.999].
-   *  - epsilon: Smoothing term to avoid divide by zero errors.
-   *      Typical values are in the range of [1e-8, 1e-4].
-   *  - cache: State that maintains the moving average of the squared
-   *      gradients, of same shape as `X`.
-   *
-   * Outputs:
-   *  - X: Updated parameters `X`, of same shape as input `X`.
-   *  - cache: Updated state that maintains the moving average of the
-   *      squared gradients, of same shape as `X`.
-   */
-  cache = decay_rate*cache + (1-decay_rate)*dX^2
-  X = X - (lr * dX / (sqrt(cache)+epsilon))
-}
-
-init = function(matrix[double] X)
-    return (matrix[double] cache) {
-  /*
-   * Initialize the state for this optimizer.
-   *
-   * Note: This is just a convenience function, and state
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *
-   * Outputs:
-   *  - cache: State that maintains the moving average of the squared
-   *      gradients, of same shape as `X`.
-   */
-  cache = matrix(0, rows=nrow(X), cols=ncol(X))
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/sgd.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd.dml b/scripts/staging/SystemML-NN/nn/optim/sgd.dml
deleted file mode 100644
index 3ba7eba..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/sgd.dml
+++ /dev/null
@@ -1,42 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Stochastic Gradient Descent (SGD) optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr)
-    return (matrix[double] X) {
-  /*
-   * Performs a vanilla SGD update.
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient wrt `X` of a loss function being optimized, of
-   *      same shape as `X`.
-   *  - lr: Learning rate.
-   *
-   * Outputs:
-   *  - X: Updated parameters `X`, of same shape as input `X`.
-   */
-  X = X - lr*dX
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
deleted file mode 100644
index 85922da..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
+++ /dev/null
@@ -1,71 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Stochastic Gradient Descent with momentum (SGD-momentum) optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v)
-    return (matrix[double] X, matrix[double] v) {
-  /*
-   * Performs an SGD update with momentum.
-   *
-   * In SGD with momentum, we assume that the parameters have a velocity
-   * that continues with some momentum, and that is influenced by the
-   * gradient.
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient wrt `X` of a loss function being optimized, of
-   *      same shape as `X`.
-   *  - lr: Learning rate.
-   *  - mu: Momentum value.
-   *      Typical values are in the range of [0.5, 0.99], usually
-   *      started at the lower end and annealed towards the higher end.
-   *  - v: State maintaining the velocity of the parameters `X`, of same
-   *      shape as `X`.
-   *
-   * Outputs:
-   *  - X: Updated parameters `X`, of same shape as input `X`.
-   *  - v: Updated velocity of the parameters `X`, of same shape as
-   *      input `X`.
-   */
-  v = mu*v - lr*dX  # update velocity
-  X = X + v  # update position
-}
-
-init = function(matrix[double] X)
-    return (matrix[double] v) {
-  /*
-   * Initialize the state for this optimizer.
-   *
-   * Note: This is just a convenience function, and state
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *
-   * Outputs:
-   *  - v: Initial velocity of the parameters `X`.
-   */
-  v = matrix(0, rows=nrow(X), cols=ncol(X))
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
deleted file mode 100644
index 3b62c6e..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
+++ /dev/null
@@ -1,81 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Stochastic Gradient Descent with Nesterov momentum (SGD-Nesterov) optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v)
-    return (matrix[double] X, matrix[double] v) {
-  /*
-   * Performs an SGD update with Nesterov momentum.
-   *
-   * As with regular SGD with momentum, in SGD with Nesterov momentum,
-   * we assume that the parameters have a velocity that continues
-   * with some momentum, and that is influenced by the gradient.
-   * In this view specifically, we perform the position update from the
-   * position that the momentum is about to carry the parameters to,
-   * rather than from the previous position.  Additionally, we always
-   * store the parameters in their position after momentum.
-   *
-   * Reference:
-   *  - Advances in optimizing Recurrent Networks, Bengio et al.,
-   *    section 3.5.
-   *    - http://arxiv.org/abs/1212.0901
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient wrt `X` of a loss function being optimized, of
-   *      same shape as `X`.
-   *  - lr: Learning rate.
-   *  - mu: Momentum value.
-   *      Typical values are in the range of [0.5, 0.99], usually
-   *      started at the lower end and annealed towards the higher end.
-   *  - v: State maintaining the velocity of the parameters `X`, of same
-   *      shape as `X`.
-   *
-   * Outputs:
-   *  - X: Updated parameters X, of same shape as input X.
-   *  - v: Updated velocity of the parameters X, of same shape as
-   *      input v.
-   */
-  v_prev = v
-  v = mu*v - lr*dX  # update velocity
-  X = X - mu*v_prev + (1+mu)*v  # update position, including momentum
-}
-
-init = function(matrix[double] X)
-    return (matrix[double] v) {
-  /*
-   * Initialize the state for this optimizer.
-   *
-   * Note: This is just a convenience function, and state
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *
-   * Outputs:
-   *  - v: Initial velocity of the parameters `X`.
-   */
-  v = matrix(0, rows=nrow(X), cols=ncol(X))
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/README.md
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/README.md b/scripts/staging/SystemML-NN/nn/test/README.md
deleted file mode 100644
index b714d50..0000000
--- a/scripts/staging/SystemML-NN/nn/test/README.md
+++ /dev/null
@@ -1,32 +0,0 @@
-<!--
-{% comment %}
-Licensed to the Apache Software Foundation (ASF) under one or more
-contributor license agreements.  See the NOTICE file distributed with
-this work for additional information regarding copyright ownership.
-The ASF licenses this file to you under the Apache License, Version 2.0
-(the "License"); you may not use this file except in compliance with
-the License.  You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-{% endcomment %}
--->
-
-# SystemML-NN Tests
-
-#### This folder contains tests for the *SystemML-NN* (`nn`) deep learning library.
-
----
-## Tests
-#### All layers are tested for correct derivatives ("gradient-checking"), and many layers also have correctness tests against simpler reference implementations.
-* `grad_check.dml` - Contains gradient-checks for all layers as individual DML functions.
-* `test.dml` - Contains correctness tests for several of the more complicated layers by checking against simple reference implementations, such as `conv_simple.dml`.  All tests are formulated as individual DML functions.
-* `tests.dml` - A DML script that runs all of the tests in `grad_check.dml` and `test.dml`.
-
-## Execution
-* `spark-submit SystemML.jar -f nn/test/tests.dml` from the base of the project.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml b/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
deleted file mode 100644
index 9f126d0..0000000
--- a/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
+++ /dev/null
@@ -1,213 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Convolutional layer.
- *
- * This implementation is intended to be a simple, reference version.
- */
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
-                   int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial convolutional layer with
-   * F filters.  The input data has N examples, each represented as a 3D
-   * volume unrolled into a single vector.
-   *
-   * This implementation is intended to be a simple, reference version.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *  - padw: Padding for left and right sides.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  F = nrow(W)
-  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
-  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
-  # Create output volume
-  out = matrix(0, rows=N, cols=F*Hout*Wout)
-
-  # Convolution - Simple reference implementation
-  parfor (n in 1:N) {  # all examples
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
-    # Pad image
-    Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
-    parfor (c in 1:C) {
-      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
-      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
-      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
-    }
-    # Convolve image with filters
-    parfor (f in 1:F, check=0) {  # all filters
-      parfor (hout in 1:Hout, check=0) {  # all output rows
-        h0 = (hout-1)*strideh + 1
-        parfor (wout in 1:Wout, check=0) {  # all output columns
-          w0 = (wout-1)*stridew + 1
-          # Create a patch of the input example corresponding spatially to the filter sizes
-          Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
-          parfor (c in 1:C, check=0) {
-            Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)  # reshape
-            Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf], rows=1,
-                                         cols=Hf*Wf)  # reshape
-          }
-          out[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout] =
-              W[f,] %*% matrix(Xn_padded_patch, rows=C*Hf*Wf, cols=1) + b[f,]
-        }
-      }
-    }
-  }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout,
-                    matrix[double] X, matrix[double] W, matrix[double] b,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
-  /*
-   * Computes the backward pass for a 2D spatial convolutional layer
-   * with F filters.
-   *
-   * This implementation is intended to be a simple, reference version.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *  - padw: Padding for left and right sides.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
-   *  - db: Gradient wrt `b`, of shape (F, 1).
-   */
-  N = nrow(X)
-  F = nrow(W)
-
-  # Create gradient volumes
-  dX = matrix(0, rows=N, cols=C*Hin*Win)
-  dW = matrix(0, rows=F, cols=C*Hf*Wf)
-  db = matrix(0, rows=F, cols=1)
-
-  # Partial derivatives for convolution - Simple reference implementation
-  for (n in 1:N) {  # all examples
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
-    # Pad image
-    Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
-    parfor (c in 1:C) {
-      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
-      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
-      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
-    }
-    dXn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
-    for (f in 1:F) {  # all filters
-      for (hout in 1:Hout) {  # all output rows
-        h0 = (hout-1) * strideh + 1
-        for (wout in 1:Wout) {  # all output columns
-          w0 = (wout-1) * stridew + 1
-          # Create a patch of the input example corresponding spatially to the filter sizes
-          Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
-          dXn_padded_patch = matrix(W[f,] * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout],
-                                    rows=C, cols=Hf*Wf)  # reshape
-          for (c in 1:C) {
-            Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)  # reshape
-            Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf],
-                                         rows=1, cols=Hf*Wf)  # reshape
-            dXn_padded_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
-            dXn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf] = matrix(dXn_padded_patch[c,],
-                                                              rows=Hf, cols=Wf)  # reshape
-            dXn_padded[c,] = dXn_padded[c,] + matrix(dXn_padded_slice,
-                                                     rows=1, cols=(Hin+2*padh)*(Win+2*padw))
-          }
-          dW[f,] = dW[f,]
-                   + matrix(Xn_padded_patch, rows=1, cols=C*Hf*Wf)
-                   * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
-          db[f,] = db[f,] + dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
-        }
-      }
-    }
-    # Unpad derivs on input
-    dXn = matrix(0, rows=C, cols=Hin*Win)
-    parfor (c in 1:C, check=0) {
-      dXn_padded_slice = matrix(dXn_padded[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
-      dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
-      dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
-    }
-    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
-  }
-}
-
-init = function(int F, int C, int Hf, int Wf)
-    return (matrix[double] W, matrix[double] b) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * We use the heuristic by He et al., which limits the magnification
-   * of inputs/gradients during forward/backward passes by scaling
-   * unit-Gaussian weights by a factor of sqrt(2/n), under the
-   * assumption of relu neurons.
-   *  - http://arxiv.org/abs/1502.01852
-   *
-   * Inputs:
-   *  - F: Number of filters.
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *
-   * Outputs:
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   */
-  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
-  b = matrix(0, rows=F, cols=1)
-}
-


[09/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn` library to `scripts/nn`

Posted by du...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/softmax.dml b/scripts/nn/layers/softmax.dml
new file mode 100644
index 0000000..68a7bc7
--- /dev/null
+++ b/scripts/nn/layers/softmax.dml
@@ -0,0 +1,87 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Softmax classifier layer.
+ */
+
+forward = function(matrix[double] scores)
+    return (matrix[double] probs) {
+  /*
+   * Computes the forward pass for a softmax classifier.  The inputs
+   * are interpreted as unnormalized, log-probabilities for each of
+   * N examples, and the softmax function transforms them to normalized
+   * probabilities.
+   *
+   * This can be interpreted as a generalization of the sigmoid
+   * function to multiple classes.
+   *
+   *   `probs_ij = e^scores_ij / sum(e^scores_i)`
+   *
+   * Inputs:
+   *  - scores: Inputs, of shape (N, D).
+   *
+   * Outputs:
+   *  - probs: Outputs, of shape (N, D).
+   */
+  # For numerical stability, we subtract the max score of an example from all scores for that
+  # example.  This is equivalent to the original formulation:
+  # e^scores_i / sum(e^scores_i) == C*e^scores_i / C*sum(e^scores_i)
+  #                              == e^(scores_i+log(C)) / sum(e^(scores_i+log(C)))
+  # set log(C) = -max(scores_i):
+  #                              == e^(scores_i-max(scores_i)) / sum(e^(scores_i-max(scores_i)))
+  scores = scores - rowMaxs(scores)  # numerical stability
+  unnorm_probs = exp(scores)  # unnormalized probabilities
+  probs = unnorm_probs / rowSums(unnorm_probs)  # normalized probabilities
+}
+
+backward = function(matrix[double] dprobs, matrix[double] scores)
+    return (matrix[double] dscores) {
+  /*
+   * Computes the backward pass for a softmax classifier.
+   *
+   * Note that dscores_ij has multiple source branches:
+   *
+   *   ```
+   *   dprobs_ij/dscores_ij = probs_ij * (1 - probs_ij)
+   *   dprobs_ik/dscores_ij = -probs_ik * probs_ij, for all k != j
+   *
+   *   dloss/dscores_ij =
+   *      (dloss/dprobs_ij * dprobs_ij/dscores_ij)
+   *      + sum_{k!=j}(dloss/dprobs_ik * dprobs_ik/dscores_ij)
+   *   ```
+   *
+   * Inputs:
+   *  - dprobs: Gradient wrt `probs` from upstream, of shape (N, D).
+   *  - scores: Inputs, of shape (N, D).
+   *
+   * Outputs:
+   *  - dscores: Gradient wrt `scores`, of shape (N, D).
+   */
+  scores = scores - rowMaxs(scores)  # numerical stability
+  unnorm_probs = exp(scores)  # unnormalized probabilities
+  probs = unnorm_probs / rowSums(unnorm_probs)  # normalized probabilities
+  # After some cancellation:
+  # dscores = dprobs*probs - probs*rowSums(dprobs*probs)
+  dtemp = dprobs * probs
+  dscores = dtemp - probs*rowSums(dtemp)
+}
+

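A minimal usage sketch of the softmax layer above, tying the forward pass to the backward derivation in the comments (illustrative only; the shapes and the upstream gradient are arbitrary, and the relative `nn/layers/...` path is assumed to resolve, e.g. when run from the `scripts` directory):

    source("nn/layers/softmax.dml") as softmax

    scores = rand(rows=4, cols=3)                # unnormalized log-probabilities: 4 examples, 3 classes
    probs = softmax::forward(scores)             # each row sums to 1
    dprobs = rand(rows=4, cols=3)                # stand-in gradient wrt `probs` from upstream
    dscores = softmax::backward(dprobs, scores)  # dprobs*probs - probs*rowSums(dprobs*probs)
    print("mean row sum of probs (should be 1): " + (sum(probs)/4))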
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/tanh.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/tanh.dml b/scripts/nn/layers/tanh.dml
new file mode 100644
index 0000000..d849d70
--- /dev/null
+++ b/scripts/nn/layers/tanh.dml
@@ -0,0 +1,65 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Tanh nonlinearity layer.
+ */
+source("nn/layers/sigmoid.dml") as sigmoid
+
+forward = function(matrix[double] X)
+    return (matrix[double] out) {
+  /*
+   * Computes the forward pass for a tanh nonlinearity layer.
+   *
+   *   ```
+   *   tanh(x) = (e^x - e^-x) / (e^x + e^-x)
+   *           = 2 * sigmoid(2x) - 1
+   *   ```
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (any, any).
+   *
+   * Outputs:
+   *  - out: Outputs, of same shape as `X`.
+   */
+  # out = (exp(X) - exp(-X)) / (exp(X) + exp(-X))
+  # Simplification of the above formulation to use the sigmoid function:
+  sigma2X = sigmoid::forward(2*X)
+  out = 2*sigma2X - 1
+}
+
+backward = function(matrix[double] dout, matrix[double] X)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for a tanh nonlinearity layer.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+   *  - X: Inputs, of shape (any, any).
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
+   */
+  sigma2X = sigmoid::forward(2*X)
+  out = 2*sigma2X - 1
+  dX = (1-out^2) * dout
+}
+

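The tanh layer above computes `2*sigmoid(2X) - 1` rather than the ratio of exponentials; a small sketch of how it would be called (illustrative only; the namespace alias and shapes are arbitrary, and the `nn/...` paths are assumed to resolve from the working directory):

    source("nn/layers/tanh.dml") as tanh_layer

    X = rand(rows=3, cols=4, min=-2, max=2)   # arbitrary inputs
    out = tanh_layer::forward(X)              # equals 2*sigmoid(2*X) - 1
    dout = matrix(1, rows=3, cols=4)          # all-ones upstream gradient
    dX = tanh_layer::backward(dout, X)        # (1 - out^2) * dout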
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/adagrad.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/optim/adagrad.dml b/scripts/nn/optim/adagrad.dml
new file mode 100644
index 0000000..85b1c41
--- /dev/null
+++ b/scripts/nn/optim/adagrad.dml
@@ -0,0 +1,77 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Adagrad optimizer.
+ */
+
+update = function(matrix[double] X, matrix[double] dX, double lr, double epsilon,
+                  matrix[double] cache)
+    return (matrix[double] X, matrix[double] cache) {
+  /*
+   * Performs an Adagrad update.
+   *
+   * This is an adaptive learning rate optimizer that maintains the
+   * sum of squared gradients to automatically adjust the effective
+   * learning rate.
+   *
+   * Reference:
+   *  - Adaptive Subgradient Methods for Online Learning and Stochastic
+   *    Optimization, Duchi et al.
+   *      - http://jmlr.org/papers/v12/duchi11a.html
+   *
+   * Inputs:
+   *  - X: Parameters to update, of shape (any, any).
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
+   *  - lr: Learning rate.
+   *  - epsilon: Smoothing term to avoid divide by zero errors.
+   *      Typical values are in the range of [1e-8, 1e-4].
+   *  - cache: State that maintains per-parameter sum of squared
+   *      gradients, of same shape as `X`.
+   *
+   * Outputs:
+   *  - X: Updated parameters `X`, of same shape as input `X`.
+   *  - cache: Updated state that maintains per-parameter sum of squared
+   *      gradients, of same shape as `X`.
+   */
+  cache = cache + dX^2
+  X = X - (lr * dX / (sqrt(cache)+epsilon))
+}
+
+init = function(matrix[double] X)
+    return (matrix[double] cache) {
+  /*
+   * Initialize the state for this optimizer.
+   *
+   * Note: This is just a convenience function, and state
+   * may be initialized manually if needed.
+   *
+   * Inputs:
+   *  - X: Parameters to update, of shape (any, any).
+   *
+   * Outputs:
+   *  - cache: State that maintains per-parameter sum of squared
+   *      gradients, of same shape as `X`.
+   */
+  cache = matrix(0, rows=nrow(X), cols=ncol(X))
+}
+

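A usage sketch for the Adagrad optimizer above (illustrative only; the parameter matrix, gradient, and learning rate are arbitrary, and epsilon follows the typical range in the docstring):

    source("nn/optim/adagrad.dml") as adagrad

    X = rand(rows=10, cols=5)        # parameters
    dX = rand(rows=10, cols=5)       # stand-in gradient of a loss wrt X
    cache = adagrad::init(X)         # zero-initialized sum of squared gradients
    [X, cache] = adagrad::update(X, dX, 0.01, 1e-8, cache)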
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/adam.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/optim/adam.dml b/scripts/nn/optim/adam.dml
new file mode 100644
index 0000000..4b6fa2a
--- /dev/null
+++ b/scripts/nn/optim/adam.dml
@@ -0,0 +1,97 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Adam optimizer.
+ */
+
+update = function(matrix[double] X, matrix[double] dX, double lr, double beta1, double beta2,
+                  double epsilon, int t, matrix[double] m, matrix[double] v)
+    return (matrix[double] X, matrix[double] m, matrix[double] v) {
+  /*
+   * Performs an Adam update.
+   *
+   * Reference:
+   *  - Adam: A Method for Stochastic Optimization, Kingma, Ba.
+   *    - http://arxiv.org/abs/1412.6980
+   *
+   * Inputs:
+   *  - X: Parameters to update, of shape (any, any).
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
+   *  - lr: Learning rate.  Recommended value is 0.001.
+   *  - beta1: Exponential decay rate for the 1st moment estimates.
+   *      Recommended value is 0.9.
+   *  - beta2: Exponential decay rate for the 2nd moment estimates.
+   *      Recommended value is 0.999.
+   *  - epsilon: Smoothing term to avoid divide by zero errors.
+   *      Recommended value is 1e-8.
+   *  - t: Timestep, starting at 0.
+   *  - m: State containing the 1st moment (mean) estimate by
+   *      maintaining exponential moving averages of the gradients, of
+   *      same shape as `X`.
+   *  - v: State containing the 2nd raw moment (uncentered variance)
+   *      estimate by maintaining exponential moving averages of the
+   *      squared gradients, of same shape as `X`.
+   *
+   * Outputs:
+   *  - X: Updated parameters `X`, of same shape as input `X`.
+   *  - m: Updated state containing the 1st moment (mean) estimate by
+   *      maintaining exponential moving averages of the gradients, of
+   *      same shape as `X`.
+   *  - v: Updated state containing the 2nd raw moment (uncentered
+   *      variance) estimate by maintaining exponential moving averages
+   *      of the squared gradients, of same shape as `X`.
+   */
+  t = t + 1
+  m = beta1*m + (1-beta1)*dX  # update biased 1st moment estimate
+  v = beta2*v + (1-beta2)*dX^2  # update biased 2nd raw moment estimate
+  # m = m / (1-beta1^t)  # compute bias-corrected 1st moment estimate
+  # v = v / (1-beta2^t)  # compute bias-corrected 2nd raw moment estimate
+  # X = X - (lr * m / (sqrt(v)+epsilon))  # param update
+  # Simplified for computational efficiency:
+  lr = lr * sqrt(1-beta2^t) / (1-beta1^t)
+  X = X - (lr * m / (sqrt(v)+epsilon))
+}
+
+init = function(matrix[double] X)
+    return (matrix[double] m, matrix[double] v) {
+  /*
+   * Initialize the state for this optimizer.
+   *
+   * Note: This is just a convenience function, and state
+   * may be initialized manually if needed.
+   *
+   * Inputs:
+   *  - X: Parameters to update, of shape (any, any).
+   *
+   * Outputs:
+   *  - m: Initial state containing the 1st moment (mean) estimate by
+   *      maintaining exponential moving averages of the gradients, of
+   *      same shape as `X`.
+   *  - v: Initial state containing the 2nd raw moment (uncentered
+   *      variance) estimate by maintaining exponential moving averages
+   *      of the squared gradients, of same shape as `X`.
+   */
+  m = matrix(0, rows=nrow(X), cols=ncol(X))
+  v = matrix(0, rows=nrow(X), cols=ncol(X))
+}
+

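A usage sketch for the Adam optimizer above, using the recommended hyperparameter values from its docstring (illustrative only; the matrix sizes are arbitrary, and `t` is passed as 0 for the first step since `update` increments it internally before computing the bias-corrected step size):

    source("nn/optim/adam.dml") as adam

    X = rand(rows=10, cols=5)        # parameters
    dX = rand(rows=10, cols=5)       # stand-in gradient of a loss wrt X
    [m, v] = adam::init(X)           # zero-initialized 1st and 2nd moment estimates
    [X, m, v] = adam::update(X, dX, 0.001, 0.9, 0.999, 1e-8, 0, m, v)  # first step (t=0)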
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/rmsprop.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/optim/rmsprop.dml b/scripts/nn/optim/rmsprop.dml
new file mode 100644
index 0000000..1feccaf
--- /dev/null
+++ b/scripts/nn/optim/rmsprop.dml
@@ -0,0 +1,79 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * RMSprop optimizer.
+ */
+
+update = function(matrix[double] X, matrix[double] dX, double lr, double decay_rate,
+                  double epsilon, matrix[double] cache)
+    return (matrix[double] X, matrix[double] cache) {
+  /*
+   * Performs an RMSprop update.
+   *
+   * This is an adaptive learning rate optimizer that can be viewed
+   * as an adjustment of the Adagrad method to use a moving average
+   * of the sum of squared gradients in order to improve convergence.
+   *
+   * Reference:
+   *  - Neural Networks for Machine Learning, Lecture 6a, Hinton,
+   *    slide 29.
+   *    - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
+   *
+   * Inputs:
+   *  - X: Parameters to update, of shape (any, any).
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
+   *  - lr: Learning rate.
+   *  - decay_rate: Term controlling the rate of the moving average.
+   *      Typical values are in the range of [0.9, 0.999].
+   *  - epsilon: Smoothing term to avoid divide by zero errors.
+   *      Typical values are in the range of [1e-8, 1e-4].
+   *  - cache: State that maintains the moving average of the squared
+   *      gradients, of same shape as `X`.
+   *
+   * Outputs:
+   *  - X: Updated parameters `X`, of same shape as input `X`.
+   *  - cache: Updated state that maintains the moving average of the
+   *      squared gradients, of same shape as `X`.
+   */
+  cache = decay_rate*cache + (1-decay_rate)*dX^2
+  X = X - (lr * dX / (sqrt(cache)+epsilon))
+}
+
+init = function(matrix[double] X)
+    return (matrix[double] cache) {
+  /*
+   * Initialize the state for this optimizer.
+   *
+   * Note: This is just a convenience function, and state
+   * may be initialized manually if needed.
+   *
+   * Inputs:
+   *  - X: Parameters to update, of shape (any, any).
+   *
+   * Outputs:
+   *  - cache: State that maintains the moving average of the squared
+   *      gradients, of same shape as `X`.
+   */
+  cache = matrix(0, rows=nrow(X), cols=ncol(X))
+}
+

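A usage sketch for the RMSprop optimizer above (illustrative only; the shapes are arbitrary, and the decay rate and epsilon are taken from the typical ranges in the docstring):

    source("nn/optim/rmsprop.dml") as rmsprop

    X = rand(rows=10, cols=5)        # parameters
    dX = rand(rows=10, cols=5)       # stand-in gradient of a loss wrt X
    cache = rmsprop::init(X)         # zero-initialized moving average of squared gradients
    [X, cache] = rmsprop::update(X, dX, 0.001, 0.99, 1e-8, cache)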
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/sgd.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/optim/sgd.dml b/scripts/nn/optim/sgd.dml
new file mode 100644
index 0000000..3ba7eba
--- /dev/null
+++ b/scripts/nn/optim/sgd.dml
@@ -0,0 +1,42 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Stochastic Gradient Descent (SGD) optimizer.
+ */
+
+update = function(matrix[double] X, matrix[double] dX, double lr)
+    return (matrix[double] X) {
+  /*
+   * Performs a vanilla SGD update.
+   *
+   * Inputs:
+   *  - X: Parameters to update, of shape (any, any).
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
+   *  - lr: Learning rate.
+   *
+   * Outputs:
+   *  - X: Updated parameters `X`, of same shape as input `X`.
+   */
+  X = X - lr*dX
+}
+

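The vanilla SGD update above is a single statement; a correspondingly small sketch (illustrative only; shapes and learning rate are arbitrary):

    source("nn/optim/sgd.dml") as sgd

    X = rand(rows=10, cols=5)        # parameters
    dX = rand(rows=10, cols=5)       # stand-in gradient of a loss wrt X
    X = sgd::update(X, dX, 0.01)     # X = X - lr*dX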
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/sgd_momentum.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/optim/sgd_momentum.dml b/scripts/nn/optim/sgd_momentum.dml
new file mode 100644
index 0000000..85922da
--- /dev/null
+++ b/scripts/nn/optim/sgd_momentum.dml
@@ -0,0 +1,71 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Stochastic Gradient Descent with momentum (SGD-momentum) optimizer.
+ */
+
+update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v)
+    return (matrix[double] X, matrix[double] v) {
+  /*
+   * Performs an SGD update with momentum.
+   *
+   * In SGD with momentum, we assume that the parameters have a velocity
+   * that continues with some momentum, and that is influenced by the
+   * gradient.
+   *
+   * Inputs:
+   *  - X: Parameters to update, of shape (any, any).
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
+   *  - lr: Learning rate.
+   *  - mu: Momentum value.
+   *      Typical values are in the range of [0.5, 0.99], usually
+   *      started at the lower end and annealed towards the higher end.
+   *  - v: State maintaining the velocity of the parameters `X`, of same
+   *      shape as `X`.
+   *
+   * Outputs:
+   *  - X: Updated parameters `X`, of same shape as input `X`.
+   *  - v: Updated velocity of the parameters `X`, of same shape as
+   *      input `X`.
+   */
+  v = mu*v - lr*dX  # update velocity
+  X = X + v  # update position
+}
+
+init = function(matrix[double] X)
+    return (matrix[double] v) {
+  /*
+   * Initialize the state for this optimizer.
+   *
+   * Note: This is just a convenience function, and state
+   * may be initialized manually if needed.
+   *
+   * Inputs:
+   *  - X: Parameters to update, of shape (any, any).
+   *
+   * Outputs:
+   *  - v: Initial velocity of the parameters `X`.
+   */
+  v = matrix(0, rows=nrow(X), cols=ncol(X))
+}
+

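A usage sketch for SGD with momentum (illustrative only; shapes, learning rate, and the momentum value are arbitrary choices within the documented range):

    source("nn/optim/sgd_momentum.dml") as sgd_momentum

    X = rand(rows=10, cols=5)           # parameters
    dX = rand(rows=10, cols=5)          # stand-in gradient of a loss wrt X
    v = sgd_momentum::init(X)           # zero-initialized velocity
    [X, v] = sgd_momentum::update(X, dX, 0.01, 0.9, v)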
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/sgd_nesterov.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/optim/sgd_nesterov.dml b/scripts/nn/optim/sgd_nesterov.dml
new file mode 100644
index 0000000..3b62c6e
--- /dev/null
+++ b/scripts/nn/optim/sgd_nesterov.dml
@@ -0,0 +1,81 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Stochastic Gradient Descent with Nesterov momentum (SGD-Nesterov) optimizer.
+ */
+
+update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v)
+    return (matrix[double] X, matrix[double] v) {
+  /*
+   * Performs an SGD update with Nesterov momentum.
+   *
+   * As with regular SGD with momentum, in SGD with Nesterov momentum,
+   * we assume that the parameters have a velocity that continues
+   * with some momentum, and that is influenced by the gradient.
+   * In this view specifically, we perform the position update from the
+   * position that the momentum is about to carry the parameters to,
+   * rather than from the previous position.  Additionally, we always
+   * store the parameters in their position after momentum.
+   *
+   * Reference:
+   *  - Advances in optimizing Recurrent Networks, Bengio et al.,
+   *    section 3.5.
+   *    - http://arxiv.org/abs/1212.0901
+   *
+   * Inputs:
+   *  - X: Parameters to update, of shape (any, any).
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
+   *  - lr: Learning rate.
+   *  - mu: Momentum value.
+   *      Typical values are in the range of [0.5, 0.99], usually
+   *      started at the lower end and annealed towards the higher end.
+   *  - v: State maintaining the velocity of the parameters `X`, of same
+   *      shape as `X`.
+   *
+   * Outputs:
+   *  - X: Updated parameters `X`, of same shape as input `X`.
+   *  - v: Updated velocity of the parameters `X`, of same shape as
+   *      input `v`.
+   */
+  v_prev = v
+  v = mu*v - lr*dX  # update velocity
+  X = X - mu*v_prev + (1+mu)*v  # update position, including momentum
+}
+
+init = function(matrix[double] X)
+    return (matrix[double] v) {
+  /*
+   * Initialize the state for this optimizer.
+   *
+   * Note: This is just a convenience function, and state
+   * may be initialized manually if needed.
+   *
+   * Inputs:
+   *  - X: Parameters to update, of shape (any, any).
+   *
+   * Outputs:
+   *  - v: Initial velocity of the parameters `X`.
+   */
+  v = matrix(0, rows=nrow(X), cols=ncol(X))
+}
+

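SGD with Nesterov momentum is called exactly like SGD with momentum; the difference lies only in how the position update uses the look-ahead velocity, as described above. A sketch (illustrative only; the same arbitrary values as before):

    source("nn/optim/sgd_nesterov.dml") as sgd_nesterov

    X = rand(rows=10, cols=5)           # parameters
    dX = rand(rows=10, cols=5)          # stand-in gradient of a loss wrt X
    v = sgd_nesterov::init(X)           # zero-initialized velocity
    [X, v] = sgd_nesterov::update(X, dX, 0.01, 0.9, v)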
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/README.md
----------------------------------------------------------------------
diff --git a/scripts/nn/test/README.md b/scripts/nn/test/README.md
new file mode 100644
index 0000000..b714d50
--- /dev/null
+++ b/scripts/nn/test/README.md
@@ -0,0 +1,32 @@
+<!--
+{% comment %}
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+
+# SystemML-NN Tests
+
+#### This folder contains tests for the *SystemML-NN* (`nn`) deep learning library.
+
+---
+## Tests
+#### All layers are tested for correct derivatives ("gradient-checking"), and many layers also have correctness tests against simpler reference implementations.
+* `grad_check.dml` - Contains gradient-checks for all layers as individual DML functions.
+* `test.dml` - Contains correctness tests for several of the more complicated layers by checking against simple reference implementations, such as `conv_simple.dml`.  All tests are formulated as individual DML functions.
+* `tests.dml` - A DML script that runs all of the tests in `grad_check.dml` and `test.dml`.
+
+## Execution
+* `spark-submit SystemML.jar -f nn/test/tests.dml` from the base of the project.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/conv2d_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/conv2d_simple.dml b/scripts/nn/test/conv2d_simple.dml
new file mode 100644
index 0000000..9f126d0
--- /dev/null
+++ b/scripts/nn/test/conv2d_simple.dml
@@ -0,0 +1,213 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Convolutional layer.
+ *
+ * This implementation is intended to be a simple, reference version.
+ */
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
+                   int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial convolutional layer with
+   * F filters.  The input data has N examples, each represented as a 3D
+   * volume unrolled into a single vector.
+   *
+   * This implementation is intended to be a simple, reference version.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *  - padw: Padding for left and right sides.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  F = nrow(W)
+  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+
+  # Create output volume
+  out = matrix(0, rows=N, cols=F*Hout*Wout)
+
+  # Convolution - Simple reference implementation
+  parfor (n in 1:N) {  # all examples
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+    # Pad image
+    Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
+    parfor (c in 1:C) {
+      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
+      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+    }
+    # Convolve image with filters
+    parfor (f in 1:F, check=0) {  # all filters
+      parfor (hout in 1:Hout, check=0) {  # all output rows
+        h0 = (hout-1)*strideh + 1
+        parfor (wout in 1:Wout, check=0) {  # all output columns
+          w0 = (wout-1)*stridew + 1
+          # Create a patch of the input example corresponding spatially to the filter sizes
+          Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
+          parfor (c in 1:C, check=0) {
+            Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)  # reshape
+            Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf], rows=1,
+                                         cols=Hf*Wf)  # reshape
+          }
+          out[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout] =
+              W[f,] %*% matrix(Xn_padded_patch, rows=C*Hf*Wf, cols=1) + b[f,]
+        }
+      }
+    }
+  }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout,
+                    matrix[double] X, matrix[double] W, matrix[double] b,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
+  /*
+   * Computes the backward pass for a 2D spatial convolutional layer
+   * with F filters.
+   *
+   * This implementation is intended to be a simple, reference version.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *  - padw: Padding for left and right sides.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+   *  - db: Gradient wrt `b`, of shape (F, 1).
+   */
+  N = nrow(X)
+  F = nrow(W)
+
+  # Create gradient volumes
+  dX = matrix(0, rows=N, cols=C*Hin*Win)
+  dW = matrix(0, rows=F, cols=C*Hf*Wf)
+  db = matrix(0, rows=F, cols=1)
+
+  # Partial derivatives for convolution - Simple reference implementation
+  for (n in 1:N) {  # all examples
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+    # Pad image
+    Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
+    parfor (c in 1:C) {
+      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
+      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+    }
+    dXn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
+    for (f in 1:F) {  # all filters
+      for (hout in 1:Hout) {  # all output rows
+        h0 = (hout-1) * strideh + 1
+        for (wout in 1:Wout) {  # all output columns
+          w0 = (wout-1) * stridew + 1
+          # Create a patch of the input example corresponding spatially to the filter sizes
+          Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
+          dXn_padded_patch = matrix(W[f,] * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout],
+                                    rows=C, cols=Hf*Wf)  # reshape
+          for (c in 1:C) {
+            Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)  # reshape
+            Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf],
+                                         rows=1, cols=Hf*Wf)  # reshape
+            dXn_padded_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
+            dXn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf] = matrix(dXn_padded_patch[c,],
+                                                              rows=Hf, cols=Wf)  # reshape
+            dXn_padded[c,] = dXn_padded[c,] + matrix(dXn_padded_slice,
+                                                     rows=1, cols=(Hin+2*padh)*(Win+2*padw))
+          }
+          dW[f,] = dW[f,]
+                   + matrix(Xn_padded_patch, rows=1, cols=C*Hf*Wf)
+                   * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
+          db[f,] = db[f,] + dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
+        }
+      }
+    }
+    # Unpad the gradient wrt the input
+    dXn = matrix(0, rows=C, cols=Hin*Win)
+    parfor (c in 1:C, check=0) {
+      dXn_padded_slice = matrix(dXn_padded[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
+      dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
+      dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
+    }
+    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
+  }
+}
+
+init = function(int F, int C, int Hf, int Wf)
+    return (matrix[double] W, matrix[double] b) {
+  /*
+   * Initialize the parameters of this layer.
+   *
+   * We use the heuristic by He et al., which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * unit-Gaussian weights by a factor of sqrt(2/n), under the
+   * assumption of ReLU neurons.
+   *  - http://arxiv.org/abs/1502.01852
+   *
+   * Inputs:
+   *  - F: Number of filters.
+   *  - C: Number of input channels (dimensionality of depth).
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *
+   * Outputs:
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   */
+  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
+  b = matrix(0, rows=F, cols=1)
+}
+
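For orientation, here is a minimal editorial sketch (not part of the commit) showing how
the simple reference convolution layer above can be exercised end to end.  It assumes this
file is the `conv2d_simple` module sourced as "nn/test/conv2d_simple.dml", exactly as the
gradient checks below source it, together with the `l2_loss` layer:

  source("nn/test/conv2d_simple.dml") as conv2d_simple
  source("nn/layers/l2_loss.dml") as l2_loss

  # Toy sizes matching the gradient-check setup below
  N = 2  # num examples
  C = 2  # num channels
  Hin = 5  # input height
  Win = 5  # input width
  F = 2  # num filters
  Hf = 3  # filter height
  Wf = 3  # filter width
  stride = 1
  pad = 1  # "same" padding for a 3x3 filter with stride 1
  X = rand(rows=N, cols=C*Hin*Win)
  y = rand(rows=N, cols=F*Hin*Win)

  # Init, forward, and backward passes
  [W, b] = conv2d_simple::init(F, C, Hf, Wf)  # W: (F, C*Hf*Wf), b: (F, 1)
  [out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf,
                                             stride, stride, pad, pad)  # out: (N, F*Hout*Wout)
  dout = l2_loss::backward(out, y)
  [dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
                                         stride, stride, pad, pad)
  print("Hout = " + Hout + ", Wout = " + Wout)  # expect 5 and 5: (5 + 2*1 - 3)/1 + 1 = 5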


[04/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn` library to `scripts/nn`

Posted by du...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/grad_check.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/grad_check.dml b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
deleted file mode 100644
index f3bc9a7..0000000
--- a/scripts/staging/SystemML-NN/nn/test/grad_check.dml
+++ /dev/null
@@ -1,1769 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Gradient checks for various architectures.
- */
-source("nn/layers/affine.dml") as affine
-source("nn/layers/batch_norm1d.dml") as batch_norm1d
-source("nn/layers/batch_norm2d.dml") as batch_norm2d
-source("nn/layers/conv2d.dml") as conv2d
-source("nn/layers/conv2d_builtin.dml") as conv2d_builtin
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/dropout.dml") as dropout
-source("nn/layers/l1_loss.dml") as l1_loss
-source("nn/layers/l1_reg.dml") as l1_reg
-source("nn/layers/l2_loss.dml") as l2_loss
-source("nn/layers/l2_reg.dml") as l2_reg
-source("nn/layers/log_loss.dml") as log_loss
-source("nn/layers/lstm.dml") as lstm
-source("nn/layers/max_pool2d.dml") as max_pool2d
-source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
-source("nn/layers/relu.dml") as relu
-source("nn/layers/rnn.dml") as rnn
-source("nn/layers/scale_shift1d.dml") as scale_shift1d
-source("nn/layers/scale_shift2d.dml") as scale_shift2d
-source("nn/layers/sigmoid.dml") as sigmoid
-source("nn/layers/softmax.dml") as softmax
-source("nn/layers/tanh.dml") as tanh
-source("nn/test/conv2d_simple.dml") as conv2d_simple
-source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
-source("nn/test/util.dml") as test_util
-
-affine = function() {
-  /*
-   * Gradient check for the affine layer.
-   */
-  print("Grad checking the affine layer with L2 loss.")
-
-  # Generate data
-  N = 3 # num examples
-  D = 100 # num features
-  M = 10 # num neurons
-  X = rand(rows=N, cols=D)
-  y = rand(rows=N, cols=M)
-  [W, b] = affine::init(D, M)
-
-  # Compute analytical gradients of loss wrt parameters
-  out = affine::forward(X, W, b)
-  dout = l2_loss::backward(out, y)
-  [dX, dW, db] = affine::backward(dout, X, W, b)
-
-  # Grad check
-  h = 1e-5
-  print(" - Grad checking X.")
-  for (i in 1:nrow(X)) {
-    for (j in 1:ncol(X)) {
-      # Compute numerical derivative
-      old = as.scalar(X[i,j])
-      X[i,j] = old - h
-      outmh = affine::forward(X, W, b)
-      lossmh = l2_loss::forward(outmh, y)
-      X[i,j] = old + h
-      outph = affine::forward(X, W, b)
-      lossph = l2_loss::forward(outph, y)
-      X[i,j] = old  # reset
-      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking W.")
-  for (i in 1:nrow(W)) {
-    for (j in 1:ncol(W)) {
-      # Compute numerical derivative
-      old = as.scalar(W[i,j])
-      W[i,j] = old - h
-      outmh = affine::forward(X, W, b)
-      lossmh = l2_loss::forward(outmh, y)
-      W[i,j] = old + h
-      outph = affine::forward(X, W, b)
-      lossph = l2_loss::forward(outph, y)
-      W[i,j] = old  # reset
-      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking b.")
-  for (i in 1:nrow(b)) {
-    for (j in 1:ncol(b)) {
-      # Compute numerical derivative
-      old = as.scalar(b[i,j])
-      b[i,j] = old - h
-      outmh = affine::forward(X, W, b)
-      lossmh = l2_loss::forward(outmh, y)
-      b[i,j] = old + h
-      outph = affine::forward(X, W, b)
-      lossph = l2_loss::forward(outph, y)
-      b[i,j] = old  # reset
-      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
-    }
-  }
-}
-
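# Editorial note (not part of the original file): every check in this file follows the
# same centered-difference recipe seen above.  For a single entry X[i,j] and a scalar
# loss, the numerical derivative is
#
#   dX_num = (lossph - lossmh) / (2*h),  with h = 1e-5,
#
# where lossph and lossmh are the losses with X[i,j] perturbed to old+h and old-h.  It is
# compared against the analytical dX[i,j] via a relative error (a common choice is
# abs(dX - dX_num) / max(abs(dX), abs(dX_num))); the exact tolerance logic lives in
# test_util::check_rel_grad_error, which also receives the two loss values.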
-batch_norm1d = function() {
-  /*
-   * Gradient check for the 1D batch normalization layer.
-   */
-  print("Grad checking the 1D batch normalization layer with L2 loss.")
-
-  # Generate data
-  N = 3 # num examples
-  D = 100 # num features
-  mu = 0.9  # momentum
-  eps = 1e-5  # epsilon
-  X = rand(rows=N, cols=D)
-  y = rand(rows=N, cols=D)
-  gamma = rand(rows=1, cols=D)
-  beta = rand(rows=1, cols=D)
-  ema_mean = rand(rows=1, cols=D)
-  ema_var = rand(rows=1, cols=D)
-  #[dummy, dummy, ema_mean, ema_var] = batch_norm1d::init(D)
-
-  # Check training & testing modes
-  for (i in 1:2) {
-    if (i == 1)
-      mode = 'train'
-    else
-      mode = 'test'
-    print(" - Grad checking the '"+mode+"' mode.")
-
-    # Compute analytical gradients of loss wrt parameters
-    [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
-        batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
-    dout = l2_loss::backward(out, y)
-    [dX, dgamma, dbeta] = batch_norm1d::backward(dout, out, ema_mean_upd, ema_var_upd,
-                                                 cache_mean, cache_var, cache_norm,
-                                                 X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
-
-    # Grad check
-    h = 1e-5
-    print("   - Grad checking X.")
-    for (i in 1:nrow(X)) {
-      for (j in 1:ncol(X)) {
-        # Compute numerical derivative
-        old = as.scalar(X[i,j])
-        X[i,j] = old - h
-        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
-            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
-        lossmh = l2_loss::forward(outmh, y)
-        X[i,j] = old + h
-        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
-            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
-        lossph = l2_loss::forward(outph, y)
-        X[i,j] = old  # reset
-        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-        # Check error
-        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-      }
-    }
-
-    print("   - Grad checking gamma.")
-    for (i in 1:nrow(gamma)) {
-      for (j in 1:ncol(gamma)) {
-        # Compute numerical derivative
-        old = as.scalar(gamma[i,j])
-        gamma[i,j] = old - h
-        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
-            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
-        lossmh = l2_loss::forward(outmh, y)
-        gamma[i,j] = old + h
-        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
-            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
-        lossph = l2_loss::forward(outph, y)
-        gamma[i,j] = old  # reset
-        dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-        # Check error
-        rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
-                                                    lossph, lossmh)
-      }
-    }
-
-    print("   - Grad checking beta.")
-    for (i in 1:nrow(beta)) {
-      for (j in 1:ncol(beta)) {
-        # Compute numerical derivative
-        old = as.scalar(beta[i,j])
-        beta[i,j] = old - h
-        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
-            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
-        lossmh = l2_loss::forward(outmh, y)
-        beta[i,j] = old + h
-        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
-            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
-        lossph = l2_loss::forward(outph, y)
-        beta[i,j] = old  # reset
-        dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-        # Check error
-        rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
-                                                    lossph, lossmh)
-      }
-    }
-  }
-}
-
-batch_norm2d = function() {
-  /*
-   * Gradient check for the 2D (spatial) batch normalization layer.
-   */
-  print("Grad checking the 2D (spatial) batch normalization layer with L2 loss.")
-
-  # Generate data
-  N = 3 # num examples
-  C = 2  # num channels
-  Hin = 5  # input height
-  Win = 5  # input width
-  mu = 0.9  # momentum
-  eps = 1e-5  # epsilon
-  X = rand(rows=N, cols=C*Hin*Win)
-  y = rand(rows=N, cols=C*Hin*Win)
-  gamma = rand(rows=C, cols=1)
-  beta = rand(rows=C, cols=1)
-  ema_mean = rand(rows=C, cols=1)
-  ema_var = rand(rows=C, cols=1)
-  #[dummy, dummy, ema_mean, ema_var] = batch_norm2d::init(C)
-
-  # Check training & testing modes
-  for (i in 1:2) {
-    if (i == 1)
-      mode = 'train'
-    else
-      mode = 'test'
-    print(" - Grad checking the '"+mode+"' mode.")
-
-    # Compute analytical gradients of loss wrt parameters
-    [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
-        batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
-    dout = l2_loss::backward(out, y)
-    [dX, dgamma, dbeta] = batch_norm2d::backward(dout, out, ema_mean_upd, ema_var_upd,
-                                                 cache_mean, cache_var, cache_norm,
-                                                 X, gamma, beta, C, Hin, Win, mode,
-                                                 ema_mean, ema_var, mu, eps)
-
-    # Grad check
-    h = 1e-5
-    print("   - Grad checking X.")
-    for (i in 1:nrow(X)) {
-      for (j in 1:ncol(X)) {
-        # Compute numerical derivative
-        old = as.scalar(X[i,j])
-        X[i,j] = old - h
-        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
-            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
-        lossmh = l2_loss::forward(outmh, y)
-        X[i,j] = old + h
-        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
-            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
-        lossph = l2_loss::forward(outph, y)
-        X[i,j] = old  # reset
-        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-        # Check error
-        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-      }
-    }
-
-    print("   - Grad checking gamma.")
-    for (i in 1:nrow(gamma)) {
-      for (j in 1:ncol(gamma)) {
-        # Compute numerical derivative
-        old = as.scalar(gamma[i,j])
-        gamma[i,j] = old - h
-        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
-            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
-        lossmh = l2_loss::forward(outmh, y)
-        gamma[i,j] = old + h
-        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
-            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
-        lossph = l2_loss::forward(outph, y)
-        gamma[i,j] = old  # reset
-        dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-        # Check error
-        rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
-                                                    lossph, lossmh)
-      }
-    }
-
-    print("   - Grad checking beta.")
-    for (i in 1:nrow(beta)) {
-      for (j in 1:ncol(beta)) {
-        # Compute numerical derivative
-        old = as.scalar(beta[i,j])
-        beta[i,j] = old - h
-        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
-            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
-        lossmh = l2_loss::forward(outmh, y)
-        beta[i,j] = old + h
-        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
-            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
-        lossph = l2_loss::forward(outph, y)
-        beta[i,j] = old  # reset
-        dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-        # Check error
-        rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
-                                                    lossph, lossmh)
-      }
-    }
-  }
-}
-
-conv2d = function() {
-  /*
-   * Gradient check for the 2D convolutional layer using `im2col`.
-   */
-  print("Grad checking the `im2col` 2D convolutional layer with L2 loss.")
-
-  # Generate data
-  N = 2  # num examples
-  C = 2  # num channels
-  Hin = 5  # input height
-  Win = 5  # input width
-  F = 2  # num filters
-  Hf = 3  # filter height
-  Wf = 3  # filter width
-  stride = 1
-  pad = 1
-  X = rand(rows=N, cols=C*Hin*Win)
-  y = rand(rows=N, cols=F*Hin*Win)
-
-  # Create layers
-  [W, b] = conv2d::init(F, C, Hf, Wf)
-
-  # Compute analytical gradients of loss wrt parameters
-  [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-  dout = l2_loss::backward(out, y)
-  [dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                  pad, pad)
-
-  # Grad check
-  h = 1e-5
-  print(" - Grad checking X.")
-  for (i in 1:nrow(X)) {
-    for (j in 1:ncol(X)) {
-      # Compute numerical derivative
-      old = as.scalar(X[i,j])
-      X[i,j] = old - h
-      [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-      lossmh = l2_loss::forward(outmh, y)
-      X[i,j] = old + h
-      [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-      lossph = l2_loss::forward(outph, y)
-      X[i,j] = old  # reset
-      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking W.")
-  for (i in 1:nrow(W)) {
-    for (j in 1:ncol(W)) {
-      # Compute numerical derivative
-      old = as.scalar(W[i,j])
-      W[i,j] = old - h
-      [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-      lossmh = l2_loss::forward(outmh, y)
-      W[i,j] = old + h
-      [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-      lossph = l2_loss::forward(outph, y)
-      W[i,j] = old  # reset
-      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking b.")
-  for (i in 1:nrow(b)) {
-    for (j in 1:ncol(b)) {
-      # Compute numerical derivative
-      old = as.scalar(b[i,j])
-      b[i,j] = old - h
-      [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-      lossmh = l2_loss::forward(outmh, y)
-      b[i,j] = old + h
-      [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-      lossph = l2_loss::forward(outph, y)
-      b[i,j] = old  # reset
-      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
-    }
-  }
-}
-
-conv2d_builtin = function() {
-  /*
-   * Gradient check for the 2D convolutional layer using built-in
-   * functions.
-   */
-  print("Grad checking the built-in 2D convolutional layer with L2 loss.")
-
-  # Generate data
-  N = 2  # num examples
-  C = 2  # num channels
-  Hin = 5  # input height
-  Win = 5  # input width
-  F = 2  # num filters
-  Hf = 3  # filter height
-  Wf = 3  # filter width
-  stride = 1
-  pad = 1
-  X = rand(rows=N, cols=C*Hin*Win)
-  y = rand(rows=N, cols=F*Hin*Win)
-
-  # Create layers
-  [W, b] = conv2d_builtin::init(F, C, Hf, Wf)
-
-  # Compute analytical gradients of loss wrt parameters
-  [out, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                              pad, pad)
-  dout = l2_loss::backward(out, y)
-  [dX, dW, db] = conv2d_builtin::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
-                                          stride, stride, pad, pad)
-
-  # Grad check
-  h = 1e-5
-  print(" - Grad checking X.")
-  for (i in 1:nrow(X)) {
-    for (j in 1:ncol(X)) {
-      # Compute numerical derivative
-      old = as.scalar(X[i,j])
-      X[i,j] = old - h
-      [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                    pad, pad)
-      lossmh = l2_loss::forward(outmh, y)
-      X[i,j] = old + h
-      [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                    pad, pad)
-      lossph = l2_loss::forward(outph, y)
-      X[i,j] = old  # reset
-      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking W.")
-  for (i in 1:nrow(W)) {
-    for (j in 1:ncol(W)) {
-      # Compute numerical derivative
-      old = as.scalar(W[i,j])
-      W[i,j] = old - h
-      [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                    pad, pad)
-      lossmh = l2_loss::forward(outmh, y)
-      W[i,j] = old + h
-      [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                    pad, pad)
-      lossph = l2_loss::forward(outph, y)
-      W[i,j] = old  # reset
-      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking b.")
-  for (i in 1:nrow(b)) {
-    for (j in 1:ncol(b)) {
-      # Compute numerical derivative
-      old = as.scalar(b[i,j])
-      b[i,j] = old - h
-      [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                    pad, pad)
-      lossmh = l2_loss::forward(outmh, y)
-      b[i,j] = old + h
-      [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                    pad, pad)
-      lossph = l2_loss::forward(outph, y)
-      b[i,j] = old  # reset
-      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
-    }
-  }
-}
-
-conv2d_simple = function() {
-  /*
-   * Gradient check for the simple reference 2D convolutional layer.
-   */
-  print("Grad checking the simple reference 2D convolutional layer with L2 loss.")
-
-  # Generate data
-  N = 2  # num examples
-  C = 2  # num channels
-  Hin = 5  # input height
-  Win = 5  # input width
-  F = 2  # num filters
-  Hf = 3  # filter height
-  Wf = 3  # filter width
-  stride = 1
-  pad = 1
-  X = rand(rows=N, cols=C*Hin*Win)
-  y = rand(rows=N, cols=F*Hin*Win)
-
-  # Create layers
-  [W, b] = conv2d_simple::init(F, C, Hf, Wf)
-
-  # Compute analytical gradients of loss wrt parameters
-  [out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-  dout = l2_loss::backward(out, y)
-  [dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
-                                         stride, stride, pad, pad)
-
-  # Grad check
-  h = 1e-5
-  print(" - Grad checking X.")
-  for (i in 1:nrow(X)) {
-    for (j in 1:ncol(X)) {
-      # Compute numerical derivative
-      old = as.scalar(X[i,j])
-      X[i,j] = old - h
-      [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                   pad, pad)
-      lossmh = l2_loss::forward(outmh, y)
-      X[i,j] = old + h
-      [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                   pad, pad)
-      lossph = l2_loss::forward(outph, y)
-      X[i,j] = old  # reset
-      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking W.")
-  for (i in 1:nrow(W)) {
-    for (j in 1:ncol(W)) {
-      # Compute numerical derivative
-      old = as.scalar(W[i,j])
-      W[i,j] = old - h
-      [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                   pad, pad)
-      lossmh = l2_loss::forward(outmh, y)
-      W[i,j] = old + h
-      [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                   pad, pad)
-      lossph = l2_loss::forward(outph, y)
-      W[i,j] = old  # reset
-      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking b.")
-  for (i in 1:nrow(b)) {
-    for (j in 1:ncol(b)) {
-      # Compute numerical derivative
-      old = as.scalar(b[i,j])
-      b[i,j] = old - h
-      [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                   pad, pad)
-      lossmh = l2_loss::forward(outmh, y)
-      b[i,j] = old + h
-      [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
-                                                   pad, pad)
-      lossph = l2_loss::forward(outph, y)
-      b[i,j] = old  # reset
-      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
-    }
-  }
-}
-
-cross_entropy_loss = function() {
-  /*
-   * Gradient check for the cross-entropy loss function.
-   */
-  print("Grad checking the cross-entropy loss function.")
-
-  # Generate data
-  N = 3 # num examples
-  K = 10 # num targets
-  pred = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
-  pred = pred / rowSums(pred)  # normalized probs
-  y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
-  y = y / rowSums(y)  # normalized probs
-
-  # Compute analytical gradient
-  dpred = cross_entropy_loss::backward(pred, y)
-
-  # Grad check
-  h = 1e-5
-  for (i in 1:nrow(pred)) {
-    for (j in 1:ncol(pred)) {
-      # Compute numerical derivative
-      old = as.scalar(pred[i,j])
-      pred[i,j] = old - h
-      lossmh = cross_entropy_loss::forward(pred, y)
-      pred[i,j] = old + h
-      lossph = cross_entropy_loss::forward(pred, y)
-      pred[i,j] = old  # reset
-      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
-    }
-  }
-}
-
-dropout = function() {
-  /*
-   * Gradient check for the (inverted) dropout layer.
-   */
-  print("Grad checking the (inverted) dropout layer with L2 loss.")
-
-  # Generate data
-  N = 3  # num examples
-  M = 100  # num neurons
-  p = 0.5  # probability of dropping neuron output
-  seed = as.integer(floor(as.scalar(rand(rows=1, cols=1, min=1, max=100000))))  # random seed
-  X = rand(rows=N, cols=M)
-  y = rand(rows=N, cols=M)
-
-  # Compute analytical gradients of loss wrt parameters
-  [out, mask] = dropout::forward(X, p, seed)
-  dout = l2_loss::backward(out, y)
-  dX = dropout::backward(dout, X, p, mask)
-
-  # Grad check
-  h = 1e-5
-  for (i in 1:nrow(X)) {
-    for (j in 1:ncol(X)) {
-      # Compute numerical derivative
-      old = as.scalar(X[i,j])
-      X[i,j] = old - h
-      [outmh, mask] = dropout::forward(X, p, seed)
-      lossmh = l2_loss::forward(outmh, y)
-      X[i,j] = old + h
-      [outph, mask] = dropout::forward(X, p, seed)
-      lossph = l2_loss::forward(outph, y)
-      X[i,j] = old  # reset
-      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-    }
-  }
-}
-
-l1_loss = function() {
-  /*
-   * Gradient check for the L1 loss function.
-   */
-  print("Grad checking the L1 loss function.")
-
-  # Generate data
-  N = 3 # num examples
-  D = 2 # num targets
-  pred = rand(rows=N, cols=D)
-  y = rand(rows=N, cols=D)
-
-  # Compute analytical gradient
-  dpred = l1_loss::backward(pred, y)
-
-  # Grad check
-  h = 1e-5
-  for (i in 1:nrow(pred)) {
-    for (j in 1:ncol(pred)) {
-      # Compute numerical derivative
-      old = as.scalar(pred[i,j])
-      pred[i,j] = old - h
-      lossmh = l1_loss::forward(pred, y)
-      pred[i,j] = old + h
-      lossph = l1_loss::forward(pred, y)
-      pred[i,j] = old  # reset
-      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
-    }
-  }
-}
-
-l1_reg = function() {
-  /*
-   * Gradient check for the L1 regularization function.
-   */
-  print("Grad checking the L1 regularization function.")
-
-  # Generate data
-  D = 5 # num features
-  M = 3 # num neurons
-  lambda = 0.01
-  W = rand(rows=D, cols=M)
-
-  # Compute analytical gradient
-  dW = l1_reg::backward(W, lambda)
-
-  # Grad check
-  h = 1e-5
-  for (i in 1:nrow(W)) {
-    for (j in 1:ncol(W)) {
-      # Compute numerical derivative
-      old = as.scalar(W[i,j])
-      W[i,j] = old - h
-      reg_lossmh = l1_reg::forward(W, lambda)
-      W[i,j] = old + h
-      reg_lossph = l1_reg::forward(W, lambda)
-      W[i,j] = old  # reset W[i,j]
-      dW_num = (reg_lossph-reg_lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
-                                                  reg_lossph, reg_lossmh)
-    }
-  }
-}
-
-l2_loss = function() {
-  /*
-   * Gradient check for the L2 loss function.
-   */
-  print("Grad checking the L2 loss function.")
-
-  # Generate data
-  N = 3 # num examples
-  D = 2 # num targets
-  pred = rand(rows=N, cols=D)
-  y = rand(rows=N, cols=D)
-
-  # Compute analytical gradient
-  dpred = l2_loss::backward(pred, y)
-
-  # Grad check
-  h = 1e-5
-  for (i in 1:nrow(pred)) {
-    for (j in 1:ncol(pred)) {
-      # Compute numerical derivative
-      old = as.scalar(pred[i,j])
-      pred[i,j] = old - h
-      lossmh = l2_loss::forward(pred, y)
-      pred[i,j] = old + h
-      lossph = l2_loss::forward(pred, y)
-      pred[i,j] = old  # reset
-      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
-    }
-  }
-}
-
-l2_reg = function() {
-  /*
-   * Gradient check for the L2 regularization function.
-   */
-  print("Grad checking the L2 regularization function.")
-
-  # Generate data
-  D = 5 # num features
-  M = 3 # num neurons
-  lambda = 0.01
-  W = rand(rows=D, cols=M)
-
-  # Compute analytical gradient
-  dW = l2_reg::backward(W, lambda)
-
-  # Grad check
-  h = 1e-5
-  for (i in 1:nrow(W)) {
-    for (j in 1:ncol(W)) {
-      # Compute numerical derivative
-      old = as.scalar(W[i,j])
-      W[i,j] = old - h
-      reg_lossmh = l2_reg::forward(W, lambda)
-      W[i,j] = old + h
-      reg_lossph = l2_reg::forward(W, lambda)
-      W[i,j] = old  # reset W[i,j]
-      dW_num = (reg_lossph-reg_lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
-                                                  reg_lossph, reg_lossmh)
-    }
-  }
-}
-
-log_loss = function() {
-  /*
-   * Gradient check for the log loss function.
-   */
-  print("Grad checking the log loss function.")
-
-  # Generate data
-  N = 20 # num examples
-  D = 1 # num targets
-  pred = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
-  y = round(rand(rows=N, cols=D, min=0, max=1, pdf="uniform"))
-
-  # Compute analytical gradient
-  dpred = log_loss::backward(pred, y)
-
-  # Grad check
-  h = 1e-5
-  for (i in 1:nrow(pred)) {
-    for (j in 1:ncol(pred)) {
-      # Compute numerical derivative
-      old = as.scalar(pred[i,j])
-      pred[i,j] = old - h
-      lossmh = log_loss::forward(pred, y)
-      pred[i,j] = old + h
-      lossph = log_loss::forward(pred, y)
-      pred[i,j] = old  # reset
-      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
-    }
-  }
-}
-
-lstm = function() {
-  /*
-   * Gradient check for the LSTM layer.
-   */
-  print("Grad checking the LSTM layer with L2 loss.")
-
-  # Generate data
-  N = 3  # num examples
-  D = 10  # num features
-  T = 15  # num timesteps (sequence length)
-  M = 5 # num neurons
-  return_seq = TRUE
-  X = rand(rows=N, cols=T*D)
-  y = rand(rows=N, cols=T*M)
-  yc = rand(rows=N, cols=M)
-  out0 = rand(rows=N, cols=M)
-  c0 = rand(rows=N, cols=M)
-  [W, b, dummy, dummy2] = lstm::init(N, D, M)
-
-  # Compute analytical gradients of loss wrt parameters
-  [out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-  dout = l2_loss::backward(out, y)
-  dc = l2_loss::backward(c, yc)
-  [dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, return_seq, out0, c0,
-                                            cache_out, cache_c, cache_ifog)
-
-  # Grad check
-  h = 1e-5
-  print(" - Grad checking X.")
-  for (i in 1:nrow(X)) {
-    for (j in 1:ncol(X)) {
-      # Compute numerical derivative
-      old = as.scalar(X[i,j])
-      X[i,j] = old - h
-      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outmh = l2_loss::forward(outmh, y)
-      loss_cmh = l2_loss::forward(cmh, yc)
-      lossmh = loss_outmh + loss_cmh
-      X[i,j] = old + h
-      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outph = l2_loss::forward(outph, y)
-      loss_cph = l2_loss::forward(cph, yc)
-      lossph = loss_outph + loss_cph
-      X[i,j] = old  # reset
-      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking W.")
-  for (i in 1:nrow(W)) {
-    for (j in 1:ncol(W)) {
-      # Compute numerical derivative
-      old = as.scalar(W[i,j])
-      W[i,j] = old - h
-      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outmh = l2_loss::forward(outmh, y)
-      loss_cmh = l2_loss::forward(cmh, yc)
-      lossmh = loss_outmh + loss_cmh
-      W[i,j] = old + h
-      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outph = l2_loss::forward(outph, y)
-      loss_cph = l2_loss::forward(cph, yc)
-      lossph = loss_outph + loss_cph
-      W[i,j] = old  # reset
-      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking b.")
-  for (i in 1:nrow(b)) {
-    for (j in 1:ncol(b)) {
-      # Compute numerical derivative
-      old = as.scalar(b[i,j])
-      b[i,j] = old - h
-      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outmh = l2_loss::forward(outmh, y)
-      loss_cmh = l2_loss::forward(cmh, yc)
-      lossmh = loss_outmh + loss_cmh
-      b[i,j] = old + h
-      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outph = l2_loss::forward(outph, y)
-      loss_cph = l2_loss::forward(cph, yc)
-      lossph = loss_outph + loss_cph
-      b[i,j] = old  # reset
-      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking out0.")
-  for (i in 1:nrow(out0)) {
-    for (j in 1:ncol(out0)) {
-      # Compute numerical derivative
-      old = as.scalar(out0[i,j])
-      out0[i,j] = old - h
-      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outmh = l2_loss::forward(outmh, y)
-      loss_cmh = l2_loss::forward(cmh, yc)
-      lossmh = loss_outmh + loss_cmh
-      out0[i,j] = old + h
-      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outph = l2_loss::forward(outph, y)
-      loss_cph = l2_loss::forward(cph, yc)
-      lossph = loss_outph + loss_cph
-      out0[i,j] = old  # reset
-      dout0_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking c0.")
-  for (i in 1:nrow(c0)) {
-    for (j in 1:ncol(c0)) {
-      # Compute numerical derivative
-      old = as.scalar(c0[i,j])
-      c0[i,j] = old - h
-      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outmh = l2_loss::forward(outmh, y)
-      loss_cmh = l2_loss::forward(cmh, yc)
-      lossmh = loss_outmh + loss_cmh
-      c0[i,j] = old + h
-      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outph = l2_loss::forward(outph, y)
-      loss_cph = l2_loss::forward(cph, yc)
-      lossph = loss_outph + loss_cph
-      c0[i,j] = old  # reset
-      dc0_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh)
-    }
-  }
-}
-
-max_pool2d = function() {
-  /*
-   * Gradient check for the 2D max pooling layer.
-   */
-  print("Grad checking the 2D max pooling layer with L2 loss.")
-
-  # Generate data
-  N = 2  # num examples
-  C = 2  # num channels
-  Hin = 4  # input height
-  Win = 4  # input width
-  Hf = 2  # pool filter height
-  Wf = 2  # pool filter width
-  stride = 2
-  X = rand(rows=N, cols=C*Hin*Win)
-
-  for (pad in 0:1) {
-    print(" - Grad checking w/ pad="+pad+".")
-    Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
-    Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
-    y = rand(rows=N, cols=C*Hout*Wout)
-
-    # Compute analytical gradients of loss wrt parameters
-    [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-    dout = l2_loss::backward(out, y)
-    dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-
-    # Grad check
-    h = 1e-5
-    for (i in 1:nrow(X)) {
-      for (j in 1:ncol(X)) {
-        # Compute numerical derivative
-        old = as.scalar(X[i,j])
-        X[i,j] = old - h
-        [outmh, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-        lossmh = l2_loss::forward(outmh, y)
-        X[i,j] = old + h
-        [outph, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-        lossph = l2_loss::forward(outph, y)
-        X[i,j] = old  # reset
-        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-        # Check error
-        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-      }
-    }
-  }
-}
-
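# Editorial note (not part of the original file): with the sizes used above, the output
# size formula Hout = floor((Hin + 2*pad - Hf)/stride + 1) works out to
#   pad=0:  (4 + 0 - 2)/2 + 1 = 2,  so out has shape (N, C*2*2)
#   pad=1:  (4 + 2 - 2)/2 + 1 = 3,  so out has shape (N, C*3*3)
# which is why y is regenerated inside the pad loop.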
-max_pool2d_builtin = function() {
-  /*
-   * Gradient check for the 2D max pooling layer.
-   */
-  print("Grad checking the built-in 2D max pooling layer with L2 loss.")
-
-  # Generate data
-  N = 2  # num examples
-  C = 2  # num channels
-  Hin = 4  # input height
-  Win = 4  # input width
-  Hf = 2  # pool filter height
-  Wf = 2  # pool filter width
-  stride = 2
-  X = rand(rows=N, cols=C*Hin*Win)
-
-  for (pad in 0:1) {
-    print(" - Grad checking w/ pad="+pad+".")
-    Hout = as.integer(floor((Hin + 2 * pad - Hf) / stride + 1))
-    Wout = as.integer(floor((Win + 2 * pad - Wf) / stride + 1))
-    y = rand(rows=N, cols=C*Hout*Wout)
-
-    # Compute analytical gradients of loss wrt parameters
-    [out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
-                                                    pad, pad)
-    dout = l2_loss::backward(out, y)
-    dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
-                                      pad, pad)
-
-    # Grad check
-    h = 1e-5
-    for (i in 1:nrow(X)) {
-      for (j in 1:ncol(X)) {
-        # Compute numerical derivative
-        old = as.scalar(X[i,j])
-        X[i,j] = old - h
-        [outmh, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
-                                                          pad, pad)
-        lossmh = l2_loss::forward(outmh, y)
-        X[i,j] = old + h
-        [outph, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
-                                                          pad, pad)
-        lossph = l2_loss::forward(outph, y)
-        X[i,j] = old  # reset
-        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-        # Check error
-        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-      }
-    }
-  }
-}
-
-max_pool2d_simple = function() {
-  /*
-   * Gradient check for the simple reference 2D max pooling layer.
-   */
-  print("Grad checking the simple reference 2D max pooling layer with L2 loss.")
-
-  # Generate data
-  N = 2  # num examples
-  C = 2  # num channels
-  Hin = 4  # input height
-  Win = 4  # input width
-  Hf = 2  # pool filter height
-  Wf = 2  # pool filter width
-  stride = 2
-  X = rand(rows=N, cols=C*Hin*Win)
-
-  for (pad in 0:1) {
-    print(" - Grad checking w/ pad="+pad+".")
-    Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
-    Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
-    y = rand(rows=N, cols=C*Hout*Wout)
-
-    # Compute analytical gradients of loss wrt parameters
-    [out, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-    dout = l2_loss::backward(out, y)
-    dX = max_pool2d_simple::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
-                                     pad, pad)
-
-    # Grad check
-    h = 1e-5
-    for (i in 1:nrow(X)) {
-      for (j in 1:ncol(X)) {
-        # Compute numerical derivative
-        old = as.scalar(X[i,j])
-        X[i,j] = old - h
-        [outmh, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
-                                                         pad, pad)
-        lossmh = l2_loss::forward(outmh, y)
-        X[i,j] = old + h
-        [outph, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
-                                                         pad, pad)
-        lossph = l2_loss::forward(outph, y)
-        X[i,j] = old  # reset
-        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-        # Check error
-        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-      }
-    }
-  }
-}
-
-relu = function() {
-  /*
-   * Gradient check for the ReLU nonlinearity layer.
-   *
-   * NOTE: This could result in a false negative in which the test
-   * fails due to a kink being crossed in the nonlinearity.  This
-   * occurs when the evaluations f(x-h) and f(x+h) end up on opposite
-   * sides of the zero threshold of max(0, x).  For now, just rerun
-   * the test.  In the future, we could explicitly check for this
-   * case and rerun the test automatically.
-   */
-  print("Grad checking the ReLU nonlinearity layer with L2 loss.")
-
-  # Generate data
-  N = 3 # num examples
-  M = 10 # num neurons
-  X = rand(rows=N, cols=M, min=-5, max=5)
-  y = rand(rows=N, cols=M)
-
-  # Compute analytical gradients of loss wrt parameters
-  out = relu::forward(X)
-  dout = l2_loss::backward(out, y)
-  dX = relu::backward(dout, X)
-
-  # Grad check
-  h = 1e-5
-  for (i in 1:nrow(X)) {
-    for (j in 1:ncol(X)) {
-      # Compute numerical derivative
-      old = as.scalar(X[i,j])
-      X[i,j] = old - h
-      outmh = relu::forward(X)
-      lossmh = l2_loss::forward(outmh, y)
-      X[i,j] = old + h
-      outph = relu::forward(X)
-      lossph = l2_loss::forward(outph, y)
-      X[i,j] = old  # reset
-      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-    }
-  }
-}
-
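# Editorial worked example (not part of the original file) of the kink issue noted in the
# relu check above: for f(x) = max(0, x) at x = h/2, the analytical derivative is 1, but
# the centered difference gives (max(0, 1.5*h) - max(0, -0.5*h)) / (2*h) = 0.75, a large
# relative error even though the layer is implemented correctly.  Rerunning with fresh
# random data almost always avoids landing that close to the kink.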
-rnn = function() {
-  /*
-   * Gradient check for the simple RNN layer.
-   */
-  print("Grad checking the simple RNN layer with L2 loss.")
-
-  # Generate data
-  N = 3  # num examples
-  D = 10  # num features
-  T = 15  # num timesteps (sequence length)
-  M = 5 # num neurons
-  return_seq = TRUE
-  X = rand(rows=N, cols=T*D)
-  y = rand(rows=N, cols=T*M)
-  out0 = rand(rows=N, cols=M)
-  [W, b, dummy] = rnn::init(N, D, M)
-
-  # Compute analytical gradients of loss wrt parameters
-  [out, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-  dout = l2_loss::backward(out, y)
-  [dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, return_seq, out0, cache_out)
-
-  # Grad check
-  h = 1e-5
-  print(" - Grad checking X.")
-  for (i in 1:nrow(X)) {
-    for (j in 1:ncol(X)) {
-      # Compute numerical derivative
-      old = as.scalar(X[i,j])
-      X[i,j] = old - h
-      [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-      lossmh = l2_loss::forward(outmh, y)
-      X[i,j] = old + h
-      [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-      lossph = l2_loss::forward(outph, y)
-      X[i,j] = old  # reset
-      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking W.")
-  for (i in 1:nrow(W)) {
-    for (j in 1:ncol(W)) {
-      # Compute numerical derivative
-      old = as.scalar(W[i,j])
-      W[i,j] = old - h
-      [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-      lossmh = l2_loss::forward(outmh, y)
-      W[i,j] = old + h
-      [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-      lossph = l2_loss::forward(outph, y)
-      W[i,j] = old  # reset
-      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking b.")
-  for (i in 1:nrow(b)) {
-    for (j in 1:ncol(b)) {
-      # Compute numerical derivative
-      old = as.scalar(b[i,j])
-      b[i,j] = old - h
-      [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-      lossmh = l2_loss::forward(outmh, y)
-      b[i,j] = old + h
-      [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-      lossph = l2_loss::forward(outph, y)
-      b[i,j] = old  # reset
-      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking out0.")
-  for (i in 1:nrow(out0)) {
-    for (j in 1:ncol(out0)) {
-      # Compute numerical derivative
-      old = as.scalar(out0[i,j])
-      out0[i,j] = old - h
-      [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-      lossmh = l2_loss::forward(outmh, y)
-      out0[i,j] = old + h
-      [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-      lossph = l2_loss::forward(outph, y)
-      out0[i,j] = old  # reset
-      dout0_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
-    }
-  }
-}
-
-scale_shift1d = function() {
-  /*
-   * Gradient check for the 1D scale & shift layer.
-   */
-  print("Grad checking the 1D scale & shift layer with L2 loss.")
-
-  # Generate data
-  N = 3 # num examples
-  D = 100 # num features
-  X = rand(rows=N, cols=D)
-  y = rand(rows=N, cols=D)
-  [gamma, beta] = scale_shift1d::init(D)
-
-  # Compute analytical gradients of loss wrt parameters
-  out = scale_shift1d::forward(X, gamma, beta)
-  dout = l2_loss::backward(out, y)
-  [dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta)
-
-  # Grad check
-  h = 1e-5
-  print(" - Grad checking X.")
-  for (i in 1:nrow(X)) {
-    for (j in 1:ncol(X)) {
-      # Compute numerical derivative
-      old = as.scalar(X[i,j])
-      X[i,j] = old - h
-      outmh = scale_shift1d::forward(X, gamma, beta)
-      lossmh = l2_loss::forward(outmh, y)
-      X[i,j] = old + h
-      outph = scale_shift1d::forward(X, gamma, beta)
-      lossph = l2_loss::forward(outph, y)
-      X[i,j] = old  # reset
-      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking gamma.")
-  for (i in 1:nrow(gamma)) {
-    for (j in 1:ncol(gamma)) {
-      # Compute numerical derivative
-      old = as.scalar(gamma[i,j])
-      gamma[i,j] = old - h
-      outmh = scale_shift1d::forward(X, gamma, beta)
-      lossmh = l2_loss::forward(outmh, y)
-      gamma[i,j] = old + h
-      outph = scale_shift1d::forward(X, gamma, beta)
-      lossph = l2_loss::forward(outph, y)
-      gamma[i,j] = old  # reset
-      dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
-                                                  lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking beta.")
-  for (i in 1:nrow(beta)) {
-    for (j in 1:ncol(beta)) {
-      # Compute numerical derivative
-      old = as.scalar(beta[i,j])
-      beta[i,j] = old - h
-      outmh = scale_shift1d::forward(X, gamma, beta)
-      lossmh = l2_loss::forward(outmh, y)
-      beta[i,j] = old + h
-      outph = scale_shift1d::forward(X, gamma, beta)
-      lossph = l2_loss::forward(outph, y)
-      beta[i,j] = old  # reset
-      dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
-                                                  lossph, lossmh)
-    }
-  }
-}
-
-scale_shift2d = function() {
-  /*
-   * Gradient check for the 2D scale & shift layer.
-   */
-  print("Grad checking the 2D scale & shift layer with L2 loss.")
-
-  # Generate data
-  N = 3 # num examples
-  C = 2  # num channels
-  Hin = 5  # input height
-  Win = 5  # input width
-  X = rand(rows=N, cols=C*Hin*Win)
-  y = rand(rows=N, cols=C*Hin*Win)
-  [gamma, beta] = scale_shift2d::init(C)
-
-  # Compute analytical gradients of loss wrt parameters
-  out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
-  dout = l2_loss::backward(out, y)
-  [dX, dgamma, dbeta] = scale_shift2d::backward(dout, out, X, gamma, beta, C, Hin, Win)
-
-  # Grad check
-  h = 1e-5
-  print(" - Grad checking X.")
-  for (i in 1:nrow(X)) {
-    for (j in 1:ncol(X)) {
-      # Compute numerical derivative
-      old = as.scalar(X[i,j])
-      X[i,j] = old - h
-      outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
-      lossmh = l2_loss::forward(outmh, y)
-      X[i,j] = old + h
-      outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
-      lossph = l2_loss::forward(outph, y)
-      X[i,j] = old  # reset
-      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking gamma.")
-  for (i in 1:nrow(gamma)) {
-    for (j in 1:ncol(gamma)) {
-      # Compute numerical derivative
-      old = as.scalar(gamma[i,j])
-      gamma[i,j] = old - h
-      outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
-      lossmh = l2_loss::forward(outmh, y)
-      gamma[i,j] = old + h
-      outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
-      lossph = l2_loss::forward(outph, y)
-      gamma[i,j] = old  # reset
-      dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
-                                                  lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking beta.")
-  for (i in 1:nrow(beta)) {
-    for (j in 1:ncol(beta)) {
-      # Compute numerical derivative
-      old = as.scalar(beta[i,j])
-      beta[i,j] = old - h
-      outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
-      lossmh = l2_loss::forward(outmh, y)
-      beta[i,j] = old + h
-      outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
-      lossph = l2_loss::forward(outph, y)
-      beta[i,j] = old  # reset
-      dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
-                                                  lossph, lossmh)
-    }
-  }
-}
-
-sigmoid = function() {
-  /*
-   * Gradient check for the sigmoid nonlinearity layer.
-   */
-  print("Grad checking the sigmoid nonlinearity layer with L2 loss.")
-
-  # Generate data
-  N = 3 # num examples
-  M = 10 # num neurons
-  X = rand(rows=N, cols=M)
-  y = rand(rows=N, cols=M)
-
-  # Compute analytical gradients of loss wrt parameters
-  out = sigmoid::forward(X)
-  dout = l2_loss::backward(out, y)
-  dX = sigmoid::backward(dout, X)
-
-  # Grad check
-  h = 1e-5
-  for (i in 1:nrow(X)) {
-    for (j in 1:ncol(X)) {
-      # Compute numerical derivative
-      old = as.scalar(X[i,j])
-      X[i,j] = old - h
-      outmh = sigmoid::forward(X)
-      lossmh = l2_loss::forward(outmh, y)
-      X[i,j] = old + h
-      outph = sigmoid::forward(X)
-      lossph = l2_loss::forward(outph, y)
-      X[i,j] = old  # reset
-      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-    }
-  }
-}
-
-softmax = function() {
-  /*
-   * Gradient check for the softmax layer.
-   */
-  print("Grad checking the softmax layer with L2 loss.")
-
-  # Generate data
-  N = 3 # num examples
-  D = 10 # num classes
-  X = rand(rows=N, cols=D)
-  y = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
-  y = y / rowSums(y)
-
-  # Compute analytical gradients of loss wrt parameters
-  out = softmax::forward(X)
-  dout = l2_loss::backward(out, y)
-  dX = softmax::backward(dout, X)
-
-  # Grad check
-  h = 1e-5
-  for (i in 1:nrow(X)) {
-    for (j in 1:ncol(X)) {
-      # Compute numerical derivative
-      old = as.scalar(X[i,j])
-      X[i,j] = old - h
-      outmh = softmax::forward(X)
-      lossmh = l2_loss::forward(outmh, y)
-      X[i,j] = old + h
-      outph = softmax::forward(X)
-      lossph = l2_loss::forward(outph, y)
-      X[i,j] = old  # reset
-      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-    }
-  }
-}
-
-tanh = function() {
-  /*
-   * Gradient check for the hyperbolic tangent (tanh) nonlinearity
-   * layer.
-   */
-  print("Grad checking the tanh nonlinearity layer with L2 loss.")
-
-  # Generate data
-  N = 3 # num examples
-  M = 10 # num neurons
-  X = rand(rows=N, cols=M)
-  y = rand(rows=N, cols=M)
-
-  # Compute analytical gradients of loss wrt parameters
-  out = tanh::forward(X)
-  dout = l2_loss::backward(out, y)
-  dX = tanh::backward(dout, X)
-
-  # Grad check
-  h = 1e-5
-  for (i in 1:nrow(X)) {
-    for (j in 1:ncol(X)) {
-      # Compute numerical derivative
-      old = as.scalar(X[i,j])
-      X[i,j] = old - h
-      outmh = tanh::forward(X)
-      lossmh = l2_loss::forward(outmh, y)
-      X[i,j] = old + h
-      outph = tanh::forward(X)
-      lossph = l2_loss::forward(outph, y)
-      X[i,j] = old  # reset
-      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-    }
-  }
-}
-
-two_layer_affine_l2_net = function() {
-  /*
-   * Gradient check for a two-layer, fully-connected, feed-forward
-   * network with ReLU nonlinearity and L2 loss.
-   *
-   * NOTE: This could result in a false negative in which the test
-   * fails due to a kink being crossed in the ReLU nonlinearity.  This
-   * occurs when the evaluations f(x-h) and f(x+h) end up on opposite
-   * sides of the zero threshold of max(0, x).  For now, just run
-   * the tests again.  In the future, we can explicitly check for
-   * this condition and rerun the test automatically.
-   */
-  print("Grad checking a two-layer, fully-connected, feed-forward network with a ReLU " +
-        "nonlinearity, and an L2 loss function.")
-
-  # Generate input data
-  N = 1000 # num examples
-  D = 100 # num features
-  yD = 5 # num targets
-  X = rand(rows=N, cols=D, pdf="normal")
-  y = rand(rows=N, cols=yD)
-
-  # Create 2-layer, fully-connected network
-  M = 10 # number of hidden neurons
-  [W1, b1] = affine::init(D, M)
-  [W2, b2] = affine::init(M, yD)
-
-  # Optimize for a short "burn-in" period to move the parameters into a
-  # characteristic mode of operation and unmask any real issues.
-  print(" - Burn-in:")
-  lr = 0.0001
-  decay = 0.99
-  for(i in 1:5) {
-    # Compute forward and backward passes of net
-    [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
-    print("   - L2 loss: " + loss)
-
-    # Optimize with basic SGD
-    W1 = W1 - lr * dW1
-    b1 = b1 - lr * db1
-    W2 = W2 - lr * dW2
-    b2 = b2 - lr * db2
-    lr = lr * decay
-  }
-
-  # Compute analytical gradients
-  [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
-
-  # Grad check
-  h = 1e-5
-  print(" - Grad checking X.")
-  for (i in 1:2) {
-    for (j in 1:ncol(X)) {
-      # Compute numerical derivative
-      old_x = as.scalar(X[i,j])
-      X[i,j] = old_x - h
-      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
-      X[i,j] = old_x + h
-      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
-      X[i,j] = old_x  # reset X[i,j]
-      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking W1.")
-  for (i in 1:nrow(W1)) {
-    for (j in 1:ncol(W1)) {
-      # Compute numerical derivative
-      old_w = as.scalar(W1[i,j])
-      W1[i,j] = old_w - h
-      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
-      W1[i,j] = old_w + h
-      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
-      W1[i,j] = old_w  # reset W1[i,j]
-      dWij_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dW1[i,j]), dWij_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking W2.")
-  for (i in 1:nrow(W2)) {
-    for (j in 1:ncol(W2)) {
-      # Compute numerical derivative
-      old_w = as.scalar(W2[i,j])
-      W2[i,j] = old_w - h
-      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
-      W2[i,j] = old_w + h
-      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
-      W2[i,j] = old_w  # reset W2[i,j]
-      dWij_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dW2[i,j]), dWij_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking b1.")
-  for (i in 1:nrow(b1)) {
-    for (j in 1:ncol(b1)) {
-      # Compute numerical derivative
-      old_b = as.scalar(b1[i,j])
-      b1[i,j] = old_b - h
-      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
-      b1[i,j] = old_b + h
-      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
-      b1[i,j] = old_b  # reset b1[i,j]
-      dbij_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(db1[i,j]), dbij_num, lossph, lossmh)
-    }
-  }
-
-  print(" - Grad checking b2.")
-  for (i in 1:nrow(b2)) {
-    for (j in 1:ncol(b2)) {
-      # Compute numerical derivative
-      old_b = as.scalar(b2[i,j])
-      b2[i,j] = old_b - h
-      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
-      b2[i,j] = old_b + h
-      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
-      b2[i,j] = old_b  # reset b2[i,j]
-      dbij_num = (lossph-lossmh) / (2*h)  # numerical derivative
-
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(db2[i,j]), dbij_num, lossph, lossmh)
-    }
-  }
-}
-
-/*
- * Test network with forward/backward functions.
- */
-two_layer_affine_l2_net_run = function(matrix[double] X, matrix[double] y,
-                                       matrix[double] W1, matrix[double] b1,
-                                       matrix[double] W2, matrix[double] b2)
-    return (matrix[double] pred, double loss,
-            matrix[double] dX,
-            matrix[double] dW1, matrix[double] db1,
-            matrix[double] dW2, matrix[double] db2) {
-  # Compute forward pass
-  [loss, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
-
-  # Compute backward pass
-  [dX, dpred, daout, dhout, dW1, db1, dW2, db2] =
-      two_layer_affine_l2_net_backward(X, y, pred, aout, hout, W1, b1, W2, b2)
-}
-
-two_layer_affine_l2_net_forward = function(matrix[double] X, matrix[double] y,
-                                           matrix[double] W1, matrix[double] b1,
-                                           matrix[double] W2, matrix[double] b2)
-    return (double loss, matrix[double] pred, matrix[double] aout, matrix[double] hout) {
-  # Compute forward pass
-  hout = affine::forward(X, W1, b1)
-  aout = relu::forward(hout)
-  pred = affine::forward(aout, W2, b2)
-
-  # Compute loss
-  loss = l2_loss::forward(pred, y)
-}
-
-two_layer_affine_l2_net_backward = function(matrix[double] X, matrix[double] y, matrix[double] pred,
-                                            matrix[double] aout, matrix[double] hout,
-                                            matrix[double] W1, matrix[double] b1,
-                                            matrix[double] W2, matrix[double] b2)
-    return (matrix[double] dX, matrix[double] dpred,
-            matrix[double] daout, matrix[double] dhout,
-            matrix[double] dW1, matrix[double] db1, matrix[double] dW2, matrix[double] db2) {
-  # Compute backward pass
-  dpred = l2_loss::backward(pred, y)
-  [daout, dW2, db2] = affine::backward(dpred, aout, W2, b2)
-  dhout = relu::backward(daout, hout)
-  [dX, dW1, db1] = affine::backward(dhout, X, W1, b1)
-}
-
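All of the gradient checks above follow the same central-difference pattern: perturb a single entry by +/-h, recompute the loss, and compare (loss(x+h) - loss(x-h)) / (2h) against the analytical gradient. A minimal standalone sketch of that pattern, assuming the `nn/layers/sigmoid.dml`, `nn/layers/l2_loss.dml`, and `nn/test/util.dml` modules from this commit are on the script path:

```
source("nn/layers/sigmoid.dml") as sigmoid
source("nn/layers/l2_loss.dml") as l2_loss
source("nn/test/util.dml") as test_util

# Tiny random data
X = rand(rows=2, cols=3)
y = rand(rows=2, cols=3)

# Analytical gradient of the L2 loss wrt X
out = sigmoid::forward(X)
dout = l2_loss::backward(out, y)
dX = sigmoid::backward(dout, X)

# Numerical gradient of a single entry via central differences
h = 1e-5
old = as.scalar(X[1,1])
X[1,1] = old - h
outmh = sigmoid::forward(X)
lossmh = l2_loss::forward(outmh, y)
X[1,1] = old + h
outph = sigmoid::forward(X)
lossph = l2_loss::forward(outph, y)
X[1,1] = old  # reset
dX_num = (lossph-lossmh) / (2*h)

# The relative error should be small; check_rel_grad_error prints a
# WARNING or ERROR if it is not.
rel_error = test_util::check_rel_grad_error(as.scalar(dX[1,1]), dX_num, lossph, lossmh)
```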

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml b/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml
deleted file mode 100644
index 188bd6e..0000000
--- a/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml
+++ /dev/null
@@ -1,172 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Max Pooling layer.
- *
- * This implementation is intended to be a simple, reference version.
- */
-
-forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * This implementation is intended to be a simple, reference version.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
-  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
-  # Create output volume
-  out = matrix(0, rows=N, cols=C*Hout*Wout)
-
-  # Max pooling
-  parfor (n in 1:N, check=0) {  # all examples
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
-
-    # Pad image
-    pad_value = -1/0
-    Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # padded with -Inf
-    parfor (c in 1:C) {
-      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
-      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
-      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
-    }
-    img = Xn_padded  # shape (C, (Hin+2*padh)*(Win+2*padw))
-
-    parfor (c in 1:C, check=0) {  # all channels
-      img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      parfor (hout in 1:Hout, check=0) {  # all output rows
-        hin = (hout-1) * strideh + 1
-        parfor (wout in 1:Wout, check=0) {  # all output columns
-          win = (wout-1) * stridew + 1
-          out[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout] = max(img_slice[hin:hin+Hf-1,
-                                                               win:win+Wf-1])
-        }
-      }
-    }
-  }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   */
-  N = nrow(X)
-
-  # Create gradient volume
-  dX = matrix(0, rows=N, cols=C*Hin*Win)
-
-  # Gradient of max pooling
-  for (n in 1:N) {  # all examples
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
-
-    # Pad image
-    pad_value = -1/0
-    Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # padded with -Inf
-    parfor (c in 1:C) {
-      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
-      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
-      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
-    }
-    img = Xn_padded
-
-    dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
-    for (c in 1:C) {  # all channels
-      img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
-      for (hout in 1:Hout, check=0) {  # all output rows
-        hin = (hout-1) * strideh + 1
-        for (wout in 1:Wout) {  # all output columns
-          win = (wout-1) * stridew + 1
-          img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
-          max_val_ind = img_slice_patch == max(img_slice_patch)  # max value indicator matrix
-          # gradient passes through only for the max value(s) in this patch
-          dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
-          dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
-                                                   + dimg_slice_patch
-        }
-      }
-      dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
-    }
-
-    # Unpad derivs on input
-    dXn = matrix(0, rows=C, cols=Hin*Win)
-    parfor (c in 1:C, check=0) {
-      dXn_padded_slice = matrix(dimg[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
-      dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
-      dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
-    }
-    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
-  }
-}
-
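The output spatial dimensions above follow the standard pooling formula, Hout = floor((Hin + 2*padh - Hf)/strideh + 1) (and likewise for Wout). As a quick worked example matching the LeNet setup used elsewhere in this commit: with Hin = Win = 28, Hf = Wf = 2, stride 2, and no padding, Hout = floor((28 - 2)/2 + 1) = 14, so each 28x28 channel shrinks to 14x14. A minimal usage sketch, assuming this file is available on the script path as `nn/test/max_pool2d_simple.dml`:

```
source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple

N = 4; C = 3; Hin = 28; Win = 28
X = rand(rows=N, cols=C*Hin*Win)

# 2x2 max pooling, stride 2, no padding: each 28x28 channel -> 14x14
[out, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
print("Hout: " + Hout + ", Wout: " + Wout)  # expect 14, 14
```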

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/run_tests.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/run_tests.dml b/scripts/staging/SystemML-NN/nn/test/run_tests.dml
deleted file mode 100644
index d8173a9..0000000
--- a/scripts/staging/SystemML-NN/nn/test/run_tests.dml
+++ /dev/null
@@ -1,90 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Script to run tests.
- */
-source("nn/test/grad_check.dml") as grad_check
-source("nn/test/test.dml") as test
-
-print("")
-print("Starting grad checks.")
-print("---")
-
-# Loss & loss-related functions
-grad_check::cross_entropy_loss()
-grad_check::l1_loss()
-grad_check::l1_reg()
-grad_check::l2_loss()
-grad_check::l2_reg()
-grad_check::log_loss()
-print("")
-
-# Core layers
-grad_check::affine()
-grad_check::batch_norm1d()
-grad_check::batch_norm2d()
-grad_check::conv2d()
-grad_check::conv2d_builtin()
-grad_check::conv2d_simple()
-grad_check::dropout()
-grad_check::lstm()
-grad_check::max_pool2d()
-grad_check::max_pool2d_builtin()
-grad_check::max_pool2d_simple()
-grad_check::relu()
-grad_check::rnn()
-grad_check::scale_shift1d()
-grad_check::scale_shift2d()
-grad_check::sigmoid()
-grad_check::softmax()
-grad_check::tanh()
-print("")
-
-# Example model
-grad_check::two_layer_affine_l2_net()
-print("")
-
-print("---")
-print("Grad checks complete -- look for any ERRORs or WARNINGs.")
-print("If any tests involving ReLUs failed, try a few times " +
-      "to ensure that they were not false negatives due to " +
-      "kinks being crossed.")
-print("")
-
-print("")
-print("Starting other tests.")
-print("---")
-
-test::batch_norm1d()
-test::batch_norm2d()
-test::conv2d()
-test::cross_entropy_loss()
-test::im2col()
-test::max_pool2d()
-test::padding()
-test::tanh()
-
-print("---")
-print("Other tests complete -- look for any ERRORs or WARNINGs.")
-print("")
-print("")
-
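For reference, `run_tests.dml` is meant to be executed from the directory that contains the `nn/` folder so that the `source()` paths above resolve. A possible invocation, assuming the same spark-submit pattern documented in the example scripts included in this commit:

```
spark-submit --master local[*] --driver-memory 5G \
  $SYSTEMML_HOME/target/SystemML.jar -f nn/test/run_tests.dml
```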


[11/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn` library to `scripts/nn`

Posted by du...@apache.org.
[SYSTEMML-1524] Graduate `nn` library to `scripts/nn`

This graduates the SystemML `nn` deep learning library from the staging
directory to the top-level `scripts` directory.  The aim is to have the
library ready for full release by the 1.0 release, alongside Caffe2DML,
GPU support, and native BLAS.

Closes #472.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/43c321d1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/43c321d1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/43c321d1

Branch: refs/heads/master
Commit: 43c321d18675d9b76483e0d1d8b156196172efdb
Parents: 1f5cf69
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Wed Apr 26 14:40:46 2017 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Wed Apr 26 14:40:46 2017 -0700

----------------------------------------------------------------------
 scripts/nn/README.md                            |  183 ++
 scripts/nn/examples/Example - MNIST LeNet.ipynb |  189 ++
 .../Example - MNIST Softmax Classifier.ipynb    |  179 ++
 scripts/nn/examples/README.md                   |   74 +
 scripts/nn/examples/get_mnist_data.sh           |   28 +
 scripts/nn/examples/mnist_lenet-predict.dml     |   91 +
 scripts/nn/examples/mnist_lenet-train.dml       |  123 ++
 scripts/nn/examples/mnist_lenet.dml             |  331 ++++
 scripts/nn/examples/mnist_softmax-predict.dml   |   77 +
 scripts/nn/examples/mnist_softmax-train.dml     |  110 ++
 scripts/nn/examples/mnist_softmax.dml           |  178 ++
 scripts/nn/layers/affine.dml                    |   92 +
 scripts/nn/layers/batch_norm1d.dml              |  210 +++
 scripts/nn/layers/batch_norm2d.dml              |  238 +++
 scripts/nn/layers/conv2d.dml                    |  194 ++
 scripts/nn/layers/conv2d_builtin.dml            |  160 ++
 scripts/nn/layers/cross_entropy_loss.dml        |   78 +
 scripts/nn/layers/dropout.dml                   |   76 +
 scripts/nn/layers/l1_loss.dml                   |   72 +
 scripts/nn/layers/l1_reg.dml                    |   56 +
 scripts/nn/layers/l2_loss.dml                   |   72 +
 scripts/nn/layers/l2_reg.dml                    |   56 +
 scripts/nn/layers/log_loss.dml                  |   76 +
 scripts/nn/layers/lstm.dml                      |  260 +++
 scripts/nn/layers/max_pool2d.dml                |  159 ++
 scripts/nn/layers/max_pool2d_builtin.dml        |  103 +
 scripts/nn/layers/relu.dml                      |   59 +
 scripts/nn/layers/rnn.dml                       |  183 ++
 scripts/nn/layers/scale_shift1d.dml             |   95 +
 scripts/nn/layers/scale_shift2d.dml             |  107 ++
 scripts/nn/layers/sigmoid.dml                   |   62 +
 scripts/nn/layers/softmax.dml                   |   87 +
 scripts/nn/layers/tanh.dml                      |   65 +
 scripts/nn/optim/adagrad.dml                    |   77 +
 scripts/nn/optim/adam.dml                       |   97 +
 scripts/nn/optim/rmsprop.dml                    |   79 +
 scripts/nn/optim/sgd.dml                        |   42 +
 scripts/nn/optim/sgd_momentum.dml               |   71 +
 scripts/nn/optim/sgd_nesterov.dml               |   81 +
 scripts/nn/test/README.md                       |   32 +
 scripts/nn/test/conv2d_simple.dml               |  213 +++
 scripts/nn/test/grad_check.dml                  | 1769 ++++++++++++++++++
 scripts/nn/test/max_pool2d_simple.dml           |  172 ++
 scripts/nn/test/run_tests.dml                   |   90 +
 scripts/nn/test/test.dml                        |  549 ++++++
 scripts/nn/test/util.dml                        |  155 ++
 scripts/nn/util.dml                             |  202 ++
 scripts/staging/SystemML-NN/README.md           |  183 --
 .../nn/examples/Example - MNIST LeNet.ipynb     |  189 --
 .../Example - MNIST Softmax Classifier.ipynb    |  179 --
 .../staging/SystemML-NN/nn/examples/README.md   |   74 -
 .../SystemML-NN/nn/examples/get_mnist_data.sh   |   28 -
 .../nn/examples/mnist_lenet-predict.dml         |   91 -
 .../nn/examples/mnist_lenet-train.dml           |  123 --
 .../SystemML-NN/nn/examples/mnist_lenet.dml     |  331 ----
 .../nn/examples/mnist_softmax-predict.dml       |   77 -
 .../nn/examples/mnist_softmax-train.dml         |  110 --
 .../SystemML-NN/nn/examples/mnist_softmax.dml   |  178 --
 .../staging/SystemML-NN/nn/layers/affine.dml    |   92 -
 .../SystemML-NN/nn/layers/batch_norm1d.dml      |  210 ---
 .../SystemML-NN/nn/layers/batch_norm2d.dml      |  238 ---
 .../staging/SystemML-NN/nn/layers/conv2d.dml    |  194 --
 .../SystemML-NN/nn/layers/conv2d_builtin.dml    |  160 --
 .../nn/layers/cross_entropy_loss.dml            |   78 -
 .../staging/SystemML-NN/nn/layers/dropout.dml   |   76 -
 .../staging/SystemML-NN/nn/layers/l1_loss.dml   |   72 -
 .../staging/SystemML-NN/nn/layers/l1_reg.dml    |   56 -
 .../staging/SystemML-NN/nn/layers/l2_loss.dml   |   72 -
 .../staging/SystemML-NN/nn/layers/l2_reg.dml    |   56 -
 .../staging/SystemML-NN/nn/layers/log_loss.dml  |   76 -
 scripts/staging/SystemML-NN/nn/layers/lstm.dml  |  260 ---
 .../SystemML-NN/nn/layers/max_pool2d.dml        |  159 --
 .../nn/layers/max_pool2d_builtin.dml            |  103 -
 scripts/staging/SystemML-NN/nn/layers/relu.dml  |   59 -
 scripts/staging/SystemML-NN/nn/layers/rnn.dml   |  183 --
 .../SystemML-NN/nn/layers/scale_shift1d.dml     |   95 -
 .../SystemML-NN/nn/layers/scale_shift2d.dml     |  107 --
 .../staging/SystemML-NN/nn/layers/sigmoid.dml   |   62 -
 .../staging/SystemML-NN/nn/layers/softmax.dml   |   87 -
 scripts/staging/SystemML-NN/nn/layers/tanh.dml  |   65 -
 .../staging/SystemML-NN/nn/optim/adagrad.dml    |   77 -
 scripts/staging/SystemML-NN/nn/optim/adam.dml   |   97 -
 .../staging/SystemML-NN/nn/optim/rmsprop.dml    |   79 -
 scripts/staging/SystemML-NN/nn/optim/sgd.dml    |   42 -
 .../SystemML-NN/nn/optim/sgd_momentum.dml       |   71 -
 .../SystemML-NN/nn/optim/sgd_nesterov.dml       |   81 -
 scripts/staging/SystemML-NN/nn/test/README.md   |   32 -
 .../SystemML-NN/nn/test/conv2d_simple.dml       |  213 ---
 .../staging/SystemML-NN/nn/test/grad_check.dml  | 1769 ------------------
 .../SystemML-NN/nn/test/max_pool2d_simple.dml   |  172 --
 .../staging/SystemML-NN/nn/test/run_tests.dml   |   90 -
 scripts/staging/SystemML-NN/nn/test/test.dml    |  549 ------
 scripts/staging/SystemML-NN/nn/test/util.dml    |  155 --
 scripts/staging/SystemML-NN/nn/util.dml         |  202 --
 94 files changed, 7752 insertions(+), 7752 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/README.md
----------------------------------------------------------------------
diff --git a/scripts/nn/README.md b/scripts/nn/README.md
new file mode 100644
index 0000000..b80f2c6
--- /dev/null
+++ b/scripts/nn/README.md
@@ -0,0 +1,183 @@
+<!--
+{% comment %}
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+
+# SystemML-NN
+
+### A deep learning library for [Apache SystemML](https://github.com/apache/incubator-systemml).
+
+## Examples:
+#### Please see the [`examples`](nn/examples) folder for more detailed examples, or view the following two quick examples.
+### Neural net for regression with vanilla SGD:
+```python
+# Imports
+source("nn/layers/affine.dml") as affine
+source("nn/layers/l2_loss.dml") as l2_loss
+source("nn/layers/relu.dml") as relu
+source("nn/optim/sgd.dml") as sgd
+
+# Generate input data
+N = 1024 # num examples
+D = 100 # num features
+t = 1 # num targets
+X = rand(rows=N, cols=D, pdf="normal")
+y = rand(rows=N, cols=t)
+
+# Create 2-layer network:
+## affine1 -> relu1 -> affine2
+M = 64 # number of neurons
+[W1, b1] = affine::init(D, M)
+[W2, b2] = affine::init(M, t)
+
+# Initialize optimizer
+lr = 0.05  # learning rate
+mu = 0.9  # momentum
+decay = 0.99  # learning rate decay constant
+
+# Optimize
+print("Starting optimization")
+batch_size = 32
+epochs = 5
+iters = 1024 / batch_size
+for (e in 1:epochs) {
+  for(i in 1:iters) {
+    # Get next batch
+    X_batch = X[i:i+batch_size-1,]
+    y_batch = y[i:i+batch_size-1,]
+
+    # Compute forward pass
+    out1 = affine::forward(X_batch, W1, b1)
+    outr1 = relu::forward(out1)
+    out2 = affine::forward(outr1, W2, b2)
+
+    # Compute loss
+    loss = l2_loss::forward(out2, y_batch)
+    print("L2 loss: " + loss)
+
+    # Compute backward pass
+    dout2 = l2_loss::backward(out2, y_batch)
+    [doutr1, dW2, db2] = affine::backward(dout2, outr1, W2, b2)
+    dout1 = relu::backward(doutr1, out1)
+    [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1)
+
+    # Optimize with vanilla SGD
+    W1 = sgd::update(W1, dW1, lr)
+    b1 = sgd::update(b1, db1, lr)
+    W2 = sgd::update(W2, dW2, lr)
+    b2 = sgd::update(b2, db2, lr)
+  }
+  # Decay learning rate
+  lr = lr * decay
+}
+```
+
+### Neural net for multi-class classification with dropout and SGD w/ Nesterov momentum:
+```python
+# Imports
+source("nn/layers/affine.dml") as affine
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/dropout.dml") as dropout
+source("nn/layers/relu.dml") as relu
+source("nn/layers/softmax.dml") as softmax
+source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
+
+# Generate input data
+N = 1024 # num examples
+D = 100 # num features
+t = 5 # num targets
+X = rand(rows=N, cols=D, pdf="normal")
+classes = round(rand(rows=N, cols=1, min=1, max=t, pdf="uniform"))
+y = matrix(0, rows=N, cols=t)
+parfor (i in 1:N) {
+  y[i, as.scalar(classes[i,1])] = 1  # one-hot encoding
+}
+
+# Create network:
+# affine1 -> relu1 -> dropout1 -> affine2 -> relu2 -> dropout2 -> affine3 -> softmax
+H1 = 64 # number of neurons in 1st hidden layer
+H2 = 64 # number of neurons in 2nd hidden layer
+p = 0.5  # dropout probability
+[W1, b1] = affine::init(D, H1)
+[W2, b2] = affine::init(H1, H2)
+[W3, b3] = affine::init(H2, t)
+
+# Initialize SGD w/ Nesterov momentum optimizer
+lr = 0.05  # learning rate
+mu = 0.5  # momentum
+decay = 0.99  # learning rate decay constant
+vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
+vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
+vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
+
+# Optimize
+print("Starting optimization")
+batch_size = 64
+epochs = 10
+iters = 1024 / batch_size
+for (e in 1:epochs) {
+  for(i in 1:iters) {
+    # Get next batch
+    X_batch = X[i:i+batch_size-1,]
+    y_batch = y[i:i+batch_size-1,]
+
+    # Compute forward pass
+    ## layer 1:
+    out1 = affine::forward(X_batch, W1, b1)
+    outr1 = relu::forward(out1)
+    [outd1, maskd1] = dropout::forward(outr1, p, -1)
+    ## layer 2:
+    out2 = affine::forward(outd1, W2, b2)
+    outr2 = relu::forward(out2)
+    [outd2, maskd2] = dropout::forward(outr2, p, -1)
+    ## layer 3:
+    out3 = affine::forward(outd2, W3, b3)
+    probs = softmax::forward(out3)
+
+    # Compute loss
+    loss = cross_entropy_loss::forward(probs, y_batch)
+    print("Cross entropy loss: " + loss)
+
+    # Compute backward pass
+    ## loss:
+    dprobs = cross_entropy_loss::backward(probs, y_batch)
+    ## layer 3:
+    dout3 = softmax::backward(dprobs, out3)
+    [doutd2, dW3, db3] = affine::backward(dout3, outd2, W3, b3)
+    ## layer 2:
+    doutr2 = dropout::backward(doutd2, outr2, p, maskd2)
+    dout2 = relu::backward(doutr2, out2)
+    [doutd1, dW2, db2] = affine::backward(dout2, outd1, W2, b2)
+    ## layer 1:
+    doutr1 = dropout::backward(doutd1, outr1, p, maskd1)
+    dout1 = relu::backward(doutr1, out1)
+    [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1)
+
+    # Optimize with SGD w/ Nesterov momentum
+    [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1)
+    [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1)
+    [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2)
+    [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2)
+    [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3)
+    [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3)
+  }
+  # Anneal momentum towards 0.999
+  mu = mu + (0.999 - mu)/(1+epochs-e)
+  # Decay learning rate
+  lr = lr * decay
+}
+```
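As a small follow-up to the classification example above: at prediction time the dropout layers are typically skipped (as in the LeNet predict function later in this commit), and the predicted class per example can be recovered from the softmax probabilities with the `rowIndexMax` builtin. A sketch, assuming `probs` and the one-hot `y` from the example above:

```
# 1-based index of the most probable class for each example
pred_classes = rowIndexMax(probs)

# Accuracy against the one-hot targets
correct = rowIndexMax(probs) == rowIndexMax(y)
accuracy = mean(correct)
print("Accuracy: " + accuracy)
```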

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/Example - MNIST LeNet.ipynb
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/Example - MNIST LeNet.ipynb b/scripts/nn/examples/Example - MNIST LeNet.ipynb
new file mode 100644
index 0000000..0423269
--- /dev/null
+++ b/scripts/nn/examples/Example - MNIST LeNet.ipynb	
@@ -0,0 +1,189 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Quick Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a SystemML MLContext object\n",
+    "from systemml import MLContext, dml\n",
+    "ml = MLContext(sc)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Download Data - MNIST"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9].  Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%sh\n",
+    "mkdir -p data/mnist/\n",
+    "cd data/mnist/\n",
+    "curl -O https://pjreddie.com/media/files/mnist_train.csv\n",
+    "curl -O https://pjreddie.com/media/files/mnist_test.csv"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## SystemML \"LeNet\" Neural Network"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1. Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "script_string = \"\"\"\n",
+    "source(\"nn/examples/mnist_lenet.dml\") as mnist_lenet\n",
+    "\n",
+    "# Read training data\n",
+    "data = read($data, format=\"csv\")\n",
+    "n = nrow(data)\n",
+    "\n",
+    "# Extract images and labels\n",
+    "images = data[,2:ncol(data)]\n",
+    "labels = data[,1]\n",
+    "\n",
+    "# Scale images to [-1,1], and one-hot encode the labels\n",
+    "images = (images / 255.0) * 2 - 1\n",
+    "labels = table(seq(1, n), labels+1, n, 10)\n",
+    "\n",
+    "# Split into training (55,000 examples) and validation (5,000 examples)\n",
+    "X = images[5001:nrow(images),]\n",
+    "X_val = images[1:5000,]\n",
+    "y = labels[5001:nrow(images),]\n",
+    "y_val = labels[1:5000,]\n",
+    "\n",
+    "# Train\n",
+    "epochs = 10\n",
+    "[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, epochs)\n",
+    "\"\"\"\n",
+    "script = (dml(script_string).input(\"$data\", \"data/mnist/mnist_train.csv\")\n",
+    "                            .input(C=1, Hin=28, Win=28)\n",
+    "                            .output(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))\n",
+    "W1, b1, W2, b2, W3, b3, W4, b4 = (ml.execute(script)\n",
+    "                                    .get(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2. Compute Test Accuracy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "script_string = \"\"\"\n",
+    "source(\"nn/examples/mnist_lenet.dml\") as mnist_lenet\n",
+    "\n",
+    "# Read test data\n",
+    "data = read($data, format=\"csv\")\n",
+    "n = nrow(data)\n",
+    "\n",
+    "# Extract images and labels\n",
+    "X_test = data[,2:ncol(data)]\n",
+    "y_test = data[,1]\n",
+    "\n",
+    "# Scale images to [-1,1], and one-hot encode the labels\n",
+    "X_test = (X_test / 255.0) * 2 - 1\n",
+    "y_test = table(seq(1, n), y_test+1, n, 10)\n",
+    "\n",
+    "# Eval on test set\n",
+    "probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)\n",
+    "[loss, accuracy] = mnist_lenet::eval(probs, y_test)\n",
+    "\n",
+    "print(\"Test Accuracy: \" + accuracy)\n",
+    "\"\"\"\n",
+    "script = dml(script_string).input(**{\"$data\": \"data/mnist/mnist_train.csv\",\n",
+    "                                     \"C\": 1, \"Hin\": 28, \"Win\": 28,\n",
+    "                                     \"W1\": W1, \"b1\": b1,\n",
+    "                                     \"W2\": W2, \"b2\": b2,\n",
+    "                                     \"W3\": W3, \"b3\": b3,\n",
+    "                                     \"W4\": W4, \"b4\": b4})\n",
+    "ml.execute(script)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3. Extract Model Into Spark DataFrames For Future Use"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "W1_df = W1.toDF()\n",
+    "b1_df = b1.toDF()\n",
+    "W2_df = W2.toDF()\n",
+    "b2_df = b2.toDF()\n",
+    "W3_df = W3.toDF()\n",
+    "b3_df = b3.toDF()\n",
+    "W4_df = W4.toDF()\n",
+    "b4_df = b4.toDF()\n",
+    "W1_df, b1_df, W2_df, b2_df, W3_df, b3_df, W4_df, b4_df"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 + Spark 2.x + SystemML",
+   "language": "python",
+   "name": "pyspark3_2.x"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/Example - MNIST Softmax Classifier.ipynb
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/Example - MNIST Softmax Classifier.ipynb b/scripts/nn/examples/Example - MNIST Softmax Classifier.ipynb
new file mode 100644
index 0000000..5e7182a
--- /dev/null
+++ b/scripts/nn/examples/Example - MNIST Softmax Classifier.ipynb	
@@ -0,0 +1,179 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Quick Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "# Create a SystemML MLContext object\n",
+    "from systemml import MLContext, dml\n",
+    "ml = MLContext(sc)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Download Data - MNIST"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9].  Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "%%sh\n",
+    "mkdir -p data/mnist/\n",
+    "cd data/mnist/\n",
+    "curl -O https://pjreddie.com/media/files/mnist_train.csv\n",
+    "curl -O https://pjreddie.com/media/files/mnist_test.csv"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## SystemML Softmax Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1. Train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training = \"\"\"\n",
+    "source(\"nn/examples/mnist_softmax.dml\") as mnist_softmax\n",
+    "\n",
+    "# Read training data\n",
+    "data = read($data, format=\"csv\")\n",
+    "n = nrow(data)\n",
+    "\n",
+    "# Extract images and labels\n",
+    "images = data[,2:ncol(data)]\n",
+    "labels = data[,1]\n",
+    "\n",
+    "# Scale images to [0,1], and one-hot encode the labels\n",
+    "images = images / 255.0\n",
+    "labels = table(seq(1, n), labels+1, n, 10)\n",
+    "\n",
+    "# Split into training (55,000 examples) and validation (5,000 examples)\n",
+    "X = images[5001:nrow(images),]\n",
+    "X_val = images[1:5000,]\n",
+    "y = labels[5001:nrow(images),]\n",
+    "y_val = labels[1:5000,]\n",
+    "\n",
+    "# Train\n",
+    "epochs = 1\n",
+    "[W, b] = mnist_softmax::train(X, y, X_val, y_val, epochs)\n",
+    "\"\"\"\n",
+    "script = dml(training).input(\"$data\", \"data/mnist/mnist_train.csv\").output(\"W\", \"b\")\n",
+    "W, b = ml.execute(script).get(\"W\", \"b\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2. Compute Test Accuracy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "testing = \"\"\"\n",
+    "source(\"nn/examples/mnist_softmax.dml\") as mnist_softmax\n",
+    "\n",
+    "# Read test data\n",
+    "data = read($data, format=\"csv\")\n",
+    "n = nrow(data)\n",
+    "\n",
+    "# Extract images and labels\n",
+    "X_test = data[,2:ncol(data)]\n",
+    "y_test = data[,1]\n",
+    "\n",
+    "# Scale images to [0,1], and one-hot encode the labels\n",
+    "X_test = X_test / 255.0\n",
+    "y_test = table(seq(1, n), y_test+1, n, 10)\n",
+    "\n",
+    "# Eval on test set\n",
+    "probs = mnist_softmax::predict(X_test, W, b)\n",
+    "[loss, accuracy] = mnist_softmax::eval(probs, y_test)\n",
+    "\n",
+    "print(\"Test Accuracy: \" + accuracy)\n",
+    "\"\"\"\n",
+    "script = dml(testing).input(\"$data\", \"data/mnist/mnist_test.csv\", W=W, b=b)\n",
+    "ml.execute(script)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3. Extract Model Into Spark DataFrames For Future Use"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "W_df = W.toDF()\n",
+    "b_df = b.toDF()\n",
+    "W_df, b_df"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/README.md
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/README.md b/scripts/nn/examples/README.md
new file mode 100644
index 0000000..d5e9d04
--- /dev/null
+++ b/scripts/nn/examples/README.md
@@ -0,0 +1,74 @@
+<!--
+{% comment %}
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+
+# SystemML-NN Examples
+
+#### This folder contains scripts and PySpark Jupyter notebooks serving as examples of using the *SystemML-NN* (`nn`) deep learning library.
+
+---
+
+# Examples
+### MNIST Softmax Classifier
+
+* This example trains a softmax classifier, which is essentially a multi-class logistic regression model, on the MNIST data.  The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images.
+* Notebook: `Example - MNIST Softmax Classifier.ipynb`.
+* DML Functions: `mnist_softmax.dml`
+* Training script: `mnist_softmax-train.dml`
+* Prediction script: `mnist_softmax-predict.dml`
+
+### MNIST "LeNet" Neural Net
+
+* This example trains a neural network on the MNIST data using a ["LeNet" architecture](http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf). The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images.
+* Notebook: `Example - MNIST LeNet.ipynb`.
+* DML Functions: `mnist_lenet.dml`
+* Training script: `mnist_lenet-train.dml`
+* Prediction script: `mnist_lenet-predict.dml`
+
+---
+
+# Setup
+## Code
+* To run the examples, please first download and unzip the project via GitHub using the "Clone or download" button on the [homepage of the project](https://github.com/dusenberrymw/systemml-nn), *or* clone it via the following command:
+
+  ```
+  git clone https://github.com/dusenberrymw/systemml-nn.git
+  ```
+
+* Then, move into the `systemml-nn` folder via:
+  ```
+  cd systemml-nn
+  ```
+
+## Data
+* These examples use the classic [MNIST](http://yann.lecun.com/exdb/mnist/) dataset, which contains labeled 28x28 pixel images of handwritten digits in the range of 0-9.  There are 60,000 training images, and 10,000 testing images.  Of the 60,000 training images, 5,000 will be used as validation images.
+* **Download**:
+  * **Notebooks**: The data will be automatically downloaded as a step in either of the example notebooks.
+  * **Training scripts**: Please run `get_mnist_data.sh` to download the data separately.
+
+## Execution
+* These examples contain scripts written in SystemML's R-like language (`*.dml`), as well as PySpark Jupyter notebooks (`*.ipynb`).  The scripts contain the math for the algorithms, enclosed in functions, and the notebooks serve as full, end-to-end examples of reading in data, training models using the functions within the scripts, and evaluating final performance.
+* **Notebooks**: To run the notebook examples, please install the SystemML Python package with `pip install systemml`, and then start up Jupyter in the following manner from this directory (or for more information, please see [this great blog post](http://spark.tc/0-to-life-changing-application-with-apache-systemml/)):
+
+  ```
+  PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook" pyspark --master local[*] --driver-memory 3G --driver-class-path SystemML.jar --jars SystemML.jar
+  ```
+
+  Note that all printed output, such as training statistics, from the SystemML scripts will be sent to the terminal in which Jupyter was started (for now...).
+
+* **Scripts**: To run the scripts from the command line using `spark-submit`, please see the comments located at the top of the `-train` and `-predict` scripts.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/get_mnist_data.sh
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/get_mnist_data.sh b/scripts/nn/examples/get_mnist_data.sh
new file mode 100755
index 0000000..deb0c40
--- /dev/null
+++ b/scripts/nn/examples/get_mnist_data.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+DIR="$(cd "$(dirname "$0")" && pwd)"
+mkdir -p $DIR/data/mnist/
+cd $DIR/data/mnist/
+curl -O https://pjreddie.com/media/files/mnist_train.csv
+curl -O https://pjreddie.com/media/files/mnist_test.csv
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/mnist_lenet-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/mnist_lenet-predict.dml b/scripts/nn/examples/mnist_lenet-predict.dml
new file mode 100644
index 0000000..85a5307
--- /dev/null
+++ b/scripts/nn/examples/mnist_lenet-predict.dml
@@ -0,0 +1,91 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# MNIST LeNet - Predict
+#
+# This script computes the class probability predictions of a
+# trained convolutional net using the "LeNet" architecture on
+# images of handwritten digits.
+#
+# Inputs:
+#  - X: File containing images to predict on.
+#     The format is "pixel_1, pixel_2, ..., pixel_n".
+#  - C: Number of color channels in the images.
+#  - Hin: Input image height.
+#  - Win: Input image width.
+#  - model_dir: Directory containing the trained weights and biases
+#     of the model.
+#  - out_dir: Directory to store class probability predictions for
+#     each image.
+#  - fmt: [DEFAULT: "csv"] File format of `X` and output predictions.
+#     Options include: "csv", "mm", "text", and "binary".
+#
+# Outputs:
+#  - probs: File containing class probability predictions for each
+#     image.
+#
+# Data:
+# The X file should contain images of handwritten digits,
+# where each example is a 28x28 pixel image of grayscale values in
+# the range [0,255] stretched out as 784 pixels.
+#
+# Sample Invocation (running from outside the `nn` folder):
+# 1. Download images.
+#
+#   For example, save images to `nn/examples/data/mnist/images.csv`.
+#
+# 2. Execute using Spark
+#   ```
+#   spark-submit --master local[*] --driver-memory 5G
+#   --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
+#   $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_lenet-predict.dml
+#   -nvargs X=nn/examples/data/mnist/images.csv C=1 Hin=28 Win=28
+#   model_dir=nn/examples/model/mnist_lenet out_dir=nn/examples/data/mnist
+#   ```
+#
+source("nn/examples/mnist_lenet.dml") as mnist_lenet
+
+# Read input images & settings
+fmt = ifdef($fmt, "csv")
+X = read($X, format=fmt)
+C = $C
+Hin = $Hin
+Win = $Win
+
+# Scale images to [-1,1]
+X = (X / 255.0) * 2 - 1
+
+# Read model coefficients
+W1 = read($model_dir+"/W1")
+b1 = read($model_dir+"/b1")
+W2 = read($model_dir+"/W2")
+b2 = read($model_dir+"/b2")
+W3 = read($model_dir+"/W3")
+b3 = read($model_dir+"/b3")
+W4 = read($model_dir+"/W4")
+b4 = read($model_dir+"/b4")
+
+# Predict classes
+probs = mnist_lenet::predict(X, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
+
+# Output results
+write(probs, $out_dir+"/probs."+fmt, format=fmt)
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/mnist_lenet-train.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/mnist_lenet-train.dml b/scripts/nn/examples/mnist_lenet-train.dml
new file mode 100644
index 0000000..0fc733e
--- /dev/null
+++ b/scripts/nn/examples/mnist_lenet-train.dml
@@ -0,0 +1,123 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# MNIST LeNet - Train
+#
+# This script trains a convolutional net using the "LeNet" architecture
+# on images of handwritten digits.
+#
+# Inputs:
+#  - train: File containing labeled MNIST training images.
+#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
+#  - test: File containing labeled MNIST test images.
+#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
+#  - C: Number of color channels in the images.
+#  - Hin: Input image height.
+#  - Win: Input image width.
+#  - epochs: [DEFAULT: 10] Total number of full training loops over
+#     the full data set.
+#  - out_dir: [DEFAULT: "."] Directory to store weights and bias
+#     matrices of trained model, as well as final test accuracy.
+#  - fmt: [DEFAULT: "csv"] File format of `train` and `test` data.
+#     Options include: "csv", "mm", "text", and "binary".
+#
+# Outputs:
+#  - W1, W2, W3, W4: Files containing the trained weights of the model.
+#  - b1, b2, b3, b4: Files containing the trained biases of the model.
+#  - accuracy: File containing the final accuracy on the test data.
+#
+# Data:
+# The MNIST dataset contains labeled images of handwritten digits,
+# where each example is a 28x28 pixel image of grayscale values in
+# the range [0,255] stretched out as 784 pixels, and each label is
+# one of 10 possible digits in [0,9].
+#
+# Sample Invocation (running from outside the `nn` folder):
+# 1. Download data (60,000 training examples, and 10,000 test examples)
+#   ```
+#   nn/examples/get_mnist_data.sh
+#   ```
+#
+# 2. Execute using Spark
+#   ```
+#   spark-submit --master local[*] --driver-memory 10G
+#   --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
+#   $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_lenet-train.dml
+#   -nvargs train=nn/examples/data/mnist/mnist_train.csv test=nn/examples/data/mnist/mnist_test.csv
+#   C=1 Hin=28 Win=28 epochs=10 out_dir=nn/examples/model/mnist_lenet
+#   ```
+#
+source("nn/examples/mnist_lenet.dml") as mnist_lenet
+
+# Read training data & settings
+fmt = ifdef($fmt, "csv")
+train = read($train, format=fmt)
+test = read($test, format=fmt)
+C = $C
+Hin = $Hin
+Win = $Win
+epochs = ifdef($epochs, 10)
+out_dir = ifdef($out_dir, ".")
+
+# Extract images and labels
+images = train[,2:ncol(train)]
+labels = train[,1]
+X_test = test[,2:ncol(test)]
+y_test = test[,1]
+
+# Scale images to [-1,1], and one-hot encode the labels
+n = nrow(train)
+n_test = nrow(test)
+images = (images / 255.0) * 2 - 1
+labels = table(seq(1, n), labels+1, n, 10)
+X_test = (X_test / 255.0) * 2 - 1
+y_test = table(seq(1, n_test), y_test+1, n_test, 10)
+
+# Split into training (55,000 examples) and validation (5,000 examples)
+X = images[5001:nrow(images),]
+X_val = images[1:5000,]
+y = labels[5001:nrow(images),]
+y_val = labels[1:5000,]
+
+# Train
+[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, epochs)
+
+# Write model out
+write(W1, out_dir+"/W1")
+write(b1, out_dir+"/b1")
+write(W2, out_dir+"/W2")
+write(b2, out_dir+"/b2")
+write(W3, out_dir+"/W3")
+write(b3, out_dir+"/b3")
+write(W4, out_dir+"/W4")
+write(b4, out_dir+"/b4")
+
+# Eval on test set
+probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
+[loss, accuracy] = mnist_lenet::eval(probs, y_test)
+
+# Output results
+print("Test Accuracy: " + accuracy)
+write(accuracy, out_dir+"/accuracy")
+
+print("")
+print("")
+
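The one-hot encoding used in the training scripts and notebooks above, `labels = table(seq(1, n), labels+1, n, 10)`, builds an n x 10 matrix in which row i has a 1 in column `labels[i] + 1` (the +1 shifts the digit range [0,9] onto the 1-based columns [1,10]); all other entries are 0. A tiny sketch of the same idiom:

```
labels = matrix("2 0 9", rows=3, cols=1)   # three example digit labels
n = nrow(labels)
Y = table(seq(1, n), labels+1, n, 10)      # 3x10 one-hot matrix
# row 1 has its 1 in column 3, row 2 in column 1, row 3 in column 10
print(toString(Y))
```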

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/mnist_lenet.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/mnist_lenet.dml b/scripts/nn/examples/mnist_lenet.dml
new file mode 100644
index 0000000..e5755c4
--- /dev/null
+++ b/scripts/nn/examples/mnist_lenet.dml
@@ -0,0 +1,331 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * MNIST LeNet Example
+ */
+# Imports
+source("nn/layers/affine.dml") as affine
+source("nn/layers/conv2d_builtin.dml") as conv2d
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/dropout.dml") as dropout
+source("nn/layers/l2_reg.dml") as l2_reg
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
+source("nn/layers/relu.dml") as relu
+source("nn/layers/softmax.dml") as softmax
+source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
+
+train = function(matrix[double] X, matrix[double] y,
+                 matrix[double] X_val, matrix[double] y_val,
+                 int C, int Hin, int Win, int epochs)
+    return (matrix[double] W1, matrix[double] b1,
+            matrix[double] W2, matrix[double] b2,
+            matrix[double] W3, matrix[double] b3,
+            matrix[double] W4, matrix[double] b4) {
+  /*
+   * Trains a convolutional net using the "LeNet" architecture.
+   *
+   * The input matrix, X, has N examples, each represented as a 3D
+   * volume unrolled into a single vector.  The targets, y, have K
+   * classes, and are one-hot encoded.
+   *
+   * Inputs:
+   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - y: Target matrix, of shape (N, K).
+   *  - X_val: Input validation data matrix, of shape (N, C*Hin*Win).
+   *  - y_val: Target validation matrix, of shape (N, K).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - epochs: Total number of full training loops over the full data set.
+   *
+   * Outputs:
+   *  - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
+   *  - b1: 1st layer biases vector, of shape (F1, 1).
+   *  - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
+   *  - b2: 2nd layer biases vector, of shape (F2, 1).
+   *  - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
+   *  - b3: 3rd layer biases vector, of shape (1, N3).
+   *  - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
+   *  - b4: 4th layer biases vector, of shape (1, K).
+   */
+  N = nrow(X)
+  K = ncol(y)
+
+  # Create network:
+  # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
+  Hf = 5  # filter height
+  Wf = 5  # filter width
+  stride = 1
+  pad = 2  # For same dimensions, (Hf - stride) / 2
+
+  F1 = 32  # num conv filters in conv1
+  F2 = 64  # num conv filters in conv2
+  N3 = 512  # num nodes in affine3
+  # Note: affine4 has K nodes, which is equal to the number of target dimensions (num classes)
+
+  [W1, b1] = conv2d::init(F1, C, Hf, Wf)  # inputs: (N, C*Hin*Win)
+  [W2, b2] = conv2d::init(F2, F1, Hf, Wf)  # inputs: (N, F1*(Hin/2)*(Win/2))
+  [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3)  # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
+  [W4, b4] = affine::init(N3, K)  # inputs: (N, N3)
+  W4 = W4 / sqrt(2)  # different initialization, since being fed into softmax, instead of relu
+
+  # Initialize SGD w/ Nesterov momentum optimizer
+  lr = 0.01  # learning rate
+  mu = 0.9  #0.5  # momentum
+  decay = 0.95  # learning rate decay constant
+  vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
+  vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
+  vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
+  vW4 = sgd_nesterov::init(W4); vb4 = sgd_nesterov::init(b4)
+
+  # Regularization
+  lambda = 5e-04
+
+  # Optimize
+  print("Starting optimization")
+  batch_size = 64
+  iters = ceil(N / batch_size)
+  for (e in 1:epochs) {
+    for(i in 1:iters) {
+      # Get next batch
+      beg = ((i-1) * batch_size) %% N + 1
+      end = min(N, beg + batch_size - 1)
+      X_batch = X[beg:end,]
+      y_batch = y[beg:end,]
+
+      # Compute forward pass
+      ## layer 1: conv1 -> relu1 -> pool1
+      [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
+                                                pad, pad)
+      outr1 = relu::forward(outc1)
+      [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+                                                    strideh=2, stridew=2, padh=0, padw=0)
+      ## layer 2: conv2 -> relu2 -> pool2
+      [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
+                                                stride, stride, pad, pad)
+      outr2 = relu::forward(outc2)
+      [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+                                                    strideh=2, stridew=2, padh=0, padw=0)
+      ## layer 3:  affine3 -> relu3 -> dropout
+      outa3 = affine::forward(outp2, W3, b3)
+      outr3 = relu::forward(outa3)
+      [outd3, maskd3] = dropout::forward(outr3, 0.5, -1)
+      ## layer 4:  affine4 -> softmax
+      outa4 = affine::forward(outd3, W4, b4)
+      probs = softmax::forward(outa4)
+
+      # Compute loss & accuracy for training & validation data every 100 iterations.
+      if (i %% 100 == 0) {
+        # Compute training loss & accuracy
+        loss_data = cross_entropy_loss::forward(probs, y_batch)
+        loss_reg_W1 = l2_reg::forward(W1, lambda)
+        loss_reg_W2 = l2_reg::forward(W2, lambda)
+        loss_reg_W3 = l2_reg::forward(W3, lambda)
+        loss_reg_W4 = l2_reg::forward(W4, lambda)
+        loss = loss_data + loss_reg_W1 + loss_reg_W2 + loss_reg_W3 + loss_reg_W4
+        accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
+
+        # Compute validation loss & accuracy
+        probs_val = predict(X_val, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
+        loss_val = cross_entropy_loss::forward(probs_val, y_val)
+        accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
+
+        # Output results
+        print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
+              + accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
+      }
+
+      # Compute data backward pass
+      ## loss:
+      dprobs = cross_entropy_loss::backward(probs, y_batch)
+      ## layer 4:  affine4 -> softmax
+      douta4 = softmax::backward(dprobs, outa4)
+      [doutd3, dW4, db4] = affine::backward(douta4, outr3, W4, b4)
+      ## layer 3:  affine3 -> relu3 -> dropout
+      doutr3 = dropout::backward(doutd3, outr3, 0.5, maskd3)
+      douta3 = relu::backward(doutr3, outa3)
+      [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
+      ## layer 2: conv2 -> relu2 -> pool2
+      doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+                                    strideh=2, stridew=2, padh=0, padw=0)
+      doutc2 = relu::backward(doutr2, outc2)
+      [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
+                                            Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
+      ## layer 1: conv1 -> relu1 -> pool1
+      doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+                                    strideh=2, stridew=2, padh=0, padw=0)
+      doutc1 = relu::backward(doutr1, outc1)
+      [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win,
+                                              Hf, Wf, stride, stride, pad, pad)
+
+      # Compute regularization backward pass
+      dW1_reg = l2_reg::backward(W1, lambda)
+      dW2_reg = l2_reg::backward(W2, lambda)
+      dW3_reg = l2_reg::backward(W3, lambda)
+      dW4_reg = l2_reg::backward(W4, lambda)
+      dW1 = dW1 + dW1_reg
+      dW2 = dW2 + dW2_reg
+      dW3 = dW3 + dW3_reg
+      dW4 = dW4 + dW4_reg
+
+      # Optimize with SGD w/ Nesterov momentum
+      [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1)
+      [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1)
+      [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2)
+      [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2)
+      [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3)
+      [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3)
+      [W4, vW4] = sgd_nesterov::update(W4, dW4, lr, mu, vW4)
+      [b4, vb4] = sgd_nesterov::update(b4, db4, lr, mu, vb4)
+    }
+    # Anneal momentum towards 0.999
+    #mu = mu + (0.999 - mu)/(1+epochs-e)
+    # Decay learning rate
+    lr = lr * decay
+  }
+}
+
+predict = function(matrix[double] X, int C, int Hin, int Win,
+                   matrix[double] W1, matrix[double] b1,
+                   matrix[double] W2, matrix[double] b2,
+                   matrix[double] W3, matrix[double] b3,
+                   matrix[double] W4, matrix[double] b4)
+    return (matrix[double] probs) {
+  /*
+   * Computes the class probability predictions of a convolutional
+   * net using the "LeNet" architecture.
+   *
+   * The input matrix, X, has N examples, each represented as a 3D
+   * volume unrolled into a single vector.
+   *
+   * Inputs:
+   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
+   *  - b1: 1st layer biases vector, of shape (F1, 1).
+   *  - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
+   *  - b2: 2nd layer biases vector, of shape (F2, 1).
+   *  - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
+   *  - b3: 3rd layer biases vector, of shape (1, N3).
+   *  - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
+   *  - b4: 4th layer biases vector, of shape (1, K).
+   *
+   * Outputs:
+   *  - probs: Class probabilities, of shape (N, K).
+   */
+  N = nrow(X)
+
+  # Network:
+  # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
+  Hf = 5  # filter height
+  Wf = 5  # filter width
+  stride = 1
+  pad = 2  # For same dimensions, (Hf - stride) / 2
+
+  F1 = nrow(W1)  # num conv filters in conv1
+  F2 = nrow(W2)  # num conv filters in conv2
+  N3 = ncol(W3)  # num nodes in affine3
+  K = ncol(W4)  # num nodes in affine4, equal to number of target dimensions (num classes)
+
+  # Compute predictions over mini-batches
+  probs = matrix(0, rows=N, cols=K)
+  batch_size = 64
+  iters = ceil(N / batch_size)
+  for(i in 1:iters) {
+    # Get next batch
+    beg = ((i-1) * batch_size) %% N + 1
+    end = min(N, beg + batch_size - 1)
+    X_batch = X[beg:end,]
+
+    # Compute forward pass
+    ## layer 1: conv1 -> relu1 -> pool1
+    [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
+                                              pad, pad)
+    outr1 = relu::forward(outc1)
+    [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+                                                  strideh=2, stridew=2, padh=0, padw=0)
+    ## layer 2: conv2 -> relu2 -> pool2
+    [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
+                                              stride, stride, pad, pad)
+    outr2 = relu::forward(outc2)
+    [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+                                                  strideh=2, stridew=2, padh=0, padw=0)
+    ## layer 3:  affine3 -> relu3
+    outa3 = affine::forward(outp2, W3, b3)
+    outr3 = relu::forward(outa3)
+    ## layer 4:  affine4 -> softmax
+    outa4 = affine::forward(outr3, W4, b4)
+    probs_batch = softmax::forward(outa4)
+
+    # Store predictions
+    probs[beg:end,] = probs_batch
+  }
+}
+
+eval = function(matrix[double] probs, matrix[double] y)
+    return (double loss, double accuracy) {
+  /*
+   * Evaluates a convolutional net using the "LeNet" architecture.
+   *
+   * The probs matrix contains the class probability predictions
+   * of K classes over N examples.  The targets, y, have K classes,
+   * and are one-hot encoded.
+   *
+   * Inputs:
+   *  - probs: Class probabilities, of shape (N, K).
+   *  - y: Target matrix, of shape (N, K).
+   *
+   * Outputs:
+   *  - loss: Scalar loss, of shape (1).
+   *  - accuracy: Scalar accuracy, of shape (1).
+   */
+  # Compute loss & accuracy
+  loss = cross_entropy_loss::forward(probs, y)
+  correct_pred = rowIndexMax(probs) == rowIndexMax(y)
+  accuracy = mean(correct_pred)
+}
+
+generate_dummy_data = function()
+    return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
+  /*
+   * Generate a dummy dataset similar to the MNIST dataset.
+   *
+   * Outputs:
+   *  - X: Input data matrix, of shape (N, D).
+   *  - y: Target matrix, of shape (N, K).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   */
+  # Generate dummy input data
+  N = 1024  # num examples
+  C = 1  # num input channels
+  Hin = 28  # input height
+  Win = 28  # input width
+  K = 10  # num target classes
+  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
+  classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
+  y = table(seq(1, N), classes)  # one-hot encoding
+}
+
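A quick sanity-check sketch that exercises the functions above end-to-end without real data, wiring `generate_dummy_data`, `train`, `predict`, and `eval` together (one epoch, purely illustrative):

```
source("nn/examples/mnist_lenet.dml") as mnist_lenet

# Dummy training and "validation" sets of the same shape as MNIST
[X, y, C, Hin, Win] = mnist_lenet::generate_dummy_data()
[X_val, y_val, C, Hin, Win] = mnist_lenet::generate_dummy_data()

# One epoch is enough to exercise the full code path
[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, 1)
probs = mnist_lenet::predict(X, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
[loss, accuracy] = mnist_lenet::eval(probs, y)
print("Dummy-data check -- loss: " + loss + ", accuracy: " + accuracy)
```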

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/mnist_softmax-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/mnist_softmax-predict.dml b/scripts/nn/examples/mnist_softmax-predict.dml
new file mode 100644
index 0000000..4c8c434
--- /dev/null
+++ b/scripts/nn/examples/mnist_softmax-predict.dml
@@ -0,0 +1,77 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# MNIST Softmax - Predict
+#
+# This script computes the class probability predictions of a
+# trained softmax classifier on images of handwritten digits.
+#
+# Inputs:
+#  - X: File containing training images.
+#     The format is "pixel_1, pixel_2, ..., pixel_n".
+#  - model_dir: Directory containing the trained weights and biases
+#     of the model.
+#  - out_dir: Directory to store class probability predictions for
+#     each image.
+#  - fmt: [DEFAULT: "csv"] File format of `X` and output predictions.
+#     Options include: "csv", "mm", "text", and "binary".
+#
+# Outputs:
+#  - probs: File containing class probability predictions for each
+#     image.
+#
+# Data:
+# The X file should contain images of handwritten digits,
+# where each example is a 28x28 pixel image of grayscale values in
+# the range [0,255] stretched out as 784 pixels.
+#
+# Sample Invocation:
+# 1. Download images.
+#
+#   For example, save images to `nn/examples/data/mnist/images.csv`.
+#
+# 2. Execute using Spark
+#   ```
+#   spark-submit --master local[*] --driver-memory 5G
+#   --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
+#   $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_softmax-predict.dml
+#   -nvargs X=nn/examples/data/mnist/images.csv
+#   model_dir=nn/examples/model/mnist_softmax out_dir=nn/examples/data/mnist
+#   ```
+#
+source("nn/examples/mnist_softmax.dml") as mnist_softmax
+
+# Read training data
+fmt = ifdef($fmt, "csv")
+X = read($X, format=fmt)
+
+# Scale images to [0,1]
+X = X / 255.0
+
+# Read model coefficients
+W = read($model_dir+"/W")
+b = read($model_dir+"/b")
+
+# Predict classes
+probs = mnist_softmax::predict(X, W, b)
+
+# Output results
+write(probs, $out_dir+"/probs."+fmt, format=fmt)
+
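If hard class labels are wanted rather than probabilities, a small follow-up script along these lines could post-process the `probs` output; the paths mirror the sample invocation above and are assumptions:

```
# Read the saved probabilities and derive a predicted digit per image
probs = read("nn/examples/data/mnist/probs.csv", format="csv")
preds = rowIndexMax(probs) - 1  # rowIndexMax is 1-based, so shift to digits in [0,9]
write(preds, "nn/examples/data/mnist/preds.csv", format="csv")
```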

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/mnist_softmax-train.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/mnist_softmax-train.dml b/scripts/nn/examples/mnist_softmax-train.dml
new file mode 100644
index 0000000..09970f0
--- /dev/null
+++ b/scripts/nn/examples/mnist_softmax-train.dml
@@ -0,0 +1,110 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# MNIST Softmax - Train
+#
+# This script trains a softmax classifier on images of handwritten
+# digits.
+#
+# Inputs:
+#  - train: File containing labeled MNIST training images.
+#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
+#  - test: File containing labeled MNIST test images.
+#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
+#  - out_dir: Directory to store weights and bias matrices of
+#     trained model, as well as final test accuracy.
+#  - fmt: [DEFAULT: "csv"] File format of `train` and `test` data.
+#     Options include: "csv", "mm", "text", and "binary".
+#
+# Outputs:
+#  - W: File containing the trained weights of the model.
+#  - b: File containing the trained biases of the model.
+#  - accuracy: File containing the final accuracy on the test data.
+#
+# Data:
+# The MNIST dataset contains labeled images of handwritten digits,
+# where each example is a 28x28 pixel image of grayscale values in
+# the range [0,255] stretched out as 784 pixels, and each label is
+# one of 10 possible digits in [0,9].
+#
+# Sample Invocation (running from outside the `nn` folder):
+# 1. Download data (60,000 training examples, and 10,000 test examples)
+#   ```
+#   nn/examples/get_mnist_data.sh
+#   ```
+#
+# 2. Execute using Spark
+#   ```
+#   spark-submit --master local[*] --driver-memory 10G
+#   --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
+#   $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_softmax-train.dml
+#   -nvargs train=nn/examples/data/mnist/mnist_train.csv test=nn/examples/data/mnist/mnist_test.csv
+#   epochs=1 out_dir=nn/examples/model/mnist_softmax
+#   ```
+#
+source("nn/examples/mnist_softmax.dml") as mnist_softmax
+
+# Read training data
+fmt = ifdef($fmt, "csv")
+train = read($train, format=fmt)
+test = read($test, format=fmt)
+epochs = ifdef($epochs, 1)
+out_dir = ifdef($out_dir, ".")
+
+# Extract images and labels
+images = train[,2:ncol(train)]
+labels = train[,1]
+X_test = test[,2:ncol(test)]
+y_test = test[,1]
+
+# Scale images to [0,1], and one-hot encode the labels
+n = nrow(train)
+n_test = nrow(test)
+classes = 10
+images = images / 255.0
+labels = table(seq(1, n), labels+1, n, classes)
+X_test = X_test / 255.0
+y_test = table(seq(1, n_test), y_test+1, n_test, classes)
+
+# Split into training (55,000 examples) and validation (5,000 examples)
+X = images[5001:nrow(images),]
+X_val = images[1:5000,]
+y = labels[5001:nrow(images),]
+y_val = labels[1:5000,]
+
+# Train
+[W, b] = mnist_softmax::train(X, y, X_val, y_val, epochs)
+
+# Write model out
+write(W, out_dir+"/W")
+write(b, out_dir+"/b")
+
+# Eval on test set
+probs = mnist_softmax::predict(X_test, W, b)
+[loss, accuracy] = mnist_softmax::eval(probs, y_test)
+
+# Output results
+print("Test Accuracy: " + accuracy)
+write(accuracy, out_dir+"/accuracy")
+
+print("")
+print("")
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/mnist_softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/mnist_softmax.dml b/scripts/nn/examples/mnist_softmax.dml
new file mode 100644
index 0000000..a529a12
--- /dev/null
+++ b/scripts/nn/examples/mnist_softmax.dml
@@ -0,0 +1,178 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * MNIST Softmax Example
+ */
+# Imports
+source("nn/layers/affine.dml") as affine
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/softmax.dml") as softmax
+source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
+
+train = function(matrix[double] X, matrix[double] y,
+                 matrix[double] X_val, matrix[double] y_val,
+                 int epochs)
+    return (matrix[double] W, matrix[double] b) {
+  /*
+   * Trains a softmax classifier.
+   *
+   * The input matrix, X, has N examples, each with D features.
+   * The targets, y, have K classes, and are one-hot encoded.
+   *
+   * Inputs:
+   *  - X: Input data matrix, of shape (N, D).
+   *  - y: Target matrix, of shape (N, K).
+   *  - X_val: Input validation data matrix, of shape (N, D).
+   *  - y_val: Target validation matrix, of shape (N, K).
+   *  - epochs: Total number of full training loops over the full data set.
+   *
+   * Outputs:
+   *  - W: Weights (parameters) matrix, of shape (D, K).
+   *  - b: Biases vector, of shape (1, K).
+   */
+  N = nrow(X)  # num examples
+  D = ncol(X)  # num features
+  K = ncol(y)  # num classes
+
+  # Create softmax classifier:
+  # affine -> softmax
+  [W, b] = affine::init(D, K)
+  # Rescale the sqrt(2/D) "He" init to sqrt(1/D), since the output feeds a softmax rather than a relu
+  W = W / sqrt(2.0/(D)) * sqrt(1/(D))
+
+  # Initialize SGD w/ Nesterov momentum optimizer
+  lr = 0.2  # learning rate
+  mu = 0  # momentum
+  decay = 0.99  # learning rate decay constant
+  vW = sgd_nesterov::init(W)  # optimizer momentum state for W
+  vb = sgd_nesterov::init(b)  # optimizer momentum state for b
+
+  # Optimize
+  print("Starting optimization")
+  batch_size = 50
+  iters = 1000  # fixed number of iterations per epoch (instead of ceil(N / batch_size))
+  for (e in 1:epochs) {
+    for(i in 1:iters) {
+      # Get next batch
+      beg = ((i-1) * batch_size) %% N + 1
+      end = min(N, beg + batch_size - 1)
+      X_batch = X[beg:end,]
+      y_batch = y[beg:end,]
+
+      # Compute forward pass
+      ## affine & softmax:
+      out = affine::forward(X_batch, W, b)
+      probs = softmax::forward(out)
+
+      # Compute loss & accuracy for training & validation data
+      loss = cross_entropy_loss::forward(probs, y_batch)
+      accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
+      probs_val = predict(X_val, W, b)
+      loss_val = cross_entropy_loss::forward(probs_val, y_val)
+      accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
+      print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: " +
+            accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
+
+      # Compute backward pass
+      ## loss:
+      dprobs = cross_entropy_loss::backward(probs, y_batch)
+      ## affine & softmax:
+      dout = softmax::backward(dprobs, out)
+      [dX_batch, dW, db] = affine::backward(dout, X_batch, W, b)
+
+      # Optimize with SGD w/ Nesterov momentum
+      [W, vW] = sgd_nesterov::update(W, dW, lr, mu, vW)
+      [b, vb] = sgd_nesterov::update(b, db, lr, mu, vb)
+    }
+    # Anneal momentum towards 0.999
+    mu = mu + (0.999 - mu)/(1+epochs-e)
+    # Decay learning rate
+    lr = lr * decay
+  }
+}
+
+predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
+    return (matrix[double] probs) {
+  /*
+   * Computes the class probability predictions of a softmax classifier.
+   *
+   * The input matrix, X, has N examples, each with D features.
+   *
+   * Inputs:
+   *  - X: Input data matrix, of shape (N, D).
+   *  - W: Weights (parameters) matrix, of shape (D, K).
+   *  - b: Biases vector, of shape (1, K).
+   *
+   * Outputs:
+   *  - probs: Class probabilities, of shape (N, K).
+   */
+  # Compute forward pass
+  ## affine & softmax:
+  out = affine::forward(X, W, b)
+  probs = softmax::forward(out)
+}
+
+eval = function(matrix[double] probs, matrix[double] y)
+    return (double loss, double accuracy) {
+  /*
+   * Evaluates a softmax classifier.
+   *
+   * The probs matrix contains the class probability predictions
+   * of K classes over N examples.  The targets, y, have K classes,
+   * and are one-hot encoded.
+   *
+   * Inputs:
+   *  - probs: Class probabilities, of shape (N, K).
+   *  - y: Target matrix, of shape (N, K).
+   *
+   * Outputs:
+   *  - loss: Scalar loss, of shape (1).
+   *  - accuracy: Scalar accuracy, of shape (1).
+   */
+  # Compute loss & accuracy
+  loss = cross_entropy_loss::forward(probs, y)
+  correct_pred = rowIndexMax(probs) == rowIndexMax(y)
+  accuracy = mean(correct_pred)
+}
+
+generate_dummy_data = function()
+    return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
+  /*
+   * Generate a dummy dataset similar to the MNIST dataset.
+   *
+   * Outputs:
+   *  - X: Input data matrix, of shape (N, D).
+   *  - y: Target matrix, of shape (N, K).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   */
+  # Generate dummy input data
+  N = 1024  # num examples
+  C = 1  # num input channels
+  Hin = 28  # input height
+  Win = 28  # input width
+  T = 10  # num targets
+  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
+  classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))
+  y = table(seq(1, N), classes)  # one-hot encoding
+}
+
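As with the LeNet example, a minimal sanity-check sketch that wires the functions above together on dummy data (one epoch, illustrative only):

```
source("nn/examples/mnist_softmax.dml") as mnist_softmax

[X, y, C, Hin, Win] = mnist_softmax::generate_dummy_data()
[X_val, y_val, C, Hin, Win] = mnist_softmax::generate_dummy_data()
[W, b] = mnist_softmax::train(X, y, X_val, y_val, 1)
probs = mnist_softmax::predict(X, W, b)
[loss, accuracy] = mnist_softmax::eval(probs, y)
print("Dummy-data check -- loss: " + loss + ", accuracy: " + accuracy)
```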

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/affine.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/affine.dml b/scripts/nn/layers/affine.dml
new file mode 100644
index 0000000..c9a740b
--- /dev/null
+++ b/scripts/nn/layers/affine.dml
@@ -0,0 +1,92 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Affine (fully-connected) layer.
+ */
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b)
+    return (matrix[double] out) {
+  /*
+   * Computes the forward pass for an affine (fully-connected) layer
+   * with M neurons.  The input data has N examples, each with D
+   * features.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, D).
+   *  - W: Weights, of shape (D, M).
+   *  - b: Biases, of shape (1, M).
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, M).
+   */
+  out = X %*% W + b
+}
+
+backward = function(matrix[double] dout, matrix[double] X,
+                    matrix[double] W, matrix[double] b)
+    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
+  /*
+   * Computes the backward pass for a fully-connected (affine) layer
+   * with M neurons.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of shape (N, M).
+   *  - X: Inputs, of shape (N, D).
+   *  - W: Weights, of shape (D, M).
+   *  - b: Biases, of shape (1, M).
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, D).
+   *  - dW: Gradient wrt `W`, of shape (D, M).
+   *  - db: Gradient wrt `b`, of shape (1, M).
+   */
+  dX = dout %*% t(W)
+  dW = t(X) %*% dout
+  db = colSums(dout)
+}
+
+init = function(int D, int M)
+    return (matrix[double] W, matrix[double] b) {
+  /*
+   * Initialize the parameters of this layer.
+   *
+   * Note: This is just a convenience function, and parameters
+   * may be initialized manually if needed.
+   *
+   * We use the heuristic by He et al., which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * unit-Gaussian weights by a factor of sqrt(2/n), under the
+   * assumption of relu neurons.
+   *  - http://arxiv.org/abs/1502.01852
+   *
+   * Inputs:
+   *  - D: Dimensionality of the input features (number of features).
+   *  - M: Number of neurons in this layer.
+   *
+   * Outputs:
+   *  - W: Weights, of shape (D, M).
+   *  - b: Biases, of shape (1, M).
+   */
+  W = rand(rows=D, cols=M, pdf="normal") * sqrt(2.0/D)
+  b = matrix(0, rows=1, cols=M)
+}
+
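A tiny shape-check sketch for the affine layer (values are arbitrary; only the shapes matter):

```
source("nn/layers/affine.dml") as affine

N = 2; D = 3; M = 4
X = rand(rows=N, cols=D, pdf="normal")
[W, b] = affine::init(D, M)

out = affine::forward(X, W, b)  # (N, M)
dout = matrix(1, rows=N, cols=M)  # stand-in for an upstream gradient
[dX, dW, db] = affine::backward(dout, X, W, b)  # (N, D), (D, M), (1, M)
print("out: " + nrow(out) + "x" + ncol(out) + ", dW: " + nrow(dW) + "x" + ncol(dW))
```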

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/batch_norm1d.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/batch_norm1d.dml b/scripts/nn/layers/batch_norm1d.dml
new file mode 100644
index 0000000..2ccffdb
--- /dev/null
+++ b/scripts/nn/layers/batch_norm1d.dml
@@ -0,0 +1,210 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 1D Batch Normalization layer.
+ */
+
+forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
+                   string mode, matrix[double] ema_mean, matrix[double] ema_var,
+                   double mu, double epsilon)
+    return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
+            matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
+  /*
+   * Computes the forward pass for a 1D batch normalization layer.
+   * The input data has N examples, each with D features.
+   *
+   * A batch normalization layer uses the per-feature sample mean and
+   * per-feature uncorrected sample variance during training to
+   * normalize each feature of the input data.  Additionally, it
+   * introduces learnable parameters (gamma, beta) to control the
+   * amount of normalization.
+   *
+   *   `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
+   *
+   * This implementation maintains exponential moving averages of the
+   * mean and variance during training for use during testing.
+   *
+   * Reference:
+   *  - Batch Normalization: Accelerating Deep Network Training by
+   *    Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
+   *    - https://arxiv.org/abs/1502.03167
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, D).
+   *  - gamma: Scale parameters, of shape (1, D).
+   *  - beta: Shift parameters, of shape (1, D).
+   *  - mode: 'train' or 'test' to indicate if the model is currently
+   *      being trained or tested.  During training, the current batch
+   *      mean and variance will be used to normalize the inputs, while
+   *      during testing, the exponential average of the mean and
+   *      variance over all previous batches will be used.
+   *  - ema_mean: Exponential moving average of the mean, of
+   *      shape (1, D).
+   *  - ema_var: Exponential moving average of the variance, of
+   *      shape (1, D).
+   *  - mu: Momentum value for moving averages.
+   *      Typical values are in the range of [0.9, 0.999].
+   *  - epsilon: Smoothing term to avoid divide by zero errors.
+   *      Typical values are in the range of [1e-5, 1e-3].
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, D).
+   *  - ema_mean_upd: Updated exponential moving average of the mean,
+   *      of shape (1, D).
+   *  - ema_var_upd: Updated exponential moving average of the variance,
+   *      of shape (1, D).
+   *  - cache_mean: Cache of the batch mean, of shape (1, D).
+   *      Note: This is used for performance during training.
+   *  - cache_var: Cache of the batch variance, of shape (1, D).
+   *      Note: This is used for performance during training.
+   *  - cache_norm: Cache of the normalized inputs, of shape (N, D).
+   *      Note: This is used for performance during training.
+   */
+  N = nrow(X)
+
+  if (mode == 'train') {
+    # Compute feature-wise mean and variance
+    mean = colMeans(X)  # shape (1, D)
+    # var = (1/N) * colSums((X-mean)^2)
+    var = colVars(X) * ((N-1)/N)  # compute uncorrected variance, of shape (1, D)
+    # Update moving averages
+    ema_mean_upd = mu*ema_mean + (1-mu)*mean
+    ema_var_upd = mu*ema_var + (1-mu)*var
+  }
+  else {
+    # Use moving averages of mean and variance during testing
+    mean = ema_mean
+    var = ema_var
+    ema_mean_upd = ema_mean
+    ema_var_upd = ema_var
+  }
+
+  # Normalize, shift, and scale
+  # norm = (X-mean)*(var+epsilon)^(-1/2)
+  norm = (X-mean) / sqrt(var+epsilon)  # shape (N, D)
+  out = norm*gamma + beta  # shape (N, D)
+
+  # Save variable for backward pass
+  cache_mean = mean
+  cache_var = var
+  cache_norm = norm
+}
+
+backward = function(matrix[double] dout, matrix[double] out,
+                    matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
+                    matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
+                    matrix[double] X, matrix[double] gamma, matrix[double] beta,
+                    string mode, matrix[double] ema_mean, matrix[double] ema_var,
+                    double mu, double epsilon)
+      return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
+  /*
+   * Computes the backward pass for a 1D batch normalization layer.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of shape (N, D).
+   *  - out: Outputs from the forward pass, of shape (N, D).
+   *  - ema_mean_upd: Updated exponential moving average of the mean
+   *      from the forward pass, of shape (1, D).
+   *  - ema_var_upd: Updated exponential moving average of the variance
+   *      from the forward pass, of shape (1, D).
+   *  - cache_mean: Cache of the batch mean from the forward pass, of
+   *      shape (1, D).  Note: This is used for performance during
+   *      training.
+   *  - cache_var: Cache of the batch variance from the forward pass,
+   *      of shape (1, D).  Note: This is used for performance during
+   *      training.
+   *  - cache_norm: Cache of the normalized inputs from the forward
+   *      pass, of shape (N, D).  Note: This is used for performance
+   *      during training.
+   *  - X: Inputs, of shape (N, D).
+   *  - gamma: Scale parameters, of shape (1, D).
+   *  - beta: Shift parameters, of shape (1, D).
+   *  - mode: 'train' or 'test' to indicate if the model is currently
+   *      being trained or tested.  During training, the current batch
+   *      mean and variance will be used to normalize the inputs, while
+   *      during testing, the exponential average of the mean and
+   *      variance over all previous batches will be used.
+   *  - ema_mean: Exponential moving average of the mean, of
+   *      shape (1, D).
+   *  - ema_var: Exponential moving average of the variance, of
+   *      shape (1, D).
+   *  - mu: Momentum value for moving averages.
+   *      Typical values are in the range of [0.9, 0.999].
+   *  - epsilon: Smoothing term to avoid divide by zero errors.
+   *      Typical values are in the range of [1e-5, 1e-3].
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, D).
+   *  - dgamma: Gradient wrt `gamma`, of shape (1, D).
+   *  - dbeta: Gradient wrt `beta`, of shape (1, D).
+   *
+   */
+  N = nrow(X)
+  mean = cache_mean
+  var = cache_var
+  norm = cache_norm
+  centered = X-mean
+
+  if (mode == 'train') {
+    # Compute gradients during training
+    dgamma = colSums(dout*norm)  # shape (1, D)
+    dbeta = colSums(dout)  # shape (1, D)
+    dnorm = dout * gamma  # shape (N, D)
+    dvar = (-1/2) * colSums(centered * (var+epsilon)^(-3/2) * dnorm)  # shape (1, D)
+    dmean = colSums((-dnorm/sqrt(var+epsilon)) + ((-2/N)*centered*dvar))  # shape (1, D)
+    dX = (dnorm/sqrt(var+epsilon)) + ((2/N)*centered*dvar) + ((1/N)*dmean)  # shape (N, D)
+  }
+  else {
+    # Compute gradients during testing
+    dgamma = colSums(dout*norm)  # shape (1, D)
+    dbeta = colSums(dout)  # shape (1, D)
+    dnorm = dout * gamma  # shape (N, D)
+    dX = dnorm / sqrt(var+epsilon)  # shape (N, D)
+  }
+}
+
+init = function(int D)
+    return (matrix[double] gamma, matrix[double] beta,
+            matrix[double] ema_mean, matrix[double] ema_var) {
+  /*
+   * Initialize the parameters of this layer.
+   *
+   * Note: This is just a convenience function, and parameters
+   * may be initialized manually if needed.
+   *
+   * Inputs:
+   *  - D: Dimensionality of the input features (number of features).
+   *
+   * Outputs:
+   *  - gamma: Scale parameters, of shape (1, D).
+   *  - beta: Shift parameters, of shape (1, D).
+   *  - ema_mean: Exponential moving average of the mean, of
+   *      shape (1, D).
+   *  - ema_var: Exponential moving average of the variance, of
+   *      shape (1, D).
+   */
+   gamma = matrix(1, rows=1, cols=D)
+   beta = matrix(0, rows=1, cols=D)
+   ema_mean = matrix(0, rows=1, cols=D)
+   ema_var = matrix(1, rows=1, cols=D)
+}
+
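A short sketch of the intended train/test usage pattern for this layer: batch statistics plus moving-average updates during training, then the accumulated averages at test time (values are arbitrary):

```
source("nn/layers/batch_norm1d.dml") as batch_norm1d

N = 4; D = 4
X = matrix(seq(1, N*D), rows=N, cols=D)
[gamma, beta, ema_mean, ema_var] = batch_norm1d::init(D)

# Training step: normalize with batch stats and update the moving averages
[out, ema_mean, ema_var, c_mean, c_var, c_norm] =
    batch_norm1d::forward(X, gamma, beta, 'train', ema_mean, ema_var, 0.9, 1e-5)

# Test time: reuse the accumulated moving averages instead of batch stats
[out_test, m, v, cm, cv, cn] =
    batch_norm1d::forward(X, gamma, beta, 'test', ema_mean, ema_var, 0.9, 1e-5)
```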


[07/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn` library to `scripts/nn`

Posted by du...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/test.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/test.dml b/scripts/nn/test/test.dml
new file mode 100644
index 0000000..a5cb497
--- /dev/null
+++ b/scripts/nn/test/test.dml
@@ -0,0 +1,549 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Various tests, not including gradient checks.
+ */
+source("nn/layers/batch_norm1d.dml") as batch_norm1d
+source("nn/layers/batch_norm2d.dml") as batch_norm2d
+source("nn/layers/conv2d.dml") as conv2d
+source("nn/layers/conv2d_builtin.dml") as conv2d_builtin
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/max_pool2d.dml") as max_pool2d
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
+source("nn/layers/tanh.dml") as tanh
+source("nn/test/conv2d_simple.dml") as conv2d_simple
+source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
+source("nn/test/util.dml") as test_util
+source("nn/util.dml") as util
+
+batch_norm1d = function() {
+  /*
+   * Test for the 1D batch normalization function.
+   */
+  print("Testing the 1D batch normalization function.")
+
+  # Generate data
+  N = 4  # Number of examples
+  D = 4  # Number of features
+  mode = 'train'  # execution mode
+  mu = 0.9  # momentum of moving averages
+  eps = 1e-5  # smoothing term
+  X = matrix(seq(1,16), rows=N, cols=D)
+
+  # Create layer
+  [gamma, beta, ema_mean, ema_var] = batch_norm1d::init(D)
+
+  # Forward
+  [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+      batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+
+  # Equivalency check
+  target = matrix("-1.34160721 -1.34160721 -1.34160733 -1.34160709
+                   -0.44720244 -0.44720244 -0.44720244 -0.44720232
+                    0.44720244  0.44720232  0.44720244  0.44720244
+                    1.34160733  1.34160721  1.34160733  1.34160733", rows=1, cols=N*D)
+  out = matrix(out, rows=1, cols=N*D)
+  for (i in 1:length(out)) {
+    rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
+                                           as.scalar(target[1,i]), 1e-3, 1e-4)
+  }
+}
+
+conv2d = function() {
+  /*
+   * Test for the 2D convolution functions.
+   */
+  print("Testing the 2D convolution functions.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 3  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  F = 2  # num filters
+  Hf = 3  # filter height
+  Wf = 3  # filter width
+  stride = 1
+  pad = 1
+  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
+
+  # Create layer
+  [W, b] = conv2d::init(F, C, Hf, Wf)
+
+  # Forward
+  [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  [out_simple, Hout_simple, Wout_simple] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf,
+                                                                  stride, stride, pad, pad)
+  [out_builtin, Hout_builtin, Wout_builtin] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf,
+                                                                      stride, stride, pad, pad)
+
+  # Equivalency check
+  out = matrix(out, rows=1, cols=N*F*Hout*Wout)
+  out_simple = matrix(out_simple, rows=1, cols=N*F*Hout*Wout)
+  out_builtin = matrix(out_builtin, rows=1, cols=N*F*Hout*Wout)
+  for (i in 1:length(out)) {
+    rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
+                                           as.scalar(out_simple[1,i]), 1e-10, 1e-12)
+    rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
+                                           as.scalar(out_builtin[1,i]), 1e-10, 1e-12)
+  }
+}
+
+cross_entropy_loss = function() {
+  /*
+   * Test for the cross-entropy loss function.
+   *
+   * Here we make sure that the cross-entropy loss function does
+   * not propagate `infinity` values in the case that a prediction is
+   * exactly equal to 0.
+   */
+  print("Testing the cross-entropy loss function with zero-valued predictions.")
+
+  # Generate data
+  N = 3 # num examples
+  K = 10 # num targets
+  pred = matrix(0, rows=N, cols=K)
+  y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
+  y = y / rowSums(y)  # normalized probs
+
+  loss = cross_entropy_loss::forward(pred, y)
+
+  inf = 1/0
+  if (loss == inf) {
+    print("ERROR: The cross-entropy loss function ouptuts infinity for all-zero predictions.")
+  }
+}
+
+im2col = function() {
+  /*
+   * Test for the `im2col` and `col2im` functions.
+   */
+  print("Testing the im2col and col2im functions.")
+
+  # Generate data
+  C = 3  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  Hf = 3  # filter height
+  Wf = 3  # filter width
+  stride = 2
+  pad = (Hin * stride - Hin + Hf - stride) / 2
+  Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
+  Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
+  x = rand(rows=C, cols=Hin*Win)
+
+  # pad
+  x_pad = util::pad_image(x, Hin, Win, pad, pad, 0)
+
+  # im2col
+  x_cols = util::im2col(x_pad, Hin+2*pad, Win+2*pad, Hf, Wf, stride, stride)
+
+  if (ncol(x_cols) != Hout*Wout) {
+    print("ERROR: im2col does not yield the correct output size: "
+          + ncol(x_cols)+" (actual) vs. "+Hout*Wout+" (correct).")
+  }
+
+  # col2im
+  x_pad2 = util::col2im(x_cols, C, Hin+2*pad, Win+2*pad, Hf, Wf, stride, stride, "none")
+
+  # Equivalency check
+  equivalent = test_util::all_equal(x_pad, x_pad2)
+  if (!equivalent) {
+    print("ERROR: im2col and then col2im does not yield the original image.")
+  }
+}
+
+padding = function() {
+  /*
+   * Test for the `pad_image` and `unpad_image` functions.
+   */
+  print("Testing the padding and unpadding functions.")
+
+  # Generate data
+  C = 3  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  pad = 3  # padding
+  x = rand(rows=C, cols=Hin*Win)
+
+  # Pad image
+  x_pad = util::pad_image(x, Hin, Win, pad, pad, 0)
+
+  # Check for padded rows & columns
+  for (c in 1:C) {
+    x_pad_slice = matrix(x_pad[c,], rows=Hin+2*pad, cols=Win+2*pad)
+    for (i in 1:pad) {
+      rowsum = sum(x_pad_slice[i,])
+      colsum = sum(x_pad_slice[,i])
+      if (rowsum != 0)
+        print("ERROR: Padding was not applied to row " + i + ".")
+      if (colsum != 0)
+        print("ERROR: Padding was not applied to column " + i + ".")
+    }
+  }
+
+  # Unpad image
+  x1 = util::unpad_image(x_pad, Hin, Win, pad, pad)
+
+  # Equivalency check
+  equivalent = test_util::all_equal(x, x1)
+  if (!equivalent) {
+    print("ERROR: Padding and then unpadding does not yield the original image.")
+  }
+}
+
+max_pool2d = function() {
+  /*
+   * Test for the 2D max pooling functions.
+   */
+  print("Testing the 2D max pooling functions.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 3  # num channels
+  Hin = 8  # input height
+  Win = 8  # input width
+  Hf = 2  # filter height
+  Wf = 2  # filter width
+  stride = 2
+  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
+
+  for (padh in 0:3) {
+    for (padw in 0:3) {
+      print(" - Testing w/ padh="+padh+" & padw="+padw+".")
+      #if (1==1) {}  # force correct printing
+      #print("   - Testing forward")
+      [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, padh, padw)
+      [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
+                                                                          stride, stride,
+                                                                          padh, padw)
+      [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win,
+                                                                              Hf, Wf,
+                                                                              stride, stride,
+                                                                              padh, padw)
+
+      # Equivalency check
+      out = matrix(out, rows=1, cols=N*C*Hout*Wout)
+      out_simple = matrix(out_simple, rows=1, cols=N*C*Hout*Wout)
+      out_builtin = matrix(out_builtin, rows=1, cols=N*C*Hout*Wout)
+      for (i in 1:length(out)) {
+        rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
+                                               as.scalar(out_simple[1,i]), 1e-10, 1e-12)
+        rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
+                                               as.scalar(out_builtin[1,i]), 1e-10, 1e-12)
+      }
+
+      #print("   - Testing backward")
+      dout = rand(rows=N, cols=C*Hout*Wout, pdf="normal")
+      dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
+                                padh, padw)
+      dX_simple = max_pool2d_simple::backward(dout, Hout_simple, Wout_simple, X, C, Hin, Win,
+                                              Hf, Wf, stride, stride, padh, padw)
+      dX_builtin = max_pool2d_builtin::backward(dout, Hout_builtin, Wout_builtin, X, C, Hin, Win,
+                                                Hf, Wf, stride, stride, padh, padw)
+
+      # Equivalency check
+      dX = matrix(dX, rows=1, cols=N*C*Hin*Win)
+      dX_simple = matrix(dX_simple, rows=1, cols=N*C*Hin*Win)
+      dX_builtin = matrix(dX_builtin, rows=1, cols=N*C*Hin*Win)
+      for (i in 1:length(dX)) {
+        rel_error = test_util::check_rel_error(as.scalar(dX[1,i]),
+                                               as.scalar(dX_simple[1,i]), 1e-10, 1e-12)
+        rel_error = test_util::check_rel_error(as.scalar(dX[1,i]),
+                                               as.scalar(dX_builtin[1,i]), 1e-10, 1e-12)
+      }
+    }
+  }
+
+  # ---
+  print(" - Testing for correct behavior against known answer w/ pad=0.")
+  # generate data
+  # -- channel 1
+  #  1  2  3  4
+  #  5  6  7  8
+  #  9 10 11 12
+  # 13 14 15 16
+  # -- channel 2
+  #  1  5  9 13
+  #  2  6 10 14
+  #  3  7 11 15
+  #  4  8 12 16
+  C = 2  # num channels
+  Hin = 4  # input height
+  Win = 4  # input width
+  X = matrix(seq(1,16,1), rows=Hin, cols=Win)
+  X = matrix(rbind(X, t(X)), rows=1, cols=C*Hin*Win)  # C=2
+  X = rbind(X, X)  # n=2
+  pad = 0
+
+  # forward
+  [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
+                                                                      stride, stride, pad, pad)
+  [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
+                                                                          stride, stride, pad, pad)
+
+  # equivalency check
+  # -- channel 1
+  #   6  8
+  #  14 16
+  # -- channel 2
+  #  6  14
+  #  8  16
+  target = matrix("6 8 14 16 6 14 8 16", rows=1, cols=C*Hout*Wout)
+  target = rbind(target, target)  # n=2
+  tmp = test_util::check_all_equal(out, target)
+  tmp = test_util::check_all_equal(out_simple, target)
+  tmp = test_util::check_all_equal(out_builtin, target)
+
+  print(" - Testing for correct behavior against known answer w/ pad=1.")
+  # generate data
+  # -- channel 1
+  #  0  0  0  0  0  0
+  #  0  1  2  3  4  0
+  #  0  5  6  7  8  0
+  #  0  9 10 11 12  0
+  #  0 13 14 15 16  0
+  #  0  0  0  0  0  0
+  # -- channel 2
+  #  0  0  0  0  0  0
+  #  0  1  5  9 13  0
+  #  0  2  6 10 14  0
+  #  0  3  7 11 15  0
+  #  0  4  8 12 16  0
+  #  0  0  0  0  0  0
+  pad = 1
+
+  # forward
+  [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
+                                                                      stride, stride, pad, pad)
+  [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
+                                                                          stride, stride, pad, pad)
+
+  # equivalency check
+  # -- channel 1
+  #  1  3  4
+  #  9 11 12
+  # 13 15 16
+  # -- channel 2
+  #  1  9 13
+  #  3 11 15
+  #  4 12 16
+  target = matrix("1 3 4 9 11 12 13 15 16 1 9 13 3 11 15 4 12 16", rows=1, cols=C*Hout*Wout)
+  target = rbind(target, target)  # n=2
+  tmp = test_util::check_all_equal(out, target)
+  tmp = test_util::check_all_equal(out_simple, target)
+  tmp = test_util::check_all_equal(out_builtin, target)
+
+  print(" - Testing for correct behavior against known answer w/ all negative matrix w/ pad=0.")
+  # generate data
+  # -- channel 1
+  #  -1  -2  -3  -4
+  #  -5  -6  -7  -8
+  #  -9 -10 -11 -12
+  # -13 -14 -15 -16
+  # -- channel 2
+  #  -1  -5  -9 -13
+  #  -2  -6 -10 -14
+  #  -3  -7 -11 -15
+  #  -4  -8 -12 -16
+  X = X * -1
+  pad = 0
+
+  # forward
+  [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
+                                                                      stride, stride, pad, pad)
+  [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
+                                                                          stride, stride, pad, pad)
+
+  # equivalency check
+  # -- channel 1
+  #  -1  -3
+  #  -9 -11
+  # -- channel 2
+  #  -1  -9
+  #  -3 -11
+  target = matrix("-1 -3 -9 -11 -1 -9 -3 -11", rows=1, cols=C*Hout*Wout)
+  target = rbind(target, target)  # n=2
+  tmp = test_util::check_all_equal(out, target)
+  tmp = test_util::check_all_equal(out_simple, target)
+  tmp = test_util::check_all_equal(out_builtin, target)
+
+
+  print(" - Testing for correct behavior against known answer w/ all negative matrix w/ pad=1.")
+  # generate data
+  # -- channel 1
+  #  0   0   0   0   0  0
+  #  0  -1  -2  -3  -4  0
+  #  0  -5  -6  -7  -8  0
+  #  0  -9 -10 -11 -12  0
+  #  0 -13 -14 -15 -16  0
+  #  0   0   0   0   0  0
+  # -- channel 2
+  #  0   0   0   0   0  0
+  #  0  -1  -5  -9 -13  0
+  #  0  -2  -6 -10 -14  0
+  #  0  -3  -7 -11 -15  0
+  #  0  -4  -8 -12 -16  0
+  #  0   0   0   0   0  0
+  pad = 1
+
+  # forward
+  [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
+                                                                      stride, stride, pad, pad)
+  [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
+                                                                          stride, stride, pad, pad)
+
+  # equivalency check
+  # -- channel 1
+  #  -1  -2  -4
+  #  -5  -6  -8
+  # -13 -14 -16
+  # -- channel 2
+  #  -1  -5 -13
+  #  -2  -6 -14
+  #  -4  -8 -16
+  target = matrix("-1 -2 -4 -5 -6 -8 -13 -14 -16 -1 -5 -13 -2 -6 -14 -4 -8 -16",
+                  rows=1, cols=C*Hout*Wout)
+  target = rbind(target, target)  # n=2
+  tmp = test_util::check_all_equal(out, target)
+  tmp = test_util::check_all_equal(out_simple, target)
+  tmp = test_util::check_all_equal(out_builtin, target)
+}
+
+batch_norm2d = function() {
+  /*
+   * Test for the 2D (spatial) batch normalization function.
+   */
+  print("Testing the 2D (spatial) batch normalization function.")
+
+  # Generate data
+  N = 2  # Number of examples
+  C = 3  # num channels
+  Hin = 4  # input height
+  Win = 5  # input width
+  mode = 'train'  # execution mode
+  mu = 0.9  # momentum of moving averages
+  eps = 1e-5  # smoothing term
+  X = matrix("70  29 23 55 72
+              42  98 68 48 39
+              34  73 44  6 40
+              74  18 18 53 53
+
+              63  85 72 61 72
+              32  36 23 29 63
+               9  43 43 49 43
+              31  43 89 94 50
+
+              62  12 32 41 87
+              25  48 99 52 61
+              12  83 60 55 34
+              30  42 68 88 51
+
+
+              67  59 62 67 84
+               8  76 24 19 57
+              10  89 63 72  2
+              59  56 16 15 70
+
+              32  69 55 39 93
+              84  36  4 30 40
+              70 100 36 76 59
+              69  15 40 24 34
+
+              51  67 11 13 32
+              66  85 55 85 38
+              32  35 17 83 34
+              55  58 52  0 99", rows=N, cols=C*Hin*Win)
+
+  # Create layer
+  [gamma, beta, ema_mean, ema_var] = batch_norm2d::init(C)
+
+  # Forward
+  [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+      batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+
+  # Equivalency check
+  target = matrix("0.86215019 -0.76679718 -1.00517964  0.26619387  0.94161105
+                  -0.25030172  1.97460198  0.78268933 -0.01191914 -0.36949289
+                  -0.56814504  0.98134136 -0.17084086 -1.68059683 -0.32976246
+                   1.02107191 -1.20383179 -1.20383179  0.18673301  0.18673301
+
+                   0.50426388  1.41921711  0.87856293  0.42108631  0.87856293
+                  -0.78498828 -0.61863315 -1.15928721 -0.90975463  0.50426388
+                  -1.74153018 -0.32751167 -0.32751167 -0.07797909 -0.32751167
+                  -0.82657707 -0.32751167  1.58557224  1.79351616 -0.0363903
+
+                   0.4607178  -1.49978399 -0.71558321 -0.36269283  1.44096887
+                  -0.99005347 -0.08822262  1.91148913  0.06861746  0.42150795
+                  -1.49978399  1.28412855  0.38229787  0.18624771 -0.63716316
+                  -0.79400325 -0.32348287  0.69597805  1.48017895  0.0294075
+
+
+                   0.74295878  0.42511559  0.54430676  0.74295878  1.41837597
+                  -1.60113597  1.10053277 -0.96544927 -1.16410136  0.34565473
+                  -1.52167511  1.61702824  0.5840373   0.94161105 -1.83951855
+                   0.42511559  0.30592418 -1.28329265 -1.32302308  0.86215019
+
+                  -0.78498828  0.75379658  0.17155361 -0.4938668   1.75192738
+                   1.37762833 -0.61863315 -1.9494741  -0.86816585 -0.45227802
+                   0.79538536  2.04304862 -0.61863315  1.04491806  0.33790874
+                   0.75379658 -1.49199748 -0.45227802 -1.11769855 -0.70181072
+
+                   0.0294075   0.65676796 -1.53899395 -1.46057391 -0.71558321
+                   0.61755812  1.36254871  0.18624771  1.36254871 -0.48032296
+                  -0.71558321 -0.59795308 -1.30373383  1.28412855 -0.63716316
+                   0.18624771  0.30387771  0.06861746 -1.97030437  1.91148913",
+                  rows=1, cols=N*C*Hin*Win)
+  out = matrix(out, rows=1, cols=N*C*Hin*Win)
+  for (i in 1:length(out)) {
+    rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
+                                           as.scalar(target[1,i]), 1e-3, 1e-4)
+  }
+}
+
+tanh = function() {
+  /*
+   * Test for the `tanh` forward function.
+   */
+  print("Testing the tanh forward function.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 3  # num channels
+  X = rand(rows=N, cols=C, pdf="normal")
+
+  out = tanh::forward(X)
+  out_ref = (exp(X) - exp(-X)) / (exp(X) + exp(-X))
+
+  # Equivalency check
+  for (i in 1:nrow(out)) {
+    for (j in 1:ncol(out)) {
+      rel_error = test_util::check_rel_error(as.scalar(out[i,j]), as.scalar(out_ref[i,j]),
+                                             1e-10, 1e-12)
+    }
+  }
+}
+
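One caveat on the reference formula in the `tanh` test above: `(exp(X)-exp(-X))/(exp(X)+exp(-X))` overflows to `Inf/Inf = NaN` for large `|X|` (harmless here, since the test draws small normally-distributed values). A numerically safer but mathematically equivalent form, shown as a sketch that is not part of this commit:

    X = matrix("-1000 -1 0 1 3 1000", rows=2, cols=3)
    pos = (X >= 0)                      # 1 where X >= 0, else 0
    Z = exp(-2 * abs(X))                # in (0, 1], never overflows
    out_ref_stable = (2*pos - 1) * (1 - Z) / (1 + Z)   # sign(X) * tanh(|X|)
    print(toString(out_ref_stable))     # ~ [-1, -0.7616, 0; 0.7616, 0.9951, 1]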

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/util.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/util.dml b/scripts/nn/test/util.dml
new file mode 100644
index 0000000..e32a885
--- /dev/null
+++ b/scripts/nn/test/util.dml
@@ -0,0 +1,155 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Test utility functions.
+ */
+
+all_equal = function(matrix[double] X1, matrix[double] X2)
+    return(boolean equivalent) {
+  /*
+   * Determine if two matrices are equivalent.
+   *
+   * Inputs:
+   *  - X1: Inputs, of shape (any, any).
+   *  - X2: Inputs, of same shape as X1.
+   *
+   * Outputs:
+   *  - equivalent: Whether or not the two matrices are equivalent.
+   */
+  equivalent = as.logical(prod(X1 == X2))
+}
+
+check_all_equal = function(matrix[double] X1, matrix[double] X2)
+    return(boolean equivalent) {
+  /*
+   * Check if two matrices are equivalent, and report any issues.
+   *
+   * Issues an "ERROR" statement if elements of the two matrices are
+   * not equal.
+   *
+   * Inputs:
+   *  - X1: Inputs, of shape (any, any).
+   *  - X2: Inputs, of same shape as X1.
+   *
+   * Outputs:
+   *  - equivalent: Whether or not the two matrices are equivalent.
+   */
+  # Determine if matrices are equivalent
+  equivalent = all_equal(X1, X2)
+
+  # Evaluate relative error
+  if (!equivalent) {
+    print("ERROR: The two matrices are not equivalent.")
+  }
+}
+
+compute_rel_error = function(double x1, double x2)
+    return (double rel_error) {
+  /*
+   * Relative error measure between two values.
+   *
+   * Uses smoothing to avoid divide-by-zero errors.
+   *
+   * Inputs:
+   *  - x1: First value.
+   *  - x2: Second value.
+   *
+   * Outputs:
+   *  - rel_error: Relative error measure between the two values.
+   */
+  rel_error = abs(x1-x2) / max(1e-8, abs(x1)+abs(x2))
+}
+
+check_rel_error = function(double x1, double x2, double thresh_error, double thresh_warn)
+    return (double rel_error) {
+  /*
+   * Check and report any issues with the relative error measure between
+   * two values.
+   *
+   * Issues an "ERROR" statement for relative errors > thresh_error,
+   * indicating that the implementation is likely incorrect.
+   *
+   * Issues a "WARNING" statement for relative errors < thresh_error
+   * but > thresh_warn, indicating that the implementation may be
+   * incorrect.
+   *
+   * Inputs:
+   *  - x1: First value.
+   *  - x2: Second value.
+   *  - thresh_error: Error threshold.
+   *  - thresh_warn: Warning threshold.
+   *
+   * Outputs:
+   *  - rel_error: Relative error measure between the two values.
+   */
+  # Compute relative error
+  rel_error = compute_rel_error(x1, x2)
+
+  # Evaluate relative error
+  if (rel_error > thresh_error) {
+    print("ERROR: Relative error " + rel_error + " > " + thresh_error + " with " + x1 +
+          " vs " + x2 + ".")
+  }
+  else if (rel_error > thresh_warn & rel_error <= thresh_error) {
+    print("WARNING: Relative error " + rel_error + " > " + thresh_warn + " & <= " + thresh_error +
+          " with " + x1 + " vs " + x2 + ".")
+  }
+}
+
+check_rel_grad_error = function(double dw_a, double dw_n, double lossph, double lossmh)
+    return (double rel_error) {
+  /*
+   * Check and report any issues with the relative error measure between
+   * the analytical and numerical partial derivatives.
+   *
+   *  - Issues an "ERROR" statement for relative errors > 1e-2,
+   *  indicating that the gradient is likely incorrect.
+   *  - Issues a "WARNING" statement for relative errors < 1e-2
+   *  but > 1e-4, indicating that the gradient may be incorrect.
+   *
+   * Inputs:
+   *  - dw_a: Analytical partial derivative wrt w.
+   *  - dw_n: Numerical partial derivative wrt w.
+   *  - lossph: Loss evaluated with w set to w+h.
+   *  - lossmh: Loss evaluated with w set to w-h.
+   *
+   * Outputs:
+   *  - rel_error: Relative error measure between the two derivatives.
+   */
+  # Compute relative error
+  rel_error = compute_rel_error(dw_a, dw_n)
+
+  # Evaluate relative error
+  thresh_error = 1e-2
+  thresh_warn = 1e-4
+  if (rel_error > thresh_error) {
+    print("ERROR: Relative error " + rel_error + " > " + thresh_error + " with " + dw_a +
+          " analytical vs " + dw_n + " numerical, with lossph " + lossph +
+          " and lossmh " + lossmh)
+  }
+  else if (rel_error > thresh_warn & rel_error <= thresh_error) {
+    print("WARNING: Relative error " + rel_error + " > " + thresh_warn + " & <= " + thresh_error +
+          " with " + dw_a + " analytical vs " + dw_n + " numerical, with lossph " + lossph +
+          " and lossmh " + lossmh)
+  }
+}
+
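For reference, a minimal usage sketch of these utilities from a DML test script (assuming the file is sourced as `test_util`, as in `nn/test/test.dml` above; the loss and gradient values are dummies for illustration):

    source("nn/test/util.dml") as test_util

    # exact equality check on two small matrices
    A = matrix("1 2 3 4", rows=2, cols=2)
    B = A
    tmp = test_util::check_all_equal(A, B)   # prints an ERROR only if any cell differs

    # smoothed relative error: |x1-x2| / max(1e-8, |x1|+|x2|)
    x1 = 1.0001
    x2 = 1.0
    rel = test_util::check_rel_error(x1, x2, 1e-2, 1e-4)   # ~5e-5: below both thresholds, silent

    # gradient check: numerical derivative via centered differences (dummy values)
    h = 1e-5
    lossph = 0.501                      # loss at w+h (dummy)
    lossmh = 0.499                      # loss at w-h (dummy)
    dw_n = (lossph - lossmh) / (2*h)    # = 100
    dw_a = 100.0                        # analytical gradient (dummy)
    tmp = test_util::check_rel_grad_error(dw_a, dw_n, lossph, lossmh)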

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/util.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/util.dml b/scripts/nn/util.dml
new file mode 100644
index 0000000..3a73f08
--- /dev/null
+++ b/scripts/nn/util.dml
@@ -0,0 +1,202 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Utility functions.
+ */
+
+channel_sums = function(matrix[double] X, int C, int Hin, int Win)
+    return (matrix[double] out) {
+  /*
+   * Computes a channel-wise summation over a 4D input.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (C, 1).
+   */
+  # Here we sum each column, reshape to (C, Hin*Win), and sum each row to result in the summation
+  # for each channel.
+  out = rowSums(matrix(colSums(X), rows=C, cols=Hin*Win))  # shape (C, 1)
+}
+
+im2col = function(matrix[double] img, int Hin, int Win, int Hf, int Wf, int strideh, int stridew)
+    return (matrix[double] img_cols) {
+  /*
+   * Rearrange local image regions (patches) into columns.
+   *
+   * Assumes image has already been padded as necessary.
+   *
+   * Inputs:
+   *  - img: Input image, of shape (C, Hin*Win), where C is the number
+   *      of input channels (depth).
+   *  - Hin: Input height, including padding.
+   *  - Win: Input width, including padding.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *
+   * Outputs:
+   *  - img_cols: Local spatial regions (patches) of the image stretched
+   *      out into columns, of shape (C*Hf*Wf, Hout*Wout).
+   */
+  C = nrow(img)
+  Hout = as.integer(floor((Hin-Hf)/strideh + 1))
+  Wout = as.integer(floor((Win-Wf)/stridew + 1))
+
+  # Note: We start with `img_cols` transposed to allow for row-major
+  # left-indexing inside the loop, which is more performant.
+  img_cols = matrix(0, rows=Hout*Wout, cols=C*Hf*Wf)  # zeros
+  parfor (hout in 1:Hout, check=0) {  # all output rows
+    hin = (hout-1)*strideh + 1
+    parfor (wout in 1:Wout, check=0) {  # all output columns
+      win = (wout-1)*stridew + 1
+      # Extract a local patch of the input image corresponding spatially to the filter sizes.
+      img_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
+      parfor (c in 1:C) {  # all channels
+        img_slice = matrix(img[c,], rows=Hin, cols=Win)  # reshape
+        img_patch[c,] = matrix(img_slice[hin:hin+Hf-1, win:win+Wf-1], rows=1, cols=Hf*Wf)
+      }
+      img_cols[(hout-1)*Wout + wout,] = t(matrix(img_patch, rows=C*Hf*Wf, cols=1))  # reshape
+    }
+  }
+  img_cols = t(img_cols)
+}
+
+col2im = function(matrix[double] img_cols, int C, int Hin, int Win, int Hf, int Wf,
+                  int strideh, int stridew, string reduction)
+    return (matrix[double] img) {
+  /*
+   * Create an image from columns of local image regions (patches).
+   *
+   * The reduction strategy determines how to deal with overlapping
+   * patches.  If it is set to "add", any overlapping patches will be
+   * added together when creating the image.  This is useful when
+   * computing gradients on the original image given gradients on the
+   * patches.  Otherwise, if "none" is provided, any overlapping
+   * patches will just override previous ones when creating the image.
+   * This is useful when recreating an image from the output of
+   * `im2col`.
+   *
+   * Assumes original image was already padded as necessary.
+   *
+   * Inputs:
+   *  - img_cols: Local spatial regions (patches) of the image stretched
+   *      out into columns, of shape (C*Hf*Wf, Hout*Wout).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height, including padding.
+   *  - Win: Input width, including padding.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - reduction: The reduction strategy to use for overlapping
+   *      patches.  Valid options are "add" and "none".
+   *
+   * Outputs:
+   *  - img: Input image, of shape (C, Hin*Win).
+   */
+  Hout = as.integer(floor((Hin-Hf)/strideh + 1))
+  Wout = as.integer(floor((Win-Wf)/stridew + 1))
+
+  img = matrix(0, rows=C, cols=Hin*Win)  # zeros
+  for (hout in 1:Hout) {  # all output rows
+    hin = (hout-1)*strideh + 1
+    for (wout in 1:Wout) {  # all output columns
+      win = (wout-1)*stridew + 1
+      # Extract the column of img_cols for this output position and reshape it into a patch.
+      img_patch = matrix(img_cols[,(hout-1)*Wout + wout], rows=C, cols=Hf*Wf)  # reshape
+      parfor (c in 1:C) {  # all channels
+        img_patch_slice = matrix(img_patch[c,], rows=Hf, cols=Wf)  # reshape
+        if (reduction == "add") {
+          img_slice = matrix(0, rows=Hin, cols=Win)
+          img_slice[hin:hin+Hf-1, win:win+Wf-1] = img_patch_slice
+          img[c,] = img[c,] + matrix(img_slice, rows=1, cols=Hin*Win)
+        } else {
+          img_slice = matrix(img[c,], rows=Hin, cols=Win)
+          img_slice[hin:hin+Hf-1, win:win+Wf-1] = img_patch_slice
+          img[c,] = matrix(img_slice, rows=1, cols=Hin*Win)
+        }
+      }
+    }
+  }
+}
+
+pad_image = function(matrix[double] img, int Hin, int Win, int padh, int padw, double pad_value)
+    return (matrix[double] img_padded) {
+  /*
+   * Pads an image along the height and width dimensions with a constant value (`pad_value`).
+   *
+   * Inputs:
+   *  - img: Input image, of shape (C, Hin*Win), where C is the number
+   *      of input channels (depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - padh: Padding for top and bottom sides.
+   *  - padw: Padding for left and right sides.
+   *  - pad_value: Value to use for the padding.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - img_padded: The input image padded along the height and width
+   *      dimensions, of shape (C, (Hin+2*padh)*(Win+2*padw)).
+   */
+  C = nrow(img)
+  img_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
+  parfor (c in 1:C) {
+    img_slice = matrix(img[c,], rows=Hin, cols=Win)  # depth slice C reshaped
+    img_padded_slice = matrix(pad_value, rows=Hin+2*padh, cols=Win+2*padw)
+    img_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = img_slice
+    img_padded[c,] = matrix(img_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+  }
+}
+
+unpad_image = function(matrix[double] img_padded, int Hin, int Win, int padh, int padw)
+    return (matrix[double] img) {
+  /*
+   * Unpads an image along the height and width dimensions.
+   *
+   * Inputs:
+   *  - img_padded: The input image padded along the height and width
+   *      dimensions, of shape (C, (Hin+2*padh)*(Win+2*padw)).
+   *  - Hin: Input height of unpadded image.
+   *  - Win: Input width of unpadded image.
+   *  - padh: Padding for top and bottom sides.
+   *  - padw: Padding for left and right sides.
+   *
+   * Outputs:
+   *  - img: Input image, of shape (C, Hin*Win), where C is the number
+   *      of input channels (depth).
+   */
+  C = nrow(img_padded)
+  img = matrix(0, rows=C, cols=Hin*Win)
+  parfor (c in 1:C) {
+    img_padded_slice = matrix(img_padded[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
+    img_slice = img_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
+    img[c,] = matrix(img_slice, rows=1, cols=Hin*Win)
+  }
+}
+
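For reference, a minimal round-trip sketch using these utilities (assuming the file is sourced as `util`; a single-channel 4x4 image with non-overlapping 2x2 patches, so `col2im` with reduction "none" reconstructs the padded image exactly):

    source("nn/util.dml") as util

    C = 1; Hin = 4; Win = 4   # single-channel 4x4 image
    Hf = 2; Wf = 2            # 2x2 patches
    stride = 2
    pad = 1
    img = matrix(seq(1, C*Hin*Win), rows=C, cols=Hin*Win)

    # pad, stretch patches into columns, rebuild the padded image, and unpad
    img_pad = util::pad_image(img, Hin, Win, pad, pad, 0.0)
    img_cols = util::im2col(img_pad, Hin+2*pad, Win+2*pad, Hf, Wf, stride, stride)
    img_back = util::col2im(img_cols, C, Hin+2*pad, Win+2*pad, Hf, Wf, stride, stride, "none")
    img_orig = util::unpad_image(img_back, Hin, Win, pad, pad)   # equals img (patches don't overlap)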

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/README.md
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/README.md b/scripts/staging/SystemML-NN/README.md
deleted file mode 100644
index b80f2c6..0000000
--- a/scripts/staging/SystemML-NN/README.md
+++ /dev/null
@@ -1,183 +0,0 @@
-<!--
-{% comment %}
-Licensed to the Apache Software Foundation (ASF) under one or more
-contributor license agreements.  See the NOTICE file distributed with
-this work for additional information regarding copyright ownership.
-The ASF licenses this file to you under the Apache License, Version 2.0
-(the "License"); you may not use this file except in compliance with
-the License.  You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-{% endcomment %}
--->
-
-# SystemML-NN
-
-### A deep learning library for [Apache SystemML](https://github.com/apache/incubator-systemml).
-
-## Examples:
-#### Please see the [`examples`](nn/examples) folder for more detailed examples, or view the following two quick examples.
-### Neural net for regression with vanilla SGD:
-```python
-# Imports
-source("nn/layers/affine.dml") as affine
-source("nn/layers/l2_loss.dml") as l2_loss
-source("nn/layers/relu.dml") as relu
-source("nn/optim/sgd.dml") as sgd
-
-# Generate input data
-N = 1024 # num examples
-D = 100 # num features
-t = 1 # num targets
-X = rand(rows=N, cols=D, pdf="normal")
-y = rand(rows=N, cols=t)
-
-# Create 2-layer network:
-## affine1 -> relu1 -> affine2
-M = 64 # number of neurons
-[W1, b1] = affine::init(D, M)
-[W2, b2] = affine::init(M, t)
-
-# Initialize optimizer
-lr = 0.05  # learning rate
-mu = 0.9  # momentum
-decay = 0.99  # learning rate decay constant
-
-# Optimize
-print("Starting optimization")
-batch_size = 32
-epochs = 5
-iters = 1024 / batch_size
-for (e in 1:epochs) {
-  for(i in 1:iters) {
-    # Get next batch
-    X_batch = X[i:i+batch_size-1,]
-    y_batch = y[i:i+batch_size-1,]
-
-    # Compute forward pass
-    out1 = affine::forward(X_batch, W1, b1)
-    outr1 = relu::forward(out1)
-    out2 = affine::forward(outr1, W2, b2)
-
-    # Compute loss
-    loss = l2_loss::forward(out2, y_batch)
-    print("L2 loss: " + loss)
-
-    # Compute backward pass
-    dout2 = l2_loss::backward(out2, y_batch)
-    [doutr1, dW2, db2] = affine::backward(dout2, outr1, W2, b2)
-    dout1 = relu::backward(doutr1, out1)
-    [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1)
-
-    # Optimize with vanilla SGD
-    W1 = sgd::update(W1, dW1, lr)
-    b1 = sgd::update(b1, db1, lr)
-    W2 = sgd::update(W2, dW2, lr)
-    b2 = sgd::update(b2, db2, lr)
-  }
-  # Decay learning rate
-  lr = lr * decay
-}
-```
-
-### Neural net for multi-class classification with dropout and SGD w/ Nesterov momentum:
-```python
-# Imports
-source("nn/layers/affine.dml") as affine
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/dropout.dml") as dropout
-source("nn/layers/relu.dml") as relu
-source("nn/layers/softmax.dml") as softmax
-source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
-
-# Generate input data
-N = 1024 # num examples
-D = 100 # num features
-t = 5 # num targets
-X = rand(rows=N, cols=D, pdf="normal")
-classes = round(rand(rows=N, cols=1, min=1, max=t, pdf="uniform"))
-y = matrix(0, rows=N, cols=t)
-parfor (i in 1:N) {
-  y[i, as.scalar(classes[i,1])] = 1  # one-hot encoding
-}
-
-# Create network:
-# affine1 -> relu1 -> dropout1 -> affine2 -> relu2 -> dropout2 -> affine3 -> softmax
-H1 = 64 # number of neurons in 1st hidden layer
-H2 = 64 # number of neurons in 2nd hidden layer
-p = 0.5  # dropout probability
-[W1, b1] = affine::init(D, H1)
-[W2, b2] = affine::init(H1, H2)
-[W3, b3] = affine::init(H2, t)
-
-# Initialize SGD w/ Nesterov momentum optimizer
-lr = 0.05  # learning rate
-mu = 0.5  # momentum
-decay = 0.99  # learning rate decay constant
-vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
-vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
-vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
-
-# Optimize
-print("Starting optimization")
-batch_size = 64
-epochs = 10
-iters = 1024 / batch_size
-for (e in 1:epochs) {
-  for(i in 1:iters) {
-    # Get next batch
-    X_batch = X[i:i+batch_size-1,]
-    y_batch = y[i:i+batch_size-1,]
-
-    # Compute forward pass
-    ## layer 1:
-    out1 = affine::forward(X_batch, W1, b1)
-    outr1 = relu::forward(out1)
-    [outd1, maskd1] = dropout::forward(outr1, p, -1)
-    ## layer 2:
-    out2 = affine::forward(outd1, W2, b2)
-    outr2 = relu::forward(out2)
-    [outd2, maskd2] = dropout::forward(outr2, p, -1)
-    ## layer 3:
-    out3 = affine::forward(outd2, W3, b3)
-    probs = softmax::forward(out3)
-
-    # Compute loss
-    loss = cross_entropy_loss::forward(probs, y_batch)
-    print("Cross entropy loss: " + loss)
-
-    # Compute backward pass
-    ## loss:
-    dprobs = cross_entropy_loss::backward(probs, y_batch)
-    ## layer 3:
-    dout3 = softmax::backward(dprobs, out3)
-    [doutd2, dW3, db3] = affine::backward(dout3, outd2, W3, b3)
-    ## layer 2:
-    doutr2 = dropout::backward(doutd2, outr2, p, maskd2)
-    dout2 = relu::backward(doutr2, out2)
-    [doutd1, dW2, db2] = affine::backward(dout2, outd1, W2, b2)
-    ## layer 1:
-    doutr1 = dropout::backward(doutd1, outr1, p, maskd1)
-    dout1 = relu::backward(doutr1, out1)
-    [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1)
-
-    # Optimize with SGD w/ Nesterov momentum
-    [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1)
-    [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1)
-    [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2)
-    [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2)
-    [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3)
-    [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3)
-  }
-  # Anneal momentum towards 0.999
-  mu = mu + (0.999 - mu)/(1+epochs-e)
-  # Decay learning rate
-  lr = lr * decay
-}
-```

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/Example - MNIST LeNet.ipynb
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/Example - MNIST LeNet.ipynb b/scripts/staging/SystemML-NN/nn/examples/Example - MNIST LeNet.ipynb
deleted file mode 100644
index 0423269..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/Example - MNIST LeNet.ipynb	
+++ /dev/null
@@ -1,189 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Quick Setup"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create a SystemML MLContext object\n",
-    "from systemml import MLContext, dml\n",
-    "ml = MLContext(sc)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Download Data - MNIST"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9].  Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%sh\n",
-    "mkdir -p data/mnist/\n",
-    "cd data/mnist/\n",
-    "curl -O https://pjreddie.com/media/files/mnist_train.csv\n",
-    "curl -O https://pjreddie.com/media/files/mnist_test.csv"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## SystemML \"LeNet\" Neural Network"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 1. Train"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "script_string = \"\"\"\n",
-    "source(\"nn/examples/mnist_lenet.dml\") as mnist_lenet\n",
-    "\n",
-    "# Read training data\n",
-    "data = read($data, format=\"csv\")\n",
-    "n = nrow(data)\n",
-    "\n",
-    "# Extract images and labels\n",
-    "images = data[,2:ncol(data)]\n",
-    "labels = data[,1]\n",
-    "\n",
-    "# Scale images to [-1,1], and one-hot encode the labels\n",
-    "images = (images / 255.0) * 2 - 1\n",
-    "labels = table(seq(1, n), labels+1, n, 10)\n",
-    "\n",
-    "# Split into training (55,000 examples) and validation (5,000 examples)\n",
-    "X = images[5001:nrow(images),]\n",
-    "X_val = images[1:5000,]\n",
-    "y = labels[5001:nrow(images),]\n",
-    "y_val = labels[1:5000,]\n",
-    "\n",
-    "# Train\n",
-    "epochs = 10\n",
-    "[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, epochs)\n",
-    "\"\"\"\n",
-    "script = (dml(script_string).input(\"$data\", \"data/mnist/mnist_train.csv\")\n",
-    "                            .input(C=1, Hin=28, Win=28)\n",
-    "                            .output(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))\n",
-    "W1, b1, W2, b2, W3, b3, W4, b4 = (ml.execute(script)\n",
-    "                                    .get(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 2. Compute Test Accuracy"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "script_string = \"\"\"\n",
-    "source(\"nn/examples/mnist_lenet.dml\") as mnist_lenet\n",
-    "\n",
-    "# Read test data\n",
-    "data = read($data, format=\"csv\")\n",
-    "n = nrow(data)\n",
-    "\n",
-    "# Extract images and labels\n",
-    "X_test = data[,2:ncol(data)]\n",
-    "y_test = data[,1]\n",
-    "\n",
-    "# Scale images to [-1,1], and one-hot encode the labels\n",
-    "X_test = (X_test / 255.0) * 2 - 1\n",
-    "y_test = table(seq(1, n), y_test+1, n, 10)\n",
-    "\n",
-    "# Eval on test set\n",
-    "probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)\n",
-    "[loss, accuracy] = mnist_lenet::eval(probs, y_test)\n",
-    "\n",
-    "print(\"Test Accuracy: \" + accuracy)\n",
-    "\"\"\"\n",
-    "script = dml(script_string).input(**{\"$data\": \"data/mnist/mnist_train.csv\",\n",
-    "                                     \"C\": 1, \"Hin\": 28, \"Win\": 28,\n",
-    "                                     \"W1\": W1, \"b1\": b1,\n",
-    "                                     \"W2\": W2, \"b2\": b2,\n",
-    "                                     \"W3\": W3, \"b3\": b3,\n",
-    "                                     \"W4\": W4, \"b4\": b4})\n",
-    "ml.execute(script)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 3. Extract Model Into Spark DataFrames For Future Use"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "W1_df = W1.toDF()\n",
-    "b1_df = b1.toDF()\n",
-    "W2_df = W2.toDF()\n",
-    "b2_df = b2.toDF()\n",
-    "W3_df = W3.toDF()\n",
-    "b3_df = b3.toDF()\n",
-    "W4_df = W4.toDF()\n",
-    "b4_df = b4.toDF()\n",
-    "W1_df, b1_df, W2_df, b2_df, W3_df, b3_df, W4_df, b4_df"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 + Spark 2.x + SystemML",
-   "language": "python",
-   "name": "pyspark3_2.x"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.1"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 1
-}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/Example - MNIST Softmax Classifier.ipynb
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/Example - MNIST Softmax Classifier.ipynb b/scripts/staging/SystemML-NN/nn/examples/Example - MNIST Softmax Classifier.ipynb
deleted file mode 100644
index 5e7182a..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/Example - MNIST Softmax Classifier.ipynb	
+++ /dev/null
@@ -1,179 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Quick Setup"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": false
-   },
-   "outputs": [],
-   "source": [
-    "# Create a SystemML MLContext object\n",
-    "from systemml import MLContext, dml\n",
-    "ml = MLContext(sc)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Download Data - MNIST"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9].  Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "%%sh\n",
-    "mkdir -p data/mnist/\n",
-    "cd data/mnist/\n",
-    "curl -O https://pjreddie.com/media/files/mnist_train.csv\n",
-    "curl -O https://pjreddie.com/media/files/mnist_test.csv"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## SystemML Softmax Model"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 1. Train"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "training = \"\"\"\n",
-    "source(\"nn/examples/mnist_softmax.dml\") as mnist_softmax\n",
-    "\n",
-    "# Read training data\n",
-    "data = read($data, format=\"csv\")\n",
-    "n = nrow(data)\n",
-    "\n",
-    "# Extract images and labels\n",
-    "images = data[,2:ncol(data)]\n",
-    "labels = data[,1]\n",
-    "\n",
-    "# Scale images to [0,1], and one-hot encode the labels\n",
-    "images = images / 255.0\n",
-    "labels = table(seq(1, n), labels+1, n, 10)\n",
-    "\n",
-    "# Split into training (55,000 examples) and validation (5,000 examples)\n",
-    "X = images[5001:nrow(images),]\n",
-    "X_val = images[1:5000,]\n",
-    "y = labels[5001:nrow(images),]\n",
-    "y_val = labels[1:5000,]\n",
-    "\n",
-    "# Train\n",
-    "epochs = 1\n",
-    "[W, b] = mnist_softmax::train(X, y, X_val, y_val, epochs)\n",
-    "\"\"\"\n",
-    "script = dml(training).input(\"$data\", \"data/mnist/mnist_train.csv\").output(\"W\", \"b\")\n",
-    "W, b = ml.execute(script).get(\"W\", \"b\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 2. Compute Test Accuracy"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "testing = \"\"\"\n",
-    "source(\"nn/examples/mnist_softmax.dml\") as mnist_softmax\n",
-    "\n",
-    "# Read test data\n",
-    "data = read($data, format=\"csv\")\n",
-    "n = nrow(data)\n",
-    "\n",
-    "# Extract images and labels\n",
-    "X_test = data[,2:ncol(data)]\n",
-    "y_test = data[,1]\n",
-    "\n",
-    "# Scale images to [0,1], and one-hot encode the labels\n",
-    "X_test = X_test / 255.0\n",
-    "y_test = table(seq(1, n), y_test+1, n, 10)\n",
-    "\n",
-    "# Eval on test set\n",
-    "probs = mnist_softmax::predict(X_test, W, b)\n",
-    "[loss, accuracy] = mnist_softmax::eval(probs, y_test)\n",
-    "\n",
-    "print(\"Test Accuracy: \" + accuracy)\n",
-    "\"\"\"\n",
-    "script = dml(testing).input(\"$data\", \"data/mnist/mnist_test.csv\", W=W, b=b)\n",
-    "ml.execute(script)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 3. Extract Model Into Spark DataFrames For Future Use"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "W_df = W.toDF()\n",
-    "b_df = b.toDF()\n",
-    "W_df, b_df"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.1"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 1
-}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/README.md
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/README.md b/scripts/staging/SystemML-NN/nn/examples/README.md
deleted file mode 100644
index d5e9d04..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/README.md
+++ /dev/null
@@ -1,74 +0,0 @@
-<!--
-{% comment %}
-Licensed to the Apache Software Foundation (ASF) under one or more
-contributor license agreements.  See the NOTICE file distributed with
-this work for additional information regarding copyright ownership.
-The ASF licenses this file to you under the Apache License, Version 2.0
-(the "License"); you may not use this file except in compliance with
-the License.  You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-{% endcomment %}
--->
-
-# SystemML-NN Examples
-
-#### This folder contains scripts and PySpark Jupyter notebooks serving as examples of using the *SystemML-NN* (`nn`) deep learning library.
-
----
-
-# Examples
-### MNIST Softmax Classifier
-
-* This example trains a softmax classifier, which is essentially a multi-class logistic regression model, on the MNIST data.  The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images.
-* Notebook: `Example - MNIST Softmax Classifier.ipynb`.
-* DML Functions: `mnist_softmax.dml`
-* Training script: `mnist_softmax-train.dml`
-* Prediction script: `mnist_softmax-predict.dml`
-
-### MNIST "LeNet" Neural Net
-
-* This example trains a neural network on the MNIST data using a ["LeNet" architecture](http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf). The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images.
-* Notebook: `Example - MNIST LeNet.ipynb`.
-* DML Functions: `mnist_lenet.dml`
-* Training script: `mnist_lenet-train.dml`
-* Prediction script: `mnist_lenet-predict.dml`
-
----
-
-# Setup
-## Code
-* To run the examples, please first download and unzip the project via GitHub using the "Clone or download" button on the [homepage of the project](https://github.com/dusenberrymw/systemml-nn), *or* via the following commands:
-
-  ```
-  git clone https://github.com/dusenberrymw/systemml-nn.git
-  ```
-
-* Then, move into the `systemml-nn` folder via:
-  ```
-  cd systemml-nn
-  ```
-
-## Data
-* These examples use the classic [MNIST](http://yann.lecun.com/exdb/mnist/) dataset, which contains labeled 28x28 pixel images of handwritten digits in the range of 0-9.  There are 60,000 training images, and 10,000 testing images.  Of the 60,000 training images, 5,000 will be used as validation images.
-* **Download**:
-  * **Notebooks**: The data will be automatically downloaded as a step in either of the example notebooks.
-  * **Training scripts**: Please run `get_mnist_data.sh` to download the data separately.
-
-## Execution
-* These examples contain scripts written in SystemML's R-like language (`*.dml`), as well as PySpark Jupyter notebooks (`*.ipynb`).  The scripts contain the math for the algorithms, enclosed in functions, and the notebooks serve as full, end-to-end examples of reading in data, training models using the functions within the scripts, and evaluating final performance.
-* **Notebooks**: To run the notebook examples, please install the SystemML Python package with `pip install systemml`, and then startup Jupyter in the following manner from this directory (or for more information, please see [this great blog post](http://spark.tc/0-to-life-changing-application-with-apache-systemml/)):
-
-  ```
-  PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook" pyspark --master local[*] --driver-memory 3G --driver-class-path SystemML.jar --jars SystemML.jar
-  ```
-
-  Note that all printed output, such as training statistics, from the SystemML scripts will be sent to the terminal in which Jupyter was started (for now...).
-
-* **Scripts**: To run the scripts from the command line using `spark-submit`, please see the comments located at the top of the `-train` and `-predict` scripts.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/get_mnist_data.sh
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/get_mnist_data.sh b/scripts/staging/SystemML-NN/nn/examples/get_mnist_data.sh
deleted file mode 100755
index deb0c40..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/get_mnist_data.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-DIR="$(cd "$(dirname "$0")" && pwd)"
-mkdir -p $DIR/data/mnist/
-cd $DIR/data/mnist/
-curl -O https://pjreddie.com/media/files/mnist_train.csv
-curl -O https://pjreddie.com/media/files/mnist_test.csv
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-predict.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-predict.dml
deleted file mode 100644
index 85a5307..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-predict.dml
+++ /dev/null
@@ -1,91 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# MNIST LeNet - Predict
-#
-# This script computes the class probability predictions of a
-# trained convolutional net using the "LeNet" architecture on
-# images of handwritten digits.
-#
-# Inputs:
-#  - X: File containing training images.
-#     The format is "pixel_1, pixel_2, ..., pixel_n".
-#  - C: Number of color channels in the images.
-#  - Hin: Input image height.
-#  - Win: Input image width.
-#  - model_dir: Directory containing the trained weights and biases
-#     of the model.
-#  - out_dir: Directory to store class probability predictions for
-#     each image.
-#  - fmt: [DEFAULT: "csv"] File format of `X` and output predictions.
-#     Options include: "csv", "mm", "text", and "binary".
-#
-# Outputs:
-#  - probs: File containing class probability predictions for each
-#     image.
-#
-# Data:
-# The X file should contain images of handwritten digits,
-# where each example is a 28x28 pixel image of grayscale values in
-# the range [0,255] stretched out as 784 pixels.
-#
-# Sample Invocation (running from outside the `nn` folder):
-# 1. Download images.
-#
-#   For example, save images to `nn/examples/data/mnist/images.csv`.
-#
-# 2. Execute using Spark
-#   ```
-#   spark-submit --master local[*] --driver-memory 5G
-#   --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
-#   $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_lenet-predict.dml
-#   -nvargs X=nn/examples/data/mnist/images.csv C=1 Hin=28 Win=28
-#   model_dir=nn/examples/model/mnist_lenet out_dir=nn/examples/data/mnist
-#   ```
-#
-source("nn/examples/mnist_lenet.dml") as mnist_lenet
-
-# Read training data
-fmt = ifdef($fmt, "csv")
-X = read($X, format=fmt)
-C = $C
-Hin = $Hin
-Win = $Win
-
-# Scale images to [-1,1]
-X = (X / 255.0) * 2 - 1
-
-# Read model coefficients
-W1 = read($model_dir+"/W1")
-b1 = read($model_dir+"/b1")
-W2 = read($model_dir+"/W2")
-b2 = read($model_dir+"/b2")
-W3 = read($model_dir+"/W3")
-b3 = read($model_dir+"/b3")
-W4 = read($model_dir+"/W4")
-b4 = read($model_dir+"/b4")
-
-# Predict classes
-probs = mnist_lenet::predict(X, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
-
-# Output results
-write(probs, $out_dir+"/probs."+fmt, format=fmt)
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-train.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-train.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-train.dml
deleted file mode 100644
index 0fc733e..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-train.dml
+++ /dev/null
@@ -1,123 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# MNIST LeNet - Train
-#
-# This script trains a convolutional net using the "LeNet" architecture
-# on images of handwritten digits.
-#
-# Inputs:
-#  - train: File containing labeled MNIST training images.
-#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
-#  - test: File containing labeled MNIST test images.
-#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
-#  - C: Number of color channels in the images.
-#  - Hin: Input image height.
-#  - Win: Input image width.
-#  - epochs: [DEFAULT: 10] Total number of full training loops over
-#     the full data set.
-#  - out_dir: [DEFAULT: "."] Directory to store weights and bias
-#     matrices of trained model, as well as final test accuracy.
-#  - fmt: [DEFAULT: "csv"] File format of `train` and `test` data.
-#     Options include: "csv", "mm", "text", and "binary".
-#
-# Outputs:
-#  - W1, W2, W3, W4: Files containing the trained weights of the model.
-#  - b1, b2, b3, b4: Files containing the trained biases of the model.
-#  - accuracy: File containing the final accuracy on the test data.
-#
-# Data:
-# The MNIST dataset contains labeled images of handwritten digits,
-# where each example is a 28x28 pixel image of grayscale values in
-# the range [0,255] stretched out as 784 pixels, and each label is
-# one of 10 possible digits in [0,9].
-#
-# Sample Invocation (running from outside the `nn` folder):
-# 1. Download data (60,000 training examples, and 10,000 test examples)
-#   ```
-#   nn/examples/get_mnist_data.sh
-#   ```
-#
-# 2. Execute using Spark
-#   ```
-#   spark-submit --master local[*] --driver-memory 10G
-#   --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
-#   $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_lenet-train.dml
-#   -nvargs train=nn/examples/data/mnist/mnist_train.csv test=nn/examples/data/mnist/mnist_test.csv
-#   C=1 Hin=28 Win=28 epochs=10 out_dir=nn/examples/model/mnist_lenet
-#   ```
-#
-source("nn/examples/mnist_lenet.dml") as mnist_lenet
-
-# Read training data & settings
-fmt = ifdef($fmt, "csv")
-train = read($train, format=fmt)
-test = read($test, format=fmt)
-C = $C
-Hin = $Hin
-Win = $Win
-epochs = ifdef($epochs, 10)
-out_dir = ifdef($out_dir, ".")
-
-# Extract images and labels
-images = train[,2:ncol(train)]
-labels = train[,1]
-X_test = test[,2:ncol(test)]
-y_test = test[,1]
-
-# Scale images to [-1,1], and one-hot encode the labels
-n = nrow(train)
-n_test = nrow(test)
-images = (images / 255.0) * 2 - 1
-labels = table(seq(1, n), labels+1, n, 10)
-X_test = (X_test / 255.0) * 2 - 1
-y_test = table(seq(1, n_test), y_test+1, n_test, 10)
-
-# Split into training (55,000 examples) and validation (5,000 examples)
-X = images[5001:nrow(images),]
-X_val = images[1:5000,]
-y = labels[5001:nrow(images),]
-y_val = labels[1:5000,]
-
-# Train
-[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, epochs)
-
-# Write model out
-write(W1, out_dir+"/W1")
-write(b1, out_dir+"/b1")
-write(W2, out_dir+"/W2")
-write(b2, out_dir+"/b2")
-write(W3, out_dir+"/W3")
-write(b3, out_dir+"/b3")
-write(W4, out_dir+"/W4")
-write(b4, out_dir+"/b4")
-
-# Eval on test set
-probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
-[loss, accuracy] = mnist_lenet::eval(probs, y_test)
-
-# Output results
-print("Test Accuracy: " + accuracy)
-write(accuracy, out_dir+"/accuracy")
-
-print("")
-print("")
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/mnist_lenet.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet.dml
deleted file mode 100644
index e5755c4..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet.dml
+++ /dev/null
@@ -1,331 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * MNIST LeNet Example
- */
-# Imports
-source("nn/layers/affine.dml") as affine
-source("nn/layers/conv2d_builtin.dml") as conv2d
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/dropout.dml") as dropout
-source("nn/layers/l2_reg.dml") as l2_reg
-source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
-source("nn/layers/relu.dml") as relu
-source("nn/layers/softmax.dml") as softmax
-source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
-
-train = function(matrix[double] X, matrix[double] y,
-                 matrix[double] X_val, matrix[double] y_val,
-                 int C, int Hin, int Win, int epochs)
-    return (matrix[double] W1, matrix[double] b1,
-            matrix[double] W2, matrix[double] b2,
-            matrix[double] W3, matrix[double] b3,
-            matrix[double] W4, matrix[double] b4) {
-  /*
-   * Trains a convolutional net using the "LeNet" architecture.
-   *
-   * The input matrix, X, has N examples, each represented as a 3D
-   * volume unrolled into a single vector.  The targets, y, have K
-   * classes, and are one-hot encoded.
-   *
-   * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
-   *  - y: Target matrix, of shape (N, K).
-   *  - X_val: Input validation data matrix, of shape (N, C*Hin*Win).
-   *  - y_val: Target validation matrix, of shape (N, K).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - epochs: Total number of full training loops over the full data set.
-   *
-   * Outputs:
-   *  - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
-   *  - b1: 1st layer biases vector, of shape (F1, 1).
-   *  - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
-   *  - b2: 2nd layer biases vector, of shape (F2, 1).
-   *  - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
-   *  - b3: 3rd layer biases vector, of shape (1, N3).
-   *  - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
-   *  - b4: 4th layer biases vector, of shape (1, K).
-   */
-  N = nrow(X)
-  K = ncol(y)
-
-  # Create network:
-  # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
-  Hf = 5  # filter height
-  Wf = 5  # filter width
-  stride = 1
-  pad = 2  # For same dimensions, (Hf - stride) / 2
-
-  F1 = 32  # num conv filters in conv1
-  F2 = 64  # num conv filters in conv2
-  N3 = 512  # num nodes in affine3
-  # Note: affine4 has K nodes, which is equal to the number of target dimensions (num classes)
-
-  [W1, b1] = conv2d::init(F1, C, Hf, Wf)  # inputs: (N, C*Hin*Win)
-  [W2, b2] = conv2d::init(F2, F1, Hf, Wf)  # inputs: (N, F1*(Hin/2)*(Win/2))
-  [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3)  # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
-  [W4, b4] = affine::init(N3, K)  # inputs: (N, N3)
-  W4 = W4 / sqrt(2)  # different initialization, since being fed into softmax, instead of relu
-
-  # Initialize SGD w/ Nesterov momentum optimizer
-  lr = 0.01  # learning rate
-  mu = 0.9  #0.5  # momentum
-  decay = 0.95  # learning rate decay constant
-  vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
-  vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
-  vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
-  vW4 = sgd_nesterov::init(W4); vb4 = sgd_nesterov::init(b4)
-
-  # Regularization
-  lambda = 5e-04
-
-  # Optimize
-  print("Starting optimization")
-  batch_size = 64
-  iters = ceil(N / batch_size)
-  for (e in 1:epochs) {
-    for(i in 1:iters) {
-      # Get next batch
-      beg = ((i-1) * batch_size) %% N + 1
-      end = min(N, beg + batch_size - 1)
-      X_batch = X[beg:end,]
-      y_batch = y[beg:end,]
-
-      # Compute forward pass
-      ## layer 1: conv1 -> relu1 -> pool1
-      [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
-                                                pad, pad)
-      outr1 = relu::forward(outc1)
-      [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
-                                                    strideh=2, stridew=2, pad=0, pad=0)
-      ## layer 2: conv2 -> relu2 -> pool2
-      [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
-                                                stride, stride, pad, pad)
-      outr2 = relu::forward(outc2)
-      [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
-                                                    strideh=2, stridew=2, pad=0, pad=0)
-      ## layer 3:  affine3 -> relu3 -> dropout
-      outa3 = affine::forward(outp2, W3, b3)
-      outr3 = relu::forward(outa3)
-      [outd3, maskd3] = dropout::forward(outr3, 0.5, -1)
-      ## layer 4:  affine4 -> softmax
-      outa4 = affine::forward(outd3, W4, b4)
-      probs = softmax::forward(outa4)
-
-      # Compute loss & accuracy for training & validation data every 100 iterations.
-      if (i %% 100 == 0) {
-        # Compute training loss & accuracy
-        loss_data = cross_entropy_loss::forward(probs, y_batch)
-        loss_reg_W1 = l2_reg::forward(W1, lambda)
-        loss_reg_W2 = l2_reg::forward(W2, lambda)
-        loss_reg_W3 = l2_reg::forward(W3, lambda)
-        loss_reg_W4 = l2_reg::forward(W4, lambda)
-        loss = loss_data + loss_reg_W1 + loss_reg_W2 + loss_reg_W3 + loss_reg_W4
-        accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
-
-        # Compute validation loss & accuracy
-        probs_val = predict(X_val, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
-        loss_val = cross_entropy_loss::forward(probs_val, y_val)
-        accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
-
-        # Output results
-        print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
-              + accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
-      }
-
-      # Compute data backward pass
-      ## loss:
-      dprobs = cross_entropy_loss::backward(probs, y_batch)
-      ## layer 4:  affine4 -> softmax
-      douta4 = softmax::backward(dprobs, outa4)
-      [doutd3, dW4, db4] = affine::backward(douta4, outr3, W4, b4)
-      ## layer 3:  affine3 -> relu3 -> dropout
-      doutr3 = dropout::backward(doutd3, outr3, 0.5, maskd3)
-      douta3 = relu::backward(doutr3, outa3)
-      [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
-      ## layer 2: conv2 -> relu2 -> pool2
-      doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
-                                    strideh=2, stridew=2, padh=0, padw=0)
-      doutc2 = relu::backward(doutr2, outc2)
-      [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
-                                            Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
-      ## layer 1: conv1 -> relu1 -> pool1
-      doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
-                                    strideh=2, stridew=2, padh=0, padw=0)
-      doutc1 = relu::backward(doutr1, outc1)
-      [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win,
-                                              Hf, Wf, stride, stride, pad, pad)
-
-      # Compute regularization backward pass
-      dW1_reg = l2_reg::backward(W1, lambda)
-      dW2_reg = l2_reg::backward(W2, lambda)
-      dW3_reg = l2_reg::backward(W3, lambda)
-      dW4_reg = l2_reg::backward(W4, lambda)
-      dW1 = dW1 + dW1_reg
-      dW2 = dW2 + dW2_reg
-      dW3 = dW3 + dW3_reg
-      dW4 = dW4 + dW4_reg
-
-      # Optimize with SGD w/ Nesterov momentum
-      [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1)
-      [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1)
-      [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2)
-      [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2)
-      [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3)
-      [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3)
-      [W4, vW4] = sgd_nesterov::update(W4, dW4, lr, mu, vW4)
-      [b4, vb4] = sgd_nesterov::update(b4, db4, lr, mu, vb4)
-    }
-    # Anneal momentum towards 0.999
-    #mu = mu + (0.999 - mu)/(1+epochs-e)
-    # Decay learning rate
-    lr = lr * decay
-  }
-}
-
-predict = function(matrix[double] X, int C, int Hin, int Win,
-                   matrix[double] W1, matrix[double] b1,
-                   matrix[double] W2, matrix[double] b2,
-                   matrix[double] W3, matrix[double] b3,
-                   matrix[double] W4, matrix[double] b4)
-    return (matrix[double] probs) {
-  /*
-   * Computes the class probability predictions of a convolutional
-   * net using the "LeNet" architecture.
-   *
-   * The input matrix, X, has N examples, each represented as a 3D
-   * volume unrolled into a single vector.
-   *
-   * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
-   *  - b1: 1st layer biases vector, of shape (F1, 1).
-   *  - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
-   *  - b2: 2nd layer biases vector, of shape (F2, 1).
-   *  - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
-   *  - b3: 3rd layer biases vector, of shape (1, N3).
-   *  - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
-   *  - b4: 4th layer biases vector, of shape (1, K).
-   *
-   * Outputs:
-   *  - probs: Class probabilities, of shape (N, K).
-   */
-  N = nrow(X)
-
-  # Network:
-  # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
-  Hf = 5  # filter height
-  Wf = 5  # filter width
-  stride = 1
-  pad = 2  # For same dimensions, (Hf - stride) / 2
-
-  F1 = nrow(W1)  # num conv filters in conv1
-  F2 = nrow(W2)  # num conv filters in conv2
-  N3 = ncol(W3)  # num nodes in affine3
-  K = ncol(W4)  # num nodes in affine4, equal to number of target dimensions (num classes)
-
-  # Compute predictions over mini-batches
-  probs = matrix(0, rows=N, cols=K)
-  batch_size = 64
-  iters = ceil(N / batch_size)
-  for(i in 1:iters) {
-    # Get next batch
-    beg = ((i-1) * batch_size) %% N + 1
-    end = min(N, beg + batch_size - 1)
-    X_batch = X[beg:end,]
-
-    # Compute forward pass
-    ## layer 1: conv1 -> relu1 -> pool1
-    [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
-                                              pad, pad)
-    outr1 = relu::forward(outc1)
-    [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
-                                                  strideh=2, stridew=2, padh=0, padw=0)
-    ## layer 2: conv2 -> relu2 -> pool2
-    [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
-                                              stride, stride, pad, pad)
-    outr2 = relu::forward(outc2)
-    [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
-                                                  strideh=2, stridew=2, padh=0, padw=0)
-    ## layer 3:  affine3 -> relu3
-    outa3 = affine::forward(outp2, W3, b3)
-    outr3 = relu::forward(outa3)
-    ## layer 4:  affine4 -> softmax
-    outa4 = affine::forward(outr3, W4, b4)
-    probs_batch = softmax::forward(outa4)
-
-    # Store predictions
-    probs[beg:end,] = probs_batch
-  }
-}
-
-eval = function(matrix[double] probs, matrix[double] y)
-    return (double loss, double accuracy) {
-  /*
-   * Evaluates a convolutional net using the "LeNet" architecture.
-   *
-   * The probs matrix contains the class probability predictions
-   * of K classes over N examples.  The targets, y, have K classes,
-   * and are one-hot encoded.
-   *
-   * Inputs:
-   *  - probs: Class probabilities, of shape (N, K).
-   *  - y: Target matrix, of shape (N, K).
-   *
-   * Outputs:
-   *  - loss: Scalar loss, of shape (1).
-   *  - accuracy: Scalar accuracy, of shape (1).
-   */
-  # Compute loss & accuracy
-  loss = cross_entropy_loss::forward(probs, y)
-  correct_pred = rowIndexMax(probs) == rowIndexMax(y)
-  accuracy = mean(correct_pred)
-}
-
-generate_dummy_data = function()
-    return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
-  /*
-   * Generate a dummy dataset similar to the MNIST dataset.
-   *
-   * Outputs:
-   *  - X: Input data matrix, of shape (N, D).
-   *  - y: Target matrix, of shape (N, K).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   */
-  # Generate dummy input data
-  N = 1024  # num examples
-  C = 1  # num input channels
-  Hin = 28  # input height
-  Win = 28  # input width
-  K = 10  # num target classes
-  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
-  classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
-  y = table(seq(1, N), classes, N, K)  # one-hot encoding with explicit output shape (N, K)
-}
-
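
For illustration, the functions above compose into a small end-to-end run. The following sketch assumes the script is sourced from the same `nn/examples` path used by the other example scripts, and uses the dummy-data generator purely to exercise the API (the toy sizes and single epoch are illustrative choices):

  source("nn/examples/mnist_lenet.dml") as mnist_lenet

  # Small dummy train/validation sets (1024 examples each)
  [X, y, C, Hin, Win] = mnist_lenet::generate_dummy_data()
  [X_val, y_val, C, Hin, Win] = mnist_lenet::generate_dummy_data()

  # Train for one epoch, then predict and evaluate on the dummy training data
  [W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, 1)
  probs = mnist_lenet::predict(X, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
  [loss, accuracy] = mnist_lenet::eval(probs, y)
  print("Dummy-data accuracy: " + accuracy)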

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-predict.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-predict.dml
deleted file mode 100644
index 4c8c434..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-predict.dml
+++ /dev/null
@@ -1,77 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# MNIST Softmax - Predict
-#
-# This script computes the class probability predictions of a
-# trained softmax classifier on images of handwritten digits.
-#
-# Inputs:
-#  - X: File containing the input images.
-#     The format is "pixel_1, pixel_2, ..., pixel_n".
-#  - model_dir: Directory containing the trained weights and biases
-#     of the model.
-#  - out_dir: Directory to store class probability predictions for
-#     each image.
-#  - fmt: [DEFAULT: "csv"] File format of `X` and output predictions.
-#     Options include: "csv", "mm", "text", and "binary".
-#
-# Outputs:
-#  - probs: File containing class probability predictions for each
-#     image.
-#
-# Data:
-# The X file should contain images of handwritten digits,
-# where each example is a 28x28 pixel image of grayscale values in
-# the range [0,255] stretched out as 784 pixels.
-#
-# Sample Invocation:
-# 1. Download images.
-#
-#   For example, save images to `nn/examples/data/mnist/images.csv`.
-#
-# 2. Execute using Spark
-#   ```
-#   spark-submit --master local[*] --driver-memory 5G
-#   --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
-#   $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_softmax-predict.dml
-#   -nvargs X=nn/examples/data/mnist/images.csv
-#   model_dir=nn/examples/model/mnist_softmax out_dir=nn/examples/data/mnist
-#   ```
-#
-source("nn/examples/mnist_softmax.dml") as mnist_softmax
-
-# Read training data
-fmt = ifdef($fmt, "csv")
-X = read($X, format=fmt)
-
-# Scale images to [0,1]
-X = X / 255.0
-
-# Read model coefficients
-W = read($model_dir+"/W")
-b = read($model_dir+"/b")
-
-# Predict classes
-probs = mnist_softmax::predict(X, W, b)
-
-# Output results
-write(probs, $out_dir+"/probs."+fmt, format=fmt)
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-train.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-train.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-train.dml
deleted file mode 100644
index 09970f0..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-train.dml
+++ /dev/null
@@ -1,110 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# MNIST Softmax - Train
-#
-# This script trains a softmax classifier on images of handwritten
-# digits.
-#
-# Inputs:
-#  - train: File containing labeled MNIST training images.
-#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
-#  - test: File containing labeled MNIST test images.
-#     The format is "label, pixel_1, pixel_2, ..., pixel_n".
-#  - out_dir: Directory to store weights and bias matrices of
-#     trained model, as well as final test accuracy.
-#  - fmt: [DEFAULT: "csv"] File format of `train` and `test` data.
-#     Options include: "csv", "mm", "text", and "binary".
-#
-# Outputs:
-#  - W: File containing the trained weights of the model.
-#  - b: File containing the trained biases of the model.
-#  - accuracy: File containing the final accuracy on the test data.
-#
-# Data:
-# The MNIST dataset contains labeled images of handwritten digits,
-# where each example is a 28x28 pixel image of grayscale values in
-# the range [0,255] stretched out as 784 pixels, and each label is
-# one of 10 possible digits in [0,9].
-#
-# Sample Invocation (run from within the directory containing the `nn` folder):
-# 1. Download data (60,000 training examples, and 10,000 test examples)
-#   ```
-#   nn/examples/get_mnist_data.sh
-#   ```
-#
-# 2. Execute using Spark
-#   ```
-#   spark-submit --master local[*] --driver-memory 10G
-#   --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
-#   $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_softmax-train.dml
-#   -nvargs train=nn/examples/data/mnist/mnist_train.csv test=nn/examples/data/mnist/mnist_test.csv
-#   epochs=1 out_dir=nn/examples/model/mnist_softmax
-#   ```
-#
-source("nn/examples/mnist_softmax.dml") as mnist_softmax
-
-# Read training data
-fmt = ifdef($fmt, "csv")
-train = read($train, format=fmt)
-test = read($test, format=fmt)
-epochs = ifdef($epochs, 1)
-out_dir = ifdef($out_dir, ".")
-
-# Extract images and labels
-images = train[,2:ncol(train)]
-labels = train[,1]
-X_test = test[,2:ncol(test)]
-y_test = test[,1]
-
-# Scale images to [0,1], and one-hot encode the labels
-n = nrow(train)
-n_test = nrow(test)
-classes = 10
-images = images / 255.0
-labels = table(seq(1, n), labels+1, n, classes)
-X_test = X_test / 255.0
-y_test = table(seq(1, n_test), y_test+1, n_test, classes)
-
-# Split into training (55,000 examples) and validation (5,000 examples)
-X = images[5001:nrow(images),]
-X_val = images[1:5000,]
-y = labels[5001:nrow(images),]
-y_val = labels[1:5000,]
-
-# Train
-[W, b] = mnist_softmax::train(X, y, X_val, y_val, epochs)
-
-# Write model out
-write(W, out_dir+"/W")
-write(b, out_dir+"/b")
-
-# Eval on test set
-probs = mnist_softmax::predict(X_test, W, b)
-[loss, accuracy] = mnist_softmax::eval(probs, y_test)
-
-# Output results
-print("Test Accuracy: " + accuracy)
-write(accuracy, out_dir+"/accuracy")
-
-print("")
-print("")
-



[10/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn` library to `scripts/nn`

Posted by du...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/batch_norm2d.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/batch_norm2d.dml b/scripts/nn/layers/batch_norm2d.dml
new file mode 100644
index 0000000..49c6746
--- /dev/null
+++ b/scripts/nn/layers/batch_norm2d.dml
@@ -0,0 +1,238 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D (Spatial) Batch Normalization layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
+                   int C, int Hin, int Win, string mode,
+                   matrix[double] ema_mean, matrix[double] ema_var,
+                   double mu, double epsilon)
+    return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
+            matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
+  /*
+   * Computes the forward pass for a 2D (spatial) batch normalization
+   * layer.  The input data has N examples, each represented as a 3D
+   * volume unrolled into a single vector.
+   *
+   * A spatial batch normalization layer uses the per-channel sample
+   * mean and per-channel uncorrected sample variance during training
+   * to normalize each channel of the input data.  Additionally, it
+   * introduces learnable parameters (gamma, beta) to control the
+   * amount of normalization.
+   *
+   *   `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
+   *
+   * This implementation maintains exponential moving averages of the
+   * mean and variance during training for use during testing.
+   *
+   * Reference:
+   *  - Batch Normalization: Accelerating Deep Network Training by
+   *    Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
+   *    - https://arxiv.org/abs/1502.03167
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - gamma: Scale parameters, of shape (C, 1).
+   *  - beta: Shift parameters, of shape (C, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - mode: 'train' or 'test' to indicate if the model is currently
+   *      being trained or tested.  During training, the current batch
+   *      mean and variance will be used to normalize the inputs, while
+   *      during testing, the exponential average of the mean and
+   *      variance over all previous batches will be used.
+   *  - ema_mean: Exponential moving average of the mean, of
+   *      shape (C, 1).
+   *  - ema_var: Exponential moving average of the variance, of
+   *      shape (C, 1).
+   *  - mu: Momentum value for moving averages.
+   *      Typical values are in the range of [0.9, 0.999].
+   *  - epsilon: Smoothing term to avoid divide by zero errors.
+   *      Typical values are in the range of [1e-5, 1e-3].
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, C*Hin*Win).
+   *  - ema_mean_upd: Updated exponential moving average of the mean,
+   *      of shape (C, 1).
+   *  - ema_var_upd: Updated exponential moving average of the variance,
+   *      of shape (C, 1).
+   *  - cache_mean: Cache of the batch mean, of shape (C, 1).
+   *      Note: This is used for performance during training.
+   *  - cache_var: Cache of the batch variance, of shape (C, 1).
+   *      Note: This is used for performance during training.
+   *  - cache_norm: Cache of the normalized inputs, of
+   *      shape (N, C*Hin*Win). Note: This is used for performance
+   *      during training.
+   */
+  N = nrow(X)
+
+  if (mode == 'train') {
+    # Compute channel-wise mean and variance
+    # Since we don't have tensors, we will compute the means and variances in a piece-wise fashion.
+    #  - mean of total group is mean of subgroup means
+    #  - variance is the mean of the subgroup variances + the variance of the subgroup means
+    subgrp_means = matrix(colMeans(X), rows=C, cols=Hin*Win)
+    subgrp_vars = matrix(colVars(X) * ((N-1)/N), rows=C, cols=Hin*Win)  # uncorrected variances
+    mean = rowMeans(subgrp_means)  # shape (C, 1)
+    var = rowMeans(subgrp_vars) + rowVars(subgrp_means)*(((Hin*Win)-1)/(Hin*Win))  # shape (C, 1)
+    # Update moving averages
+    ema_mean_upd = mu*ema_mean + (1-mu)*mean
+    ema_var_upd = mu*ema_var + (1-mu)*var
+  }
+  else {
+    # Use moving averages of mean and variance during testing
+    mean = ema_mean
+    var = ema_var
+    ema_mean_upd = ema_mean
+    ema_var_upd = ema_var
+  }
+
+  # Normalize, shift, and scale
+  # norm = (X-mean)*(var+epsilon)^(-1/2)
+  #      = (X-mean) / sqrt(var+epsilon)
+  centered = bias_add(X, -mean)  # shape (N, C*Hin*Win)
+  norm = bias_multiply(centered, 1/sqrt(var+epsilon))  # shape (N, C*Hin*Win)
+  # out = norm*gamma + beta
+  scaled = bias_multiply(norm, gamma)  # shape (N, C*Hin*Win)
+  out = bias_add(scaled, beta)  # shape (N, C*Hin*Win)
+
+  # Save variables for backward pass
+  cache_mean = mean
+  cache_var = var
+  cache_norm = norm
+}
+
+backward = function(matrix[double] dout, matrix[double] out,
+                    matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
+                    matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
+                    matrix[double] X, matrix[double] gamma, matrix[double] beta,
+                    int C, int Hin, int Win, string mode,
+                    matrix[double] ema_mean, matrix[double] ema_var,
+                    double mu, double epsilon)
+      return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
+  /*
+   * Computes the backward pass for a 2D (spatial) batch normalization
+   * layer.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
+   *  - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
+   *  - ema_mean_upd: Updated exponential moving average of the mean
+   *      from the forward pass, of shape (C, 1).
+   *  - ema_var_upd: Updated exponential moving average of the variance
+   *      from the forward pass, of shape (C, 1).
+   *  - cache_mean: Cache of the batch mean from the forward pass, of
+   *      shape (C, 1).  Note: This is used for performance during
+   *      training.
+   *  - cache_var: Cache of the batch variance from the forward pass,
+   *      of shape (C, 1).  Note: This is used for performance during
+   *      training.
+   *  - cache_norm: Cache of the normalized inputs from the forward
+   *      pass, of shape (N, C*Hin*Win).  Note: This is used for
+   *      performance during training.
+   *  - X: Input data matrix to the forward pass, of
+   *      shape (N, C*Hin*Win).
+   *  - gamma: Scale parameters, of shape (C, 1).
+   *  - beta: Shift parameters, of shape (C, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - mode: 'train' or 'test' to indicate if the model is currently
+   *      being trained or tested.  During training, the current batch
+   *      mean and variance will be used to normalize the inputs, while
+   *      during testing, the exponential average of the mean and
+   *      variance over all previous batches will be used.
+   *  - ema_mean: Exponential moving average of the mean, of
+   *      shape (C, 1).
+   *  - ema_var: Exponential moving average of the variance, of
+   *      shape (C, 1).
+   *  - mu: Momentum value for moving averages.
+   *      Typical values are in the range of [0.9, 0.999].
+   *  - epsilon: Smoothing term to avoid divide by zero errors.
+   *      Typical values are in the range of [1e-5, 1e-3].
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dgamma: Gradient wrt `gamma`, of shape (C, 1).
+   *  - dbeta: Gradient wrt `beta`, of shape (C, 1).
+   *
+   */
+  N = nrow(X)
+  mean = cache_mean
+  var = cache_var
+  norm = cache_norm
+  centered = bias_add(X, -mean)  # shape (N, C*Hin*Win)
+
+  if (mode == 'train') {
+    # Compute gradients during training
+    dgamma = util::channel_sums(dout*norm, C, Hin, Win)  # shape (C, 1)
+    dbeta = util::channel_sums(dout, C, Hin, Win)  # shape (C, 1)
+    dnorm = bias_multiply(dout, gamma)  # shape (N, C*Hin*Win)
+    dvar = util::channel_sums((-1/2) * bias_multiply(centered, (var+epsilon)^(-3/2)) * dnorm,
+                              C, Hin, Win)  # shape (C, 1)
+    dmean_norm_branch = util::channel_sums(bias_multiply(dnorm, -1/sqrt(var+epsilon)), C, Hin, Win)
+    dmean_var_branch =  util::channel_sums((-2/(N*Hin*Win)) * centered, C, Hin, Win)
+    dmean_var_branch = dmean_var_branch * dvar  # we can't use a function within an expression yet
+    dmean = dmean_norm_branch + dmean_var_branch  # shape (C, 1)
+    dX_norm_branch = bias_multiply(dnorm, 1/sqrt(var+epsilon))
+    dX_mean_branch = (1/(N*Hin*Win)) * bias_add(matrix(0, rows=1, cols=C*Hin*Win), dmean)
+    dX_var_branch = (2/(N*Hin*Win)) * bias_multiply(centered, dvar)
+    dX = dX_norm_branch + dX_mean_branch + dX_var_branch  # shape (N, C*Hin*Win)
+  }
+  else {
+    # Compute gradients during testing
+    dgamma = util::channel_sums(dout*norm, C, Hin, Win)  # shape (C, 1)
+    dbeta = util::channel_sums(dout, C, Hin, Win)  # shape (C, 1)
+    dnorm = bias_multiply(dout, gamma)  # shape (N, C*Hin*Win)
+    dX = bias_multiply(dnorm, 1/sqrt(var+epsilon))  # shape (N, C*Hin*Win)
+  }
+}
+
+init = function(int C)
+    return (matrix[double] gamma, matrix[double] beta,
+            matrix[double] ema_mean, matrix[double] ema_var) {
+  /*
+   * Initialize the parameters of this layer.
+   *
+   * Note: This is just a convenience function, and parameters
+   * may be initialized manually if needed.
+   *
+   * Inputs:
+   *  - C: Number of input channels (dimensionality of input depth).
+   *
+   * Outputs:
+   *  - gamma: Scale parameters, of shape (C, 1).
+   *  - beta: Shift parameters, of shape (C, 1).
+   *  - ema_mean: Exponential moving average of the mean, of
+   *      shape (C, 1).
+   *  - ema_var: Exponential moving average of the variance, of
+   *      shape (C, 1).
+   */
+   gamma = matrix(1, rows=C, cols=1)
+   beta = matrix(0, rows=C, cols=1)
+   ema_mean = matrix(0, rows=C, cols=1)
+   ema_var = matrix(1, rows=C, cols=1)
+}
+
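
A minimal usage sketch for this layer (toy sizes; the momentum 0.9 and epsilon 1e-5 follow the ranges suggested in the docstring): a "train"-mode forward pass normalizes with the batch statistics and updates the moving averages, which a subsequent "test"-mode pass then consumes.

  source("nn/layers/batch_norm2d.dml") as batch_norm2d

  N = 4; C = 3; Hin = 2; Win = 2  # toy sizes
  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
  [gamma, beta, ema_mean, ema_var] = batch_norm2d::init(C)

  # Training-mode pass: uses batch mean/variance and updates the EMAs
  [out, ema_mean, ema_var, cache_mean, cache_var, cache_norm] = batch_norm2d::forward(X, gamma, beta,
      C, Hin, Win, "train", ema_mean, ema_var, 0.9, 1e-5)

  # Test-mode pass: uses the moving averages instead of the batch statistics
  [out_test, ema_mean, ema_var, cache_mean, cache_var, cache_norm] = batch_norm2d::forward(X, gamma, beta,
      C, Hin, Win, "test", ema_mean, ema_var, 0.9, 1e-5)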

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/conv2d.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/conv2d.dml b/scripts/nn/layers/conv2d.dml
new file mode 100644
index 0000000..9d03568
--- /dev/null
+++ b/scripts/nn/layers/conv2d.dml
@@ -0,0 +1,194 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Convolutional layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
+                   int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial convolutional layer with
+   * F filters.  The input data has N examples, each represented as a 3D
+   * volume unrolled into a single vector.
+   *
+   * This implementation uses `im2col` internally for each image to
+   * extract local image regions (patches) into columns, and then
+   * performs a matrix multiplication with the filters to compute the
+   * output maps.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      For same output height as input, set `padh = (Hf - 1) / 2`,
+   *      assuming `strideh = 1`.
+   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+   *      preserves the spatial dimensions of the input.
+   *  - padw: Padding for left and right sides.
+   *      For same output width as input, set `padw = (Wf - 1) / 2`,
+   *      assuming `stridew = 1`.
+   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+   *      preserves the spatial dimensions of the input.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  F = nrow(W)
+  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+
+  # Create output volume
+  out = matrix(0, rows=N, cols=F*Hout*Wout)
+
+  # Convolution - im2col implementation
+  parfor (n in 1:N) {  # all examples
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
+
+    # Pad image
+    Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0)  # shape (C, (Hin+2*padh)*(Win+2*padw))
+
+    # Extract local image patches into columns with im2col, of shape (C*Hf*Wf, Hout*Wout)
+    Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
+
+    # Convolve patches with filters
+    outn = W %*% Xn_padded_cols + b  # shape (F, Hout*Wout)
+    out[n,] = matrix(outn, rows=1, cols=F*Hout*Wout)  # reshape
+  }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout,
+                    matrix[double] X, matrix[double] W, matrix[double] b,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
+  /*
+   * Computes the backward pass for a 2D spatial convolutional layer
+   * with F filters.
+   *
+   * This implementation uses `im2col` and `col2im` internally.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *  - padw: Padding for left and right sides.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+   *  - db: Gradient wrt `b`, of shape (F, 1).
+   */
+  N = nrow(X)
+  F = nrow(W)
+
+  # Create gradient volumes
+  # Note: Create convenience gradient volumes for dW and db that will
+  # allow for one gradient to be stored per example, allowing for
+  # parallel computation at the expense of memory.  We will reduce at
+  # the end.
+  dX = matrix(0, rows=N, cols=C*Hin*Win)
+  dWN = matrix(0, rows=N, cols=F*C*Hf*Wf)  # dW = matrix(0, rows=F, cols=C*Hf*Wf)
+  dbN = matrix(0, rows=N, cols=F)  # db = matrix(0, rows=F, cols=1)
+
+  # Partial derivatives for convolution - im2col implementation
+  parfor (n in 1:N) {  # all examples
+    doutn = matrix(dout[n,], rows=F, cols=Hout*Wout)
+
+    # Compute dW
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
+    Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0)  # shape (C, (Hin+2*padh)*(Win+2*padw))
+    Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
+    # dW = dW + doutn %*% t(Xn_padded_cols)
+    dWN[n,] = matrix(doutn %*% t(Xn_padded_cols), rows=1, cols=F*C*Hf*Wf)
+
+    # Compute db
+    # db = db + rowSums(doutn)
+    dbN[n,] = matrix(rowSums(doutn), rows=1, cols=F)
+
+    # Compute dX
+    dXn_padded_cols = t(W) %*% doutn  # shape (C*Hf*Wf, Hout*Wout)
+    dXn_padded = util::col2im(dXn_padded_cols, C, Hin+2*padh, Win+2*padw, Hf, Wf,
+                              strideh, stridew, "add")
+    dXn = util::unpad_image(dXn_padded, Hin, Win, padh, padw)
+    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)  # reshape
+  }
+
+  # Reduce convenience gradient volumes with one gradient per example
+  # into single gradients for W and b.
+  dW = matrix(colSums(dWN), rows=F, cols=C*Hf*Wf)
+  db = matrix(colSums(dbN), rows=F, cols=1)
+}
+
+init = function(int F, int C, int Hf, int Wf)
+    return (matrix[double] W, matrix[double] b) {
+  /*
+   * Initialize the parameters of this layer.
+   *
+   * Note: This is just a convenience function, and parameters
+   * may be initialized manually if needed.
+   *
+   * We use the heuristic by He et al., which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * unit-Gaussian weights by a factor of sqrt(2/n), under the
+   * assumption of relu neurons.
+   *  - http://arxiv.org/abs/1502.01852
+   *
+   * Inputs:
+   *  - F: Number of filters.
+   *  - C: Number of input channels (dimensionality of depth).
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *
+   * Outputs:
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   */
+  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
+  b = matrix(0, rows=F, cols=1)
+}
+
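
As a usage sketch (the toy sizes are illustrative), the following exercises the im2col-based forward and backward passes on a tiny input with "same" padding:

  source("nn/layers/conv2d.dml") as conv2d

  N = 2; C = 1; Hin = 4; Win = 4   # two single-channel 4x4 images
  F = 3; Hf = 3; Wf = 3            # three 3x3 filters
  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
  [W, b] = conv2d::init(F, C, Hf, Wf)

  # stride 1 with padding (Hf-1)/2 = 1 preserves the 4x4 spatial size
  [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)

  # Backward pass with a dummy upstream gradient of ones
  dout = matrix(1, rows=N, cols=F*Hout*Wout)
  [dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)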

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/conv2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/conv2d_builtin.dml b/scripts/nn/layers/conv2d_builtin.dml
new file mode 100644
index 0000000..bda7a9c
--- /dev/null
+++ b/scripts/nn/layers/conv2d_builtin.dml
@@ -0,0 +1,160 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Convolutional layer.
+ *
+ * This implementation uses a built-in operator for higher performance.
+ */
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
+                   int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial convolutional layer with
+   * F filters.  The input data has N examples, each represented as a 3D
+   * volume unrolled into a single vector.
+   *
+   * This implementation uses a built-in operator for higher
+   * performance.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      For same output height as input, set `padh = (Hf - 1) / 2`,
+   *      assuming `strideh = 1`.
+   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+   *      preserves the spatial dimensions of the input.
+   *  - padw: Padding for left and right sides.
+   *      For same output width as input, set `padw = (Wf - 1) / 2`,
+   *      assuming `stridew = 1`.
+   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+   *      preserves the spatial dimensions of the input.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  F = nrow(W)
+  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+
+  # Convolution - built-in implementation
+  out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
+               stride=[strideh,stridew], padding=[padh,padw])
+
+  # Add bias term to each output filter
+  out = bias_add(out, b)
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout,
+                    matrix[double] X, matrix[double] W, matrix[double] b,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
+  /*
+   * Computes the backward pass for a 2D spatial convolutional layer
+   * with F filters.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      For same output height as input, set `padh = (Hf - 1) / 2`,
+   *      assuming `strideh = 1`.
+   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+   *      preserves the spatial dimensions of the input.
+   *  - padw: Padding for left and right sides.
+   *      For same output width as input, set `padw = (Wf - 1) / 2`,
+   *      assuming `stridew = 1`.
+   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+   *      preserves the spatial dimensions of the input.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+   *  - db: Gradient wrt `b`, of shape (F, 1).
+   */
+  N = nrow(X)
+  F = nrow(W)
+
+  # Partial derivatives for convolution - built-in implementation
+  dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw],
+                              input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
+  dX = conv2d_backward_data(W, dout, stride=[strideh, stridew], padding=[padh,padw],
+                            input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
+
+  # Partial derivatives for bias vector
+  db = rowSums(matrix(colSums(dout), rows=F, cols=Hout*Wout))
+}
+
+init = function(int F, int C, int Hf, int Wf)
+    return (matrix[double] W, matrix[double] b) {
+  /*
+   * Initialize the parameters of this layer.
+   *
+   * Note: This is just a convenience function, and parameters
+   * may be initialized manually if needed.
+   *
+   * We use the heuristic by He et al., which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * unit-Gaussian weights by a factor of sqrt(2/n), under the
+   * assumption of relu neurons.
+   *  - http://arxiv.org/abs/1502.01852
+   *
+   * Inputs:
+   *  - F: Number of filters.
+   *  - C: Number of input channels (dimensionality of depth).
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *
+   * Outputs:
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   */
+  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
+  b = matrix(0, rows=F, cols=1)
+}
+
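
Because `conv2d_builtin.dml` exposes the same `init`/`forward`/`backward` signatures as `conv2d.dml`, the sketch above applies unchanged; only the source line differs:

  source("nn/layers/conv2d_builtin.dml") as conv2d  # drop-in replacement backed by the built-in operator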

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/cross_entropy_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/cross_entropy_loss.dml b/scripts/nn/layers/cross_entropy_loss.dml
new file mode 100644
index 0000000..63db502
--- /dev/null
+++ b/scripts/nn/layers/cross_entropy_loss.dml
@@ -0,0 +1,78 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Cross-Entropy loss function.
+ */
+
+forward = function(matrix[double] pred, matrix[double] y)
+    return (double loss) {
+  /*
+   * Computes the forward pass for a cross-entropy loss function.  The
+   * inputs consist of N examples, each with K dimensions corresponding
+   * to normalized probabilities of K classes.
+   *
+   *   ```
+   *   L_i = -y_i^T * log(pred_i)
+   *   L = (1/N) sum(L_i) for i=1 to N
+   *   ```
+   *
+   * In these equations, `L` is the total loss, `L_i` is the loss for
+   * example `i`, `y_i` is the K-dimensional vector of target class
+   * probabilities, `pred_i` is K-dimensional vector of predicted
+   * class probabilities, and `N` is the number of examples.
+   *
+   * This can be interpreted as the negative log-likelihood assuming
+   * a Bernoulli distribution generalized to K dimensions, or a
+   * Multinomial with one observation.
+   *
+   * Inputs:
+   *  - pred: Predictions, of shape (N, K).
+   *  - y: Targets, of shape (N, K).
+   *
+   * Outputs:
+   *  - loss: Average loss.
+   */
+  N = nrow(y)
+  eps = 1e-10  # numerical stability to avoid log(0)
+  losses = rowSums(-y * log(pred+eps))
+  loss = sum(losses) / N
+}
+
+backward = function(matrix[double] pred, matrix[double] y)
+    return (matrix[double] dpred) {
+  /*
+   * Computes the backward pass of a cross-entropy loss function.  The
+   * inputs consist of N examples, each with K dimensions corresponding
+   * to normalized probabilities of K classes.
+   *
+   * Inputs:
+   *  - pred: Predictions, of shape (N, K).
+   *  - y: Targets, of shape (N, K).
+   *
+   * Outputs:
+   *  - dpred: Gradient wrt `pred`, of shape (N, K).
+   */
+  N = nrow(y)
+  eps = 1e-10  # numerical stability to avoid divide-by-zero
+  dpred = (1/N) * -y * (1/(pred+eps))
+}
+
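
A small worked sketch pairing this loss with the softmax layer used elsewhere in this commit (the scores and one-hot targets are illustrative):

  source("nn/layers/softmax.dml") as softmax
  source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss

  # Two examples, three classes: raw scores -> probabilities -> loss and gradient
  scores = matrix("1 2 3  3 2 1", rows=2, cols=3)
  y = matrix("0 0 1  1 0 0", rows=2, cols=3)  # one-hot targets
  probs = softmax::forward(scores)
  loss = cross_entropy_loss::forward(probs, y)    # average of -y_i^T * log(pred_i)
  dprobs = cross_entropy_loss::backward(probs, y)
  print("cross-entropy loss: " + loss)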

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/dropout.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/dropout.dml b/scripts/nn/layers/dropout.dml
new file mode 100644
index 0000000..a36878b
--- /dev/null
+++ b/scripts/nn/layers/dropout.dml
@@ -0,0 +1,76 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Dropout layer.
+ */
+
+forward = function(matrix[double] X, double p, int seed)
+    return (matrix[double] out, matrix[double] mask) {
+  /*
+   * Computes the forward pass for an inverted dropout layer.
+   *
+   * Drops each input element with probability 1-p (i.e., keeps it
+   * with probability p), and divides the kept values by p so that
+   * the expected value of each output matches its value at test time.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (any, any).
+   *  - p: Probability of keeping a neuron output.
+   *  - seed: [Optional: -1] Random number generator seed to allow for
+   *      deterministic evaluation.  Set to -1 for a random seed.
+   *
+   * Outputs:
+   *  - out: Outputs, of same shape as `X`.
+   *  - mask: Dropout mask used to compute the output.
+   */
+  # Normally, we might use something like
+  #    `mask = rand(rows=nrow(X), cols=ncol(X), min=0, max=1, seed=seed) <= p`
+  # to create a dropout mask.  Fortunately, SystemML has a `sparsity` parameter on
+  # the `rand` function that allows us to create a mask directly.
+  if (seed == -1) {
+    mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p)
+  } else {
+    mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p, seed=seed)
+  }
+  out = X * mask / p
+}
+
+backward = function(matrix[double] dout, matrix[double] X, double p, matrix[double] mask)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for an inverted dropout layer.
+   *
+   * Applies the mask to the upstream gradient, and divides by p to
+   * maintain the expected values at test time.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out`, of same shape as `X`.
+   *  - X: Inputs, of shape (any, any).
+   *  - p: Probability of keeping a neuron output.
+   *  - mask: Dropout mask used to compute the output.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
+   */
+  dX = mask / p * dout
+}
+
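
A brief usage sketch (the sizes, keep-probability, and seed are illustrative). With a fixed seed the mask is reproducible; passing -1 draws a random mask:

  source("nn/layers/dropout.dml") as dropout

  X = rand(rows=4, cols=6, pdf="normal")
  p = 0.5  # probability of keeping each element
  [out, mask] = dropout::forward(X, p, 42)  # kept entries are scaled by 1/p
  dout = matrix(1, rows=4, cols=6)          # dummy upstream gradient
  dX = dropout::backward(dout, X, p, mask)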

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/l1_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/l1_loss.dml b/scripts/nn/layers/l1_loss.dml
new file mode 100644
index 0000000..b74566d
--- /dev/null
+++ b/scripts/nn/layers/l1_loss.dml
@@ -0,0 +1,72 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * L1 loss function.
+ */
+
+forward = function(matrix[double] pred, matrix[double] y)
+    return (double loss) {
+  /*
+   * Computes the forward pass for an L1 loss function.  The inputs
+   * consist of N examples, each with M dimensions to predict.
+   *
+   *   ```
+   *   L_i = sum_j(abs((pred_i)_j - (y_i)_j)) for all j.
+   *   L = (1/N) sum(L_i) for i=1 to N
+   *   ```
+   *
+   * In these equations, `L` is the total loss, `L_i` is the loss for
+   * example `i`, `y_i` is the vector of targets for example `i`,
+   * `pred_i` is the corresponding vector of predictions, and `N` is
+   * the number of examples.
+   *
+   * This can be interpreted as the negative log-likelihood assuming
+   * a Laplace distribution.
+   *
+   * Inputs:
+   *  - pred: Predictions, of shape (N, M).
+   *  - y: Targets, of shape (N, M).
+   *
+   * Outputs:
+   *  - loss: Average loss.
+   */
+  N = nrow(y)
+  losses = rowSums(abs(pred-y))
+  loss = sum(losses) / N
+}
+
+backward = function(matrix[double] pred, matrix[double] y)
+    return (matrix[double] dpred) {
+  /*
+   * Computes the backward pass for an L1 loss function.  The inputs
+   * consist of N examples, each with M dimensions to predict.
+   *
+   * Inputs:
+   *  - pred: Predictions, of shape (N, M).
+   *  - y: Targets, of shape (N, M).
+   *
+   * Outputs:
+   *  - dpred: Gradient wrt `pred`, of shape (N, M).
+   */
+  N = nrow(y)
+  dpred = sign(pred-y) / N
+}
+
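
A tiny worked sketch (the values are illustrative): the per-example absolute-error sums below are 0.5 and 1.5, so the average loss is 1.0.

  source("nn/layers/l1_loss.dml") as l1_loss

  pred = matrix("1.0 2.0  0.5 0.0", rows=2, cols=2)
  y    = matrix("1.5 2.0  0.0 1.0", rows=2, cols=2)
  loss = l1_loss::forward(pred, y)    # (0.5 + 1.5) / 2 = 1.0
  dpred = l1_loss::backward(pred, y)  # sign(pred-y) / N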

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/l1_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/l1_reg.dml b/scripts/nn/layers/l1_reg.dml
new file mode 100644
index 0000000..2b81c0b
--- /dev/null
+++ b/scripts/nn/layers/l1_reg.dml
@@ -0,0 +1,56 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * L1 regularization.
+ */
+
+forward = function(matrix[double] X, double lambda)
+    return (double reg_loss) {
+  /*
+   * Computes the forward pass for an L1 regularization function.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (any, any).
+   *  - lambda: Regularization strength.
+   *      A typical value is 0.01.
+   *
+   * Outputs:
+   *  - reg_loss: Total regularization loss.
+   */
+  reg_loss = lambda * sum(abs(X))
+}
+
+backward = function(matrix[double] X, double lambda)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for an L1 regularization function.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (any, any).
+   *  - lambda: Regularization strength.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
+   */
+  dX = lambda * sign(X)
+}
+
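
Usage sketch (the lambda value is illustrative), mirroring how regularizers are combined with a data loss in the LeNet example above: the regularization loss is added to the data loss, and its gradient is added to the weight gradient.

  source("nn/layers/l1_reg.dml") as l1_reg

  W = rand(rows=10, cols=5, pdf="normal")
  lambda = 0.01
  reg_loss = l1_reg::forward(W, lambda)  # lambda * sum(abs(W)), added to the data loss
  dW_reg = l1_reg::backward(W, lambda)   # lambda * sign(W), added to dW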

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/l2_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/l2_loss.dml b/scripts/nn/layers/l2_loss.dml
new file mode 100644
index 0000000..0482f25
--- /dev/null
+++ b/scripts/nn/layers/l2_loss.dml
@@ -0,0 +1,72 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * L2 loss function.
+ */
+
+forward = function(matrix[double] pred, matrix[double] y)
+    return (double loss) {
+  /*
+   * Computes the forward pass for an L2 loss function.  The inputs
+   * consist of N examples, each with M dimensions to predict.
+   *
+   *   ```
+   *   L_i = (1/2) norm(pred_i - y_i)^2
+   *   L = (1/N) sum(L_i) for i=1 to N
+   *   ```
+   *
+   * In these equations, `L` is the total loss, `L_i` is the loss for
+   * example `i`, `y_i` is the vector of targets for example `i`,
+   * `pred_i` is the corresponding vector of predictions, and `N` is
+   * the number of examples.
+   *
+   * This can be interpreted as the negative log-likelihood assuming
+   * a Gaussian distribution.
+   *
+   * Inputs:
+   *  - pred: Predictions, of shape (N, M).
+   *  - y: Targets, of shape (N, M).
+   *
+   * Outputs:
+   *  - loss: Average loss.
+   */
+  N = nrow(y)
+  losses = 0.5 * rowSums((pred-y)^2)
+  loss = sum(losses) / N
+}
+
+backward = function(matrix[double] pred, matrix[double] y)
+    return (matrix[double] dpred) {
+  /*
+   * Computes the backward pass for an L2 loss function.  The inputs
+   * consist of N examples, each with M dimensions to predict.
+   *
+   * Inputs:
+   *  - pred: Predictions, of shape (N, M).
+   *  - y: Targets, of shape (N, M).
+   *
+   * Outputs:
+   *  - dpred: Gradient wrt `pred`, of shape (N, M).
+   */
+  N = nrow(y)
+  dpred = (pred-y) / N
+}
+
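
An analogous sketch for the L2 loss (the values are illustrative): the per-example losses below are 0.125 and 0.625, so the average loss is 0.375.

  source("nn/layers/l2_loss.dml") as l2_loss

  pred = matrix("1.0 2.0  0.5 0.0", rows=2, cols=2)
  y    = matrix("1.5 2.0  0.0 1.0", rows=2, cols=2)
  loss = l2_loss::forward(pred, y)    # (0.125 + 0.625) / 2 = 0.375
  dpred = l2_loss::backward(pred, y)  # (pred-y) / N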

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/l2_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/l2_reg.dml b/scripts/nn/layers/l2_reg.dml
new file mode 100644
index 0000000..7255efe
--- /dev/null
+++ b/scripts/nn/layers/l2_reg.dml
@@ -0,0 +1,56 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * L2 regularization.
+ */
+
+forward = function(matrix[double] X, double lambda)
+    return (double reg_loss) {
+  /*
+   * Computes the forward pass for an L2 regularization function.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (any, any).
+   *  - lambda: Regularization strength.
+   *      A typical value is 0.01.
+   *
+   * Outputs:
+   *  - reg_loss: Total regularization loss.
+   */
+  reg_loss = 0.5 * lambda * sum(X^2)
+}
+
+backward = function(matrix[double] X, double lambda)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for an L2 regularization function.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (any, any).
+   *  - lambda: Regularization strength.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
+   */
+  dX = lambda * X
+}
+
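
In practice the regularization term is simply added to the data loss, and its gradient to the corresponding weight gradient; a short sketch (hypothetical weight matrix, illustrative lambda):

  source("nn/layers/l2_reg.dml") as l2_reg

  lambda = 0.01
  W = rand(rows=10, cols=5, min=-0.1, max=0.1, pdf="uniform")
  reg_loss = l2_reg::forward(W, lambda)  # 0.5 * lambda * sum(W^2), added to the data loss
  dW_reg = l2_reg::backward(W, lambda)   # lambda * W, added to the data-loss gradient dW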

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/log_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/log_loss.dml b/scripts/nn/layers/log_loss.dml
new file mode 100644
index 0000000..15914f7
--- /dev/null
+++ b/scripts/nn/layers/log_loss.dml
@@ -0,0 +1,76 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Log loss function.
+ */
+
+forward = function(matrix[double] pred, matrix[double] y)
+    return (double loss) {
+  /*
+   * Computes the forward pass for a log loss function.
+   *
+   *   ```
+   *   L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i)
+   *   L = (1/N) sum(L_i) for i=1 to N
+   *   ```
+   *
+   * In these equations, `L` is the total loss, `L_i` is the loss for
+   * example `i`, `y_i` is the binary target, `pred_i` is the predicted
+   * probability of the true class (i.e. `y=1`), and `N` is the number
+   * of examples.
+   *
+   * This can be interpreted as the negative log-likelihood assuming
+   * a Bernoulli distribution.
+   *
+   * Inputs:
+   *  - pred: Predictions, of shape (N, 1).
+   *      Predictions should be probabilities of the true
+   *      class (i.e. probability of `y=1`).
+   *  - y: Targets, of shape (N, 1).
+   *      Targets should be binary in the set {0, 1}.
+   *
+   * Outputs:
+   *  - loss: Average loss.
+   */
+  N = nrow(y)
+  losses = -y*log(pred) - (1-y)*log(1-pred)
+  loss = sum(losses) / N
+}
+
+backward = function(matrix[double] pred, matrix[double] y)
+    return (matrix[double] dpred) {
+  /*
+   * Computes the backward pass for a log loss function.
+   *
+   * Inputs:
+   *  - pred: Predictions, of shape (N, 1).
+   *      Predictions should be probabilities of the true
+   *      class (i.e. probability of `y=1`).
+   *  - y: Targets, of shape (N, 1).
+   *      Targets should be binary in the set {0, 1}.
+   *
+   * Outputs:
+   *  - dpred: Gradient wrt `pred`, of shape (N, 1).
+   */
+  N = nrow(y)
+  dpred = (1/N) * (pred-y) / (pred*(1-pred))
+}
+
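
Since the predictions must be probabilities of the true class, this loss is typically paired with a sigmoid output layer; a sketch of that chain (illustrative scores and targets):

  source("nn/layers/sigmoid.dml") as sigmoid
  source("nn/layers/log_loss.dml") as log_loss

  N = 8
  scores = rand(rows=N, cols=1, min=-2, max=2, pdf="uniform")   # raw model scores
  y = round(rand(rows=N, cols=1, min=0, max=1, pdf="uniform"))  # binary targets in {0, 1}

  probs = sigmoid::forward(scores)             # probabilities of y=1
  loss = log_loss::forward(probs, y)
  dprobs = log_loss::backward(probs, y)
  dscores = sigmoid::backward(dprobs, scores)  # chain rule back through the sigmoid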

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/lstm.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/lstm.dml b/scripts/nn/layers/lstm.dml
new file mode 100644
index 0000000..a75add4
--- /dev/null
+++ b/scripts/nn/layers/lstm.dml
@@ -0,0 +1,260 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * LSTM layer.
+ */
+source("nn/layers/sigmoid.dml") as sigmoid
+source("nn/layers/tanh.dml") as tanh
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
+                   boolean return_sequences, matrix[double] out0, matrix[double] c0)
+    return (matrix[double] out, matrix[double] c,
+            matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog) {
+  /*
+   * Computes the forward pass for an LSTM layer with M neurons.
+   * The input data has N sequences, each with T timesteps of D features.
+   *
+   * In an LSTM, an internal cell state is maintained, additive
+   * interactions operate over the cell state at each timestep, and
+   * some amount of this cell state is exposed as output at each
+   * timestep.  Additionally, the output of the previous timestep is fed
+   * back in as an additional input at the current timestep.
+   *
+   * Reference:
+   *  - Long Short-Term Memory, Hochreiter & Schmidhuber, 1997
+   *    - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, T*D).
+   *  - W: Weights, of shape (D+M, 4M).
+   *  - b: Biases, of shape (1, 4M).
+   *  - T: Length of example sequences (number of timesteps).
+   *  - D: Dimensionality of the input features (number of features).
+   *  - return_sequences: Whether to return `out` at all timesteps,
+   *      or just for the final timestep.
+   *  - out0: Outputs from previous timestep, of shape (N, M).
+   *      Note: This is *optional* and could just be an empty matrix.
+   *  - c0: Initial cell state, of shape (N, M).
+   *      Note: This is *optional* and could just be an empty matrix.
+   *
+   * Outputs:
+   *  - out: If `return_sequences` is True, outputs for all timesteps,
+   *      of shape (N, T*M).  Else, outputs for the final timestep, of
+   *      shape (N, M).
+   *  - c: Cell state for final timestep, of shape (N, M).
+   *  - cache_out: Cache of outputs, of shape (T, N*M).
+   *      Note: This is used for performance during training.
+   *  - cache_c: Cache of cell state, of shape (T, N*M).
+   *      Note: This is used for performance during training.
+   *  - cache_ifog: Cache of intermediate values, of shape (T, N*4M).
+   *      Note: This is used for performance during training.
+   */
+  N = nrow(X)
+  M = as.integer(ncol(W)/4)
+  out_prev = out0
+  c_prev = c0
+  c = c_prev
+  if (return_sequences) {
+    out = matrix(0, rows=N, cols=T*M)
+  }
+  else {
+    out = matrix(0, rows=N, cols=M)
+  }
+  # caches to be used during the backward pass for performance
+  cache_out = matrix(0, rows=T, cols=N*M)
+  cache_c = matrix(0, rows=T, cols=N*M)
+  cache_ifog = matrix(0, rows=T, cols=N*4*M)
+
+  for (t in 1:T) {  # each timestep
+    X_t = X[,(t-1)*D+1:t*D]  # shape (N, D)
+    input = cbind(X_t, out_prev)  # shape (N, D+M)
+    ifog = input %*% W + b  # input, forget, output, and g gates; shape (N, 4M)
+    tmp = sigmoid::forward(ifog[,1:3*M])  # i,f,o gates squashed with sigmoid
+    ifog[,1:3*M] = tmp
+    tmp = tanh::forward(ifog[,3*M+1:4*M])  # g gate squashed with tanh
+    ifog[,3*M+1:4*M] = tmp
+    # c_t = f*prev_c + i*g
+    c = ifog[,M+1:2*M]*c_prev + ifog[,1:M]*ifog[,3*M+1:4*M]  # shape (N, M)
+    # out_t = o*tanh(c)
+    tmp = tanh::forward(c)
+    out_t = ifog[,2*M+1:3*M] * tmp  # shape (N, M)
+
+    # store
+    if (return_sequences) {
+      out[,(t-1)*M+1:t*M] = out_t
+    }
+    else {
+      out = out_t
+    }
+    out_prev = out_t
+    c_prev = c
+    cache_out[t,] = matrix(out_t, rows=1, cols=N*M)  # reshape
+    cache_c[t,] = matrix(c, rows=1, cols=N*M)  # reshape
+    cache_ifog[t,] = matrix(ifog, rows=1, cols=N*4*M)  # reshape
+  }
+}
+
+backward = function(matrix[double] dout, matrix[double] dc,
+                    matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
+                    boolean given_sequences, matrix[double] out0, matrix[double] c0,
+                    matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog)
+    return (matrix[double] dX, matrix[double] dW, matrix[double] db,
+            matrix[double] dout0, matrix[double] dc0) {
+  /*
+   * Computes the backward pass for an LSTM layer with M neurons.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out`.  If `given_sequences` is `True`,
+   *      contains gradients on outputs for all timesteps, of
+   *      shape (N, T*M). Else, contains the gradient on the output
+   *      for the final timestep, of shape (N, M).
+   *  - dc: Gradient wrt `c` (from later in time), of shape (N, M).
+   *      This would come from later in time if the cell state was used
+   *      downstream as the initial cell state for another LSTM layer.
+   *      Typically, this would be used when a sequence was cut at
+   *      timestep `T` and then continued in the next batch.  If `c`
+   *      was not used downstream, then `dc` would be an empty matrix.
+   *  - X: Inputs, of shape (N, T*D).
+   *  - W: Weights, of shape (D+M, 4M).
+   *  - b: Biases, of shape (1, 4M).
+   *  - T: Length of example sequences (number of timesteps).
+   *  - D: Dimensionality of the input features.
+   *  - given_sequences: Whether `dout` is for all timesteps,
+   *      or just for the final timestep.  This is based on whether
+   *      `return_sequences` was true in the forward pass.
+   *  - out0: Outputs from previous timestep, of shape (N, M).
+   *      Note: This is *optional* and could just be an empty matrix.
+   *  - c0: Initial cell state, of shape (N, M).
+   *      Note: This is *optional* and could just be an empty matrix.
+   *  - cache_out: Cache of outputs, of shape (T, N*M).
+   *      Note: This is used for performance during training.
+   *  - cache_c: Cache of cell state, of shape (T, N*M).
+   *      Note: This is used for performance during training.
+   *  - cache_ifog: Cache of intermediate values, of shape (T, N*4*M).
+   *      Note: This is used for performance during training.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, T*D).
+   *  - dW: Gradient wrt `W`, of shape (D+M, 4M).
+   *  - db: Gradient wrt `b`, of shape (1, 4M).
+   *  - dout0: Gradient wrt `out0`, of shape (N, M).
+   *  - dc0: Gradient wrt `c0`, of shape (N, M).
+   */
+  N = nrow(X)
+  M = as.integer(ncol(W)/4)
+  dX = matrix(0, rows=N, cols=T*D)
+  dW = matrix(0, rows=D+M, cols=4*M)
+  db = matrix(0, rows=1, cols=4*M)
+  dout0 = matrix(0, rows=N, cols=M)
+  dc0 = matrix(0, rows=N, cols=M)
+  dct = dc
+  if (!given_sequences) {
+    # only given dout for output at final timestep, so prepend empty douts for all other timesteps
+    dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout)  # shape (N, T*M)
+  }
+
+  t = T
+  for (iter in 1:T) {  # each timestep in reverse order
+    X_t = X[,(t-1)*D+1:t*D]  # shape (N, D)
+    dout_t = dout[,(t-1)*M+1:t*M]  # shape (N, M)
+    out_t = matrix(cache_out[t,], rows=N, cols=M)  # shape (N, M)
+    ct = matrix(cache_c[t,], rows=N, cols=M)  # shape (N, M)
+    if (t == 1) {
+      out_prev = out0  # shape (N, M)
+      c_prev = c0  # shape (N, M)
+    }
+    else {
+      out_prev = matrix(cache_out[t-1,], rows=N, cols=M)  # shape (N, M)
+      c_prev = matrix(cache_c[t-1,], rows=N, cols=M)  # shape (N, M)
+    }
+    input = cbind(X_t, out_prev)  # shape (N, D+M)
+    ifog = matrix(cache_ifog[t,], rows=N, cols=4*M)
+    i = ifog[,1:M]  # input gate, shape (N, M)
+    f = ifog[,M+1:2*M]  # forget gate, shape (N, M)
+    o = ifog[,2*M+1:3*M]  # output gate, shape (N, M)
+    g = ifog[,3*M+1:4*M]  # g gate, shape (N, M)
+
+    tmp = tanh::backward(dout_t, ct)
+    dct = dct + o*tmp  # shape (N, M)
+    tmp = tanh::forward(ct)
+    do = tmp * dout_t  # output gate, shape (N, M)
+    df = c_prev * dct  # forget gate, shape (N, M)
+    dc_prev = f * dct  # shape (N, M)
+    di = g * dct  # input gate, shape (N, M)
+    dg = i * dct  # g gate, shape (N, M)
+
+    di_raw = i * (1-i) * di
+    df_raw = f * (1-f) * df
+    do_raw = o * (1-o) * do
+    dg_raw = (1-g^2) * dg
+    difog_raw = cbind(di_raw, cbind(df_raw, cbind(do_raw, dg_raw)))  # shape (N, 4M)
+
+    dW = dW + t(input) %*% difog_raw  # shape (D+M, 4M)
+    db = db + colSums(difog_raw)  # shape (1, 4M)
+    dinput = difog_raw %*% t(W)  # shape (N, D+M)
+    dX[,(t-1)*D+1:t*D] = dinput[,1:D]
+    dout_prev = dinput[,D+1:D+M]  # shape (N, M)
+    if (t == 1) {
+      dout0 = dout_prev  # shape (N, M)
+      dc0 = dc_prev  # shape (N, M)
+    }
+    else {
+      dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev  # shape (N, M)
+      dct = dc_prev  # shape (N, M)
+    }
+    t = t - 1
+  }
+}
+
+init = function(int N, int D, int M)
+    return (matrix[double] W, matrix[double] b, matrix[double] out0, matrix[double] c0) {
+  /*
+   * Initialize the parameters of this layer.
+   *
+   * Note: This is just a convenience function, and parameters
+   * may be initialized manually if needed.
+   *
+   * We use the Glorot uniform heuristic which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
+   *  - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
+   *
+   * Inputs:
+   *  - N: Number of examples in batch.
+   *  - D: Dimensionality of the input features (number of features).
+   *  - M: Number of neurons in this layer.
+   *
+   * Outputs:
+   *  - W: Weights, of shape (D+M, 4M).
+   *  - b: Biases, of shape (1, 4M).
+   *  - out0: Empty previous timestep output matrix, of shape (N, M).
+   *  - c0: Empty initial cell state matrix, of shape (N, M).
+   */
+  fan_in = D+M
+  fan_out = 4*M
+  scale = sqrt(6/(fan_in+fan_out))
+  W = rand(rows=D+M, cols=4*M, min=-scale, max=scale, pdf="uniform")
+  b = matrix(0, rows=1, cols=4*M)
+  out0 = matrix(0, rows=N, cols=M)
+  c0 = matrix(0, rows=N, cols=M)
+}
+
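
A minimal forward-pass sketch using the convenience initializer (dimensions are illustrative):

  source("nn/layers/lstm.dml") as lstm

  N = 4; T = 10; D = 16; M = 32
  X = rand(rows=N, cols=T*D, min=-1, max=1, pdf="uniform")
  [W, b, out0, c0] = lstm::init(N, D, M)
  [out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, TRUE, out0, c0)
  # return_sequences=TRUE, so out has shape (N, T*M); c is the final cell state, shape (N, M)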

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/max_pool2d.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/max_pool2d.dml b/scripts/nn/layers/max_pool2d.dml
new file mode 100644
index 0000000..fba1a4c
--- /dev/null
+++ b/scripts/nn/layers/max_pool2d.dml
@@ -0,0 +1,159 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Max Pooling layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * This implementation uses `im2col` internally for each image to
+   * extract local image regions (patches) of each channel slice into
+   * columns, and then performs max pooling over the patches to compute
+   * the output maps.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+  pad_value = -1/0  # in max pooling we pad with -infinity
+
+  # Create output volume
+  out = matrix(0, rows=N, cols=C*Hout*Wout)
+
+  # Max pooling - im2col implementation
+  parfor (n in 1:N) {  # all examples
+    img = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
+
+    if (padh > 0 | padw > 0) {
+      # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
+      img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
+    }
+
+    img_maxes = matrix(0, rows=C, cols=Hout*Wout)  # zeros
+    parfor (c in 1:C) {  # all channels
+      # Extract local image slice patches into columns with im2col, of shape (Hf*Wf, Hout*Wout)
+      img_slice_cols = util::im2col(img[c,], Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
+
+      # Max pooling on patches
+      img_maxes[c,] = colMaxs(img_slice_cols)
+    }
+
+    out[n,] = matrix(img_maxes, rows=1, cols=C*Hout*Wout)
+  }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   */
+  N = nrow(X)
+  pad_value = -1/0  # in max pooling we pad with -infinity
+
+  # Create gradient volume
+  dX = matrix(0, rows=N, cols=C*Hin*Win)
+
+  # Gradient of max pooling
+  parfor (n in 1:N, check=0) {  # all examples
+    img = matrix(X[n,], rows=C, cols=Hin*Win)
+    if (padh > 0 | padw > 0) {
+      # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
+      img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
+    }
+
+    dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
+    parfor (c in 1:C, check=0) {  # all channels
+      img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
+      for (hout in 1:Hout, check=0) {  # all output rows
+        hin = (hout-1)*strideh + 1
+        for (wout in 1:Wout) {  # all output columns
+          win = (wout-1)*stridew + 1
+          img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
+          max_val_ind = img_slice_patch == max(img_slice_patch)  # max value indicator matrix
+          # gradient passes through only for the max value(s) in this patch
+          dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
+          dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
+                                                   + dimg_slice_patch
+        }
+      }
+      dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
+    }
+
+    if (padh > 0 | padw > 0) {
+      # Unpad image gradient
+      dimg = util::unpad_image(dimg, Hin, Win, padh, padw)  # shape (C, Hin*Win)
+    }
+    dX[n,] = matrix(dimg, rows=1, cols=C*Hin*Win)
+  }
+}
+
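
A usage sketch for 2x2, stride-2 pooling (illustrative dimensions; the output size follows the floor formula above):

  source("nn/layers/max_pool2d.dml") as max_pool2d

  N = 2; C = 3; Hin = 28; Win = 28
  X = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform")
  # Hout = Wout = floor((28 + 0 - 2)/2 + 1) = 14
  [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
  dout = rand(rows=N, cols=C*Hout*Wout, min=-1, max=1, pdf="uniform")  # stand-in upstream gradient
  dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, 2, 2, 2, 2, 0, 0)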

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/max_pool2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/max_pool2d_builtin.dml b/scripts/nn/layers/max_pool2d_builtin.dml
new file mode 100644
index 0000000..880f818
--- /dev/null
+++ b/scripts/nn/layers/max_pool2d_builtin.dml
@@ -0,0 +1,103 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Max Pooling layer.
+ *
+ * This implementation uses a built-in operator for higher performance.
+ */
+
+forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * This implementation uses a built-in operator for higher
+   * performance.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+
+  # Max pooling - built-in implementation
+  out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
+                 stride=[strideh,stridew], padding=[padh,padw])
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   */
+  N = nrow(X)
+
+  # Gradient of max pooling
+  dX = max_pool_backward(X, dout, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
+                         stride=[strideh,stridew], padding=[padh,padw])
+}
+
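
The built-in variant exposes the same interface as the im2col version above, so the two can be swapped freely; a quick, illustrative equivalence check:

  source("nn/layers/max_pool2d.dml") as max_pool2d_simple
  source("nn/layers/max_pool2d_builtin.dml") as max_pool2d

  N = 2; C = 3; Hin = 8; Win = 8
  X = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform")
  [out1, Hout1, Wout1] = max_pool2d_simple::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
  [out2, Hout2, Wout2] = max_pool2d::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
  print("max abs diff: " + max(abs(out1 - out2)))  # expected to be 0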

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/relu.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/relu.dml b/scripts/nn/layers/relu.dml
new file mode 100644
index 0000000..93a6e90
--- /dev/null
+++ b/scripts/nn/layers/relu.dml
@@ -0,0 +1,59 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Rectified Linear Unit (ReLU) nonlinearity layer.
+ */
+
+forward = function(matrix[double] X)
+    return (matrix[double] out) {
+  /*
+   * Computes the forward pass for a ReLU nonlinearity layer.
+   *
+   * Performs an element-wise evaluation of `f(input) = max(0, input)`.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (any, any).
+   *
+   * Outputs:
+   *  - out: Outputs, of same shape as `X`.
+   */
+  out = max(X, 0)
+}
+
+backward = function(matrix[double] dout, matrix[double] X)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for a ReLU nonlinearity layer.
+   *
+   * Essentially performs a pass-through of the upstream gradient
+   * for cells > 0.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+   *  - X: Previous input data matrix, of shape (any, any).
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
+   */
+   dX = (X > 0) * dout
+}
+
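
A tiny sketch of the forward/backward behavior on hand-picked values:

  source("nn/layers/relu.dml") as relu

  X = matrix("-2 -1 0 1 2 3", rows=2, cols=3)
  out = relu::forward(X)        # negatives clamped to 0
  dout = matrix(1, rows=2, cols=3)
  dX = relu::backward(dout, X)  # upstream gradient passes through only where X > 0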

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/rnn.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/rnn.dml b/scripts/nn/layers/rnn.dml
new file mode 100644
index 0000000..3c6faae
--- /dev/null
+++ b/scripts/nn/layers/rnn.dml
@@ -0,0 +1,183 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Simple (Vanilla) RNN layer.
+ */
+source("nn/layers/tanh.dml") as tanh
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
+                   boolean return_sequences, matrix[double] out0)
+    return (matrix[double] out, matrix[double] cache_out) {
+  /*
+   * Computes the forward pass for a simple RNN layer with M neurons.
+   * The input data has N sequences, each with T timesteps of D features.
+   *
+   * In a simple RNN, the output of the previous timestep is fed back
+   * in as an additional input at the current timestep.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, T*D).
+   *  - W: Weights, of shape (D+M, M).
+   *  - b: Biases, of shape (1, M).
+   *  - T: Length of example sequences (number of timesteps).
+   *  - D: Dimensionality of the input features (number of features).
+   *  - return_sequences: Whether to return `out` at all timesteps,
+   *      or just for the final timestep.
+   *  - out0: Output matrix from previous timestep, of shape (N, M).
+   *      Note: This is *optional* and could just be an empty matrix.
+   *
+   * Outputs:
+   *  - out: If `return_sequences` is True, outputs for all timesteps,
+   *      of shape (N, T*M).  Else, outputs for the final timestep, of
+   *      shape (N, M).
+   *  - cache_out: Cache of outputs, of shape (T, N*M).
+   *      Note: This is used for performance during training.
+   */
+  N = nrow(X)
+  M = ncol(W)
+  out_prev = out0
+  if (return_sequences) {
+    out = matrix(0, rows=N, cols=T*M)
+  }
+  else {
+    out = matrix(0, rows=N, cols=M)
+  }
+  # caches to be used during the backward pass for performance
+  cache_out = matrix(0, rows=T, cols=N*M)
+
+  for (t in 1:T) {  # each timestep
+    X_t = X[,(t-1)*D+1:t*D]  # shape (N, D)
+    input = cbind(X_t, out_prev)  # shape (N, D+M)
+    out_t = tanh::forward(input %*% W + b)  # shape (N, M)
+    # store
+    if (return_sequences) {
+      out[,(t-1)*M+1:t*M] = out_t
+    }
+    else {
+      out = out_t
+    }
+    out_prev = out_t
+    cache_out[t,] = matrix(out_t, rows=1, cols=N*M)  # reshape
+  }
+}
+
+backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, matrix[double] b,
+                    int T, int D, boolean given_sequences, matrix[double] out0,
+                    matrix[double] cache_out)
+    return (matrix[double] dX, matrix[double] dW, matrix[double] db, matrix[double] dout0) {
+  /*
+   * Computes the backward pass for a simple RNN layer with M neurons.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream.  If `given_sequences`
+   *      is True, contains gradients on outputs for all timesteps,
+   *      of shape (N, T*M).  Else, contains gradient on output for
+   *      the final timestep, of shape (N, M).
+   *  - X: Inputs, of shape (N, T*D).
+   *  - W: Weights, of shape (D+M, M).
+   *  - b: Biases, of shape (1, M).
+   *  - T: Length of example sequences (number of timesteps).
+   *  - D: Dimensionality of the input features (number of features).
+   *  - given_sequences: Whether `dout` is for all timesteps,
+   *      or just for the final timestep.  This is based on whether
+   *      `return_sequences` was true in the forward pass.
+   *  - out0: Output matrix from previous timestep, of shape (N, M).
+   *      Note: This is *optional* and could just be an empty matrix.
+   *  - cache_out: Cache of outputs, of shape (T, N*M).
+   *      Note: This is used for performance during training.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, T*D).
+   *  - dW: Gradient wrt `W`, of shape (D+M, M).
+   *  - db: Gradient wrt `b`, of shape (1, M).
+   *  - dout0: Gradient wrt `out0`, of shape (N, M).
+   */
+  N = nrow(X)
+  M = ncol(W)
+  dX = matrix(0, rows=N, cols=T*D)
+  dW = matrix(0, rows=D+M, cols=M)
+  db = matrix(0, rows=1, cols=M)
+  dout0 = matrix(0, rows=N, cols=M)
+  if (!given_sequences) {
+    # only given dout for output at final timestep, so prepend empty douts for all other timesteps
+    dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout)  # shape (N, T*M)
+  }
+
+  t = T
+  for (iter in 1:T) {  # each timestep in reverse order
+    X_t = X[,(t-1)*D+1:t*D]  # shape (N, D)
+    dout_t = dout[,(t-1)*M+1:t*M]  # shape (N, M)
+    out_t = matrix(cache_out[t,], rows=N, cols=M)  # shape (N, M)
+    if (t == 1) {
+      out_prev = out0  # shape (N, M)
+    }
+    else {
+      out_prev = matrix(cache_out[t-1,], rows=N, cols=M)  # shape (N, M)
+    }
+    input = cbind(X_t, out_prev)  # shape (N, D+M)
+    dout_t_raw = (1-out_t^2) * dout_t  # into tanh, shape (N, M)
+    dW = dW + t(input) %*% dout_t_raw  # shape (D+M, M)
+    db = db + colSums(dout_t_raw)  # shape (1, M)
+    dinput = dout_t_raw %*% t(W)  # shape (N, D+M)
+    dX[,(t-1)*D+1:t*D] = dinput[,1:D]
+    dout_prev = dinput[,D+1:D+M]  # shape (N, M)
+    if (t == 1) {
+      dout0 = dout_prev  # shape (N, M)
+    }
+    else {
+      dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev  # shape (N, M)
+    }
+    t = t - 1
+  }
+}
+
+init = function(int N, int D, int M)
+    return (matrix[double] W, matrix[double] b, matrix[double] out0) {
+  /*
+   * Initialize the parameters of this layer.
+   *
+   * Note: This is just a convenience function, and parameters
+   * may be initialized manually if needed.
+   *
+   * We use the Glorot uniform heuristic which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
+   *  - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
+   *
+   * Inputs:
+   *  - N: Number of examples in batch.
+   *  - D: Dimensionality of the input features (number of features).
+   *  - M: Number of neurons in this layer.
+   *
+   * Outputs:
+   *  - W: Weights, of shape (D+M, M).
+   *  - b: Biases, of shape (1, M).
+   *  - out0: Empty previous timestep output matrix, of shape (N, M).
+   */
+  fan_in = D+M
+  fan_out = M
+  scale = sqrt(6/(fan_in+fan_out))
+  W = rand(rows=D+M, cols=M, min=-scale, max=scale, pdf="uniform")
+  b = matrix(0, rows=1, cols=M)
+  out0 = matrix(0, rows=N, cols=M)
+}
+
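
A minimal forward-pass sketch, mirroring the LSTM example above (illustrative dimensions):

  source("nn/layers/rnn.dml") as rnn

  N = 4; T = 10; D = 16; M = 32
  X = rand(rows=N, cols=T*D, min=-1, max=1, pdf="uniform")
  [W, b, out0] = rnn::init(N, D, M)
  [out, cache_out] = rnn::forward(X, W, b, T, D, FALSE, out0)  # final timestep only, shape (N, M)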

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/scale_shift1d.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/scale_shift1d.dml b/scripts/nn/layers/scale_shift1d.dml
new file mode 100644
index 0000000..7e162a3
--- /dev/null
+++ b/scripts/nn/layers/scale_shift1d.dml
@@ -0,0 +1,95 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 1D Scale & Shift layer.
+ */
+
+forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta)
+    return (matrix[double] out) {
+  /*
+   * Computes the forward pass for a 1D scale & shift layer. The input
+   * data has N examples, each with D features.
+   *
+   * A 1D scale & shift layer introduces learnable parameters
+   * (gamma, beta) to scale and shift the input on a per-feature basis.
+   *
+   *   `y = x*gamma + beta`
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, D).
+   *  - gamma: Scale parameters, of shape (1, D).
+   *  - beta: Shift parameters, of shape (1, D).
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, D).
+   */
+  # Scale and shift
+  out = X*gamma + beta  # shape (N, D)
+}
+
+backward = function(matrix[double] dout, matrix[double] out,
+                    matrix[double] X, matrix[double] gamma, matrix[double] beta)
+      return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
+  /*
+   * Computes the backward pass for a 1D scale & shift layer.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of shape (N, D).
+   *  - out: Outputs from the forward pass, of shape (N, D).
+   *  - X: Inputs, of shape (N, D).
+   *  - gamma: Scale parameters, of shape (1, D).
+   *  - beta: Shift parameters, of shape (1, D).
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, D).
+   *  - dgamma: Gradient wrt `gamma`, of shape (1, D).
+   *  - dbeta: Gradient wrt `beta`, of shape (1, D).
+   *
+   */
+  # Compute gradients during training
+  dgamma = colSums(dout*X)  # shape (1, D)
+  dbeta = colSums(dout)  # shape (1, D)
+  dX = dout * gamma  # shape (N, D)
+}
+
+init = function(int D)
+    return (matrix[double] gamma, matrix[double] beta) {
+  /*
+   * Initialize the parameters of this layer.
+   *
+   * By default, we initialize to an identity function, with a scale
+   * filler of `1`, and a shift filler of `0`.
+   *
+   * Note: This is just a convenience function, and parameters
+   * may be initialized manually if needed.
+   *
+   * Inputs:
+   *  - D: Dimensionality of the input features (number of features).
+   *
+   * Outputs:
+   *  - gamma: Scale parameters, of shape (1, D).
+   *  - beta: Shift parameters, of shape (1, D).
+   */
+   gamma = matrix(1, rows=1, cols=D)
+   beta = matrix(0, rows=1, cols=D)
+}
+
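
With the default initialization the layer is an identity function; a short sketch (illustrative data):

  source("nn/layers/scale_shift1d.dml") as scale_shift1d

  N = 8; D = 5
  X = rand(rows=N, cols=D, min=-1, max=1, pdf="uniform")
  [gamma, beta] = scale_shift1d::init(D)        # gamma = 1, beta = 0
  out = scale_shift1d::forward(X, gamma, beta)  # equal to X until gamma/beta are learned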

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/scale_shift2d.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/scale_shift2d.dml b/scripts/nn/layers/scale_shift2d.dml
new file mode 100644
index 0000000..79c884a
--- /dev/null
+++ b/scripts/nn/layers/scale_shift2d.dml
@@ -0,0 +1,107 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Scale & Shift layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
+                   int C, int Hin, int Win)
+    return (matrix[double] out) {
+  /*
+   * Computes the forward pass for a 2D scale & shift layer.  The input
+   * data has N examples, each represented as a 3D volume unrolled into
+   * a single vector.
+   *
+   * A 2D scale & shift layer introduces learnable parameters
+   * (gamma, beta) to scale and shift the input on a per-channel basis.
+   *
+   *   `y = x*gamma + beta`
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - gamma: Scale parameters, of shape (C, 1).
+   *  - beta: Shift parameters, of shape (C, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, C*Hin*Win).
+   */
+  # Scale and shift
+  scaled = bias_multiply(X, gamma)  # shape (N, C*Hin*Win)
+  out = bias_add(scaled, beta)  # shape (N, C*Hin*Win)
+}
+
+backward = function(matrix[double] dout, matrix[double] out,
+                    matrix[double] X, matrix[double] gamma, matrix[double] beta,
+                    int C, int Hin, int Win)
+      return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
+  /*
+   * Computes the backward pass for a 2D scale & shift layer.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
+   *  - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
+   *  - X: Input data matrix to the forward pass, of
+   *      shape (N, C*Hin*Win).
+   *  - gamma: Scale parameters, of shape (C, 1).
+   *  - beta: Shift parameters, of shape (C, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dgamma: Gradient wrt `gamma`, of shape (C, 1).
+   *  - dbeta: Gradient wrt `beta`, of shape (C, 1).
+   *
+   */
+  # Compute gradients during training
+  dgamma = util::channel_sums(dout*X, C, Hin, Win)  # shape (C, 1)
+  dbeta = util::channel_sums(dout, C, Hin, Win)  # shape (C, 1)
+  dX = bias_multiply(dout, gamma)  # shape (N, C*Hin*Win)
+}
+
+init = function(int C)
+    return (matrix[double] gamma, matrix[double] beta) {
+  /*
+   * Initialize the parameters of this layer.
+   *
+   * By default, we initialize to an identity function, with a scale
+   * filler of `1`, and a shift filler of `0`.
+   *
+   * Note: This is just a convenience function, and parameters
+   * may be initialized manually if needed.
+   *
+   * Inputs:
+   *  - C: Number of input channels (dimensionality of input depth).
+   *
+   * Outputs:
+   *  - gamma: Scale parameters, of shape (C, 1).
+   *  - beta: Shift parameters, of shape (C, 1).
+   */
+   gamma = matrix(1, rows=C, cols=1)
+   beta = matrix(0, rows=C, cols=1)
+}
+
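
The 2D variant applies one (gamma, beta) pair per channel; a short sketch (illustrative dimensions):

  source("nn/layers/scale_shift2d.dml") as scale_shift2d

  N = 2; C = 3; Hin = 4; Win = 4
  X = rand(rows=N, cols=C*Hin*Win, min=-1, max=1, pdf="uniform")
  [gamma, beta] = scale_shift2d::init(C)
  out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)  # per-channel scale and shift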

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/sigmoid.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/sigmoid.dml b/scripts/nn/layers/sigmoid.dml
new file mode 100644
index 0000000..2d85adc
--- /dev/null
+++ b/scripts/nn/layers/sigmoid.dml
@@ -0,0 +1,62 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Sigmoid nonlinearity layer.
+ */
+
+forward = function(matrix[double] X)
+    return (matrix[double] out) {
+  /*
+   * Computes the forward pass for a sigmoid nonlinearity layer.
+   *
+   *   `sigmoid(x) = 1 / (1 + e^-x)`
+   *
+   * If `X` contains a single feature column, the output of a sigmoid
+   * layer can be interpreted as a predicted probability of a true
+   * class when paired with a log loss function in a binary
+   * classification problem.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (any, any).
+   *
+   * Outputs:
+   *  - out: Outputs, of same shape as `X`.
+   */
+  out = 1 / (1+exp(-X))
+}
+
+backward = function(matrix[double] dout, matrix[double] X)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for a sigmoid nonlinearity layer.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+   *  - X: Inputs, of shape (any, any).
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
+   */
+  out = 1 / (1+exp(-X))
+  dX = out * (1-out) * dout
+}
+