Posted to commits@systemml.apache.org by du...@apache.org on 2017/04/26 21:42:34 UTC

[08/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn` library to `scripts/nn`
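
Every check in this file follows the same recipe: perturb one entry of an input
or parameter by +/- h, re-run the forward pass, form the centered-difference
estimate (lossph - lossmh) / (2*h), and compare it against the analytical
gradient returned by the layer's backward function. As a minimal, self-contained
DML sketch of that recipe (not taken from the commit; it uses sum(X^2) as a
stand-in loss and a plain relative-error formula rather than
test_util::check_rel_grad_error, whose exact convention may differ):

    X = rand(rows=2, cols=2)
    dX = 2 * X                          # analytical gradient of sum(X^2)
    h = 1e-5
    old = as.scalar(X[1,1])
    X[1,1] = old - h
    lossmh = sum(X^2)                   # f(x - h)
    X[1,1] = old + h
    lossph = sum(X^2)                   # f(x + h)
    X[1,1] = old                        # reset
    dX_num = (lossph-lossmh) / (2*h)    # centered-difference estimate
    # one common relative-error convention; check_rel_grad_error may differ in detail
    rel_error = abs(as.scalar(dX[1,1])-dX_num) / max(1e-8, abs(as.scalar(dX[1,1]))+abs(dX_num))
    print("relative error: " + rel_error)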

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/grad_check.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/grad_check.dml b/scripts/nn/test/grad_check.dml
new file mode 100644
index 0000000..f3bc9a7
--- /dev/null
+++ b/scripts/nn/test/grad_check.dml
@@ -0,0 +1,1769 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Gradient checks for various architectures.
+ */
+source("nn/layers/affine.dml") as affine
+source("nn/layers/batch_norm1d.dml") as batch_norm1d
+source("nn/layers/batch_norm2d.dml") as batch_norm2d
+source("nn/layers/conv2d.dml") as conv2d
+source("nn/layers/conv2d_builtin.dml") as conv2d_builtin
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/dropout.dml") as dropout
+source("nn/layers/l1_loss.dml") as l1_loss
+source("nn/layers/l1_reg.dml") as l1_reg
+source("nn/layers/l2_loss.dml") as l2_loss
+source("nn/layers/l2_reg.dml") as l2_reg
+source("nn/layers/log_loss.dml") as log_loss
+source("nn/layers/lstm.dml") as lstm
+source("nn/layers/max_pool2d.dml") as max_pool2d
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
+source("nn/layers/relu.dml") as relu
+source("nn/layers/rnn.dml") as rnn
+source("nn/layers/scale_shift1d.dml") as scale_shift1d
+source("nn/layers/scale_shift2d.dml") as scale_shift2d
+source("nn/layers/sigmoid.dml") as sigmoid
+source("nn/layers/softmax.dml") as softmax
+source("nn/layers/tanh.dml") as tanh
+source("nn/test/conv2d_simple.dml") as conv2d_simple
+source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
+source("nn/test/util.dml") as test_util
+
+affine = function() {
+  /*
+   * Gradient check for the affine layer.
+   */
+  print("Grad checking the affine layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  D = 100 # num features
+  M = 10 # num neurons
+  X = rand(rows=N, cols=D)
+  y = rand(rows=N, cols=M)
+  [W, b] = affine::init(D, M)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = affine::forward(X, W, b)
+  dout = l2_loss::backward(out, y)
+  [dX, dW, db] = affine::backward(dout, X, W, b)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = affine::forward(X, W, b)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = affine::forward(X, W, b)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W.")
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      outmh = affine::forward(X, W, b)
+      lossmh = l2_loss::forward(outmh, y)
+      W[i,j] = old + h
+      outph = affine::forward(X, W, b)
+      lossph = l2_loss::forward(outph, y)
+      W[i,j] = old  # reset
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b.")
+  for (i in 1:nrow(b)) {
+    for (j in 1:ncol(b)) {
+      # Compute numerical derivative
+      old = as.scalar(b[i,j])
+      b[i,j] = old - h
+      outmh = affine::forward(X, W, b)
+      lossmh = l2_loss::forward(outmh, y)
+      b[i,j] = old + h
+      outph = affine::forward(X, W, b)
+      lossph = l2_loss::forward(outph, y)
+      b[i,j] = old  # reset
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+    }
+  }
+}
+
+batch_norm1d = function() {
+  /*
+   * Gradient check for the 1D batch normalization layer.
+   */
+  print("Grad checking the 1D batch normalization layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  D = 100 # num features
+  mu = 0.9  # momentum
+  eps = 1e-5  # epsilon
+  X = rand(rows=N, cols=D)
+  y = rand(rows=N, cols=D)
+  gamma = rand(rows=1, cols=D)
+  beta = rand(rows=1, cols=D)
+  ema_mean = rand(rows=1, cols=D)
+  ema_var = rand(rows=1, cols=D)
+  #[dummy, dummy, ema_mean, ema_var] = batch_norm1d::init(D)
+
+  # Check training & testing modes
+  for (i in 1:2) {
+    if (i == 1)
+      mode = 'train'
+    else
+      mode = 'test'
+    print(" - Grad checking the '"+mode+"' mode.")
+
+    # Compute analytical gradients of loss wrt parameters
+    [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+        batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+    dout = l2_loss::backward(out, y)
+    [dX, dgamma, dbeta] = batch_norm1d::backward(dout, out, ema_mean_upd, ema_var_upd,
+                                                 cache_mean, cache_var, cache_norm,
+                                                 X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+
+    # Grad check
+    h = 1e-5
+    print("   - Grad checking X.")
+    for (i in 1:nrow(X)) {
+      for (j in 1:ncol(X)) {
+        # Compute numerical derivative
+        old = as.scalar(X[i,j])
+        X[i,j] = old - h
+        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+        lossmh = l2_loss::forward(outmh, y)
+        X[i,j] = old + h
+        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+        lossph = l2_loss::forward(outph, y)
+        X[i,j] = old  # reset
+        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      }
+    }
+
+    print("   - Grad checking gamma.")
+    for (i in 1:nrow(gamma)) {
+      for (j in 1:ncol(gamma)) {
+        # Compute numerical derivative
+        old = as.scalar(gamma[i,j])
+        gamma[i,j] = old - h
+        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+        lossmh = l2_loss::forward(outmh, y)
+        gamma[i,j] = old + h
+        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+        lossph = l2_loss::forward(outph, y)
+        gamma[i,j] = old  # reset
+        dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+                                                    lossph, lossmh)
+      }
+    }
+
+    print("   - Grad checking beta.")
+    for (i in 1:nrow(beta)) {
+      for (j in 1:ncol(beta)) {
+        # Compute numerical derivative
+        old = as.scalar(beta[i,j])
+        beta[i,j] = old - h
+        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+        lossmh = l2_loss::forward(outmh, y)
+        beta[i,j] = old + h
+        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+        lossph = l2_loss::forward(outph, y)
+        beta[i,j] = old  # reset
+        dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+                                                    lossph, lossmh)
+      }
+    }
+  }
+}
+
+batch_norm2d = function() {
+  /*
+   * Gradient check for the 2D (spatial) batch normalization layer.
+   */
+  print("Grad checking the 2D (spatial) batch normalization layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  C = 2  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  mu = 0.9  # momentum
+  eps = 1e-5  # epsilon
+  X = rand(rows=N, cols=C*Hin*Win)
+  y = rand(rows=N, cols=C*Hin*Win)
+  gamma = rand(rows=C, cols=1)
+  beta = rand(rows=C, cols=1)
+  ema_mean = rand(rows=C, cols=1)
+  ema_var = rand(rows=C, cols=1)
+  #[dummy, dummy, ema_mean, ema_var] = batch_norm2d::init(C)
+
+  # Check training & testing modes
+  for (i in 1:2) {
+    if (i == 1)
+      mode = 'train'
+    else
+      mode = 'test'
+    print(" - Grad checking the '"+mode+"' mode.")
+
+    # Compute analytical gradients of loss wrt parameters
+    [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+        batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+    dout = l2_loss::backward(out, y)
+    [dX, dgamma, dbeta] = batch_norm2d::backward(dout, out, ema_mean_upd, ema_var_upd,
+                                                 cache_mean, cache_var, cache_norm,
+                                                 X, gamma, beta, C, Hin, Win, mode,
+                                                 ema_mean, ema_var, mu, eps)
+
+    # Grad check
+    h = 1e-5
+    print("   - Grad checking X.")
+    for (i in 1:nrow(X)) {
+      for (j in 1:ncol(X)) {
+        # Compute numerical derivative
+        old = as.scalar(X[i,j])
+        X[i,j] = old - h
+        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+        lossmh = l2_loss::forward(outmh, y)
+        X[i,j] = old + h
+        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+        lossph = l2_loss::forward(outph, y)
+        X[i,j] = old  # reset
+        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      }
+    }
+
+    print("   - Grad checking gamma.")
+    for (i in 1:nrow(gamma)) {
+      for (j in 1:ncol(gamma)) {
+        # Compute numerical derivative
+        old = as.scalar(gamma[i,j])
+        gamma[i,j] = old - h
+        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+        lossmh = l2_loss::forward(outmh, y)
+        gamma[i,j] = old + h
+        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+        lossph = l2_loss::forward(outph, y)
+        gamma[i,j] = old  # reset
+        dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+                                                    lossph, lossmh)
+      }
+    }
+
+    print("   - Grad checking beta.")
+    for (i in 1:nrow(beta)) {
+      for (j in 1:ncol(beta)) {
+        # Compute numerical derivative
+        old = as.scalar(beta[i,j])
+        beta[i,j] = old - h
+        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+        lossmh = l2_loss::forward(outmh, y)
+        beta[i,j] = old + h
+        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+        lossph = l2_loss::forward(outph, y)
+        beta[i,j] = old  # reset
+        dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+                                                    lossph, lossmh)
+      }
+    }
+  }
+}
+
+conv2d = function() {
+  /*
+   * Gradient check for the 2D convolutional layer using `im2col`.
+   */
+  print("Grad checking the `im2col` 2D convolutional layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  F = 2  # num filters
+  Hf = 3  # filter height
+  Wf = 3  # filter width
+  stride = 1
+  pad = 1
+  X = rand(rows=N, cols=C*Hin*Win)
+  y = rand(rows=N, cols=F*Hin*Win)
+
+  # Create layers
+  [W, b] = conv2d::init(F, C, Hf, Wf)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  dout = l2_loss::backward(out, y)
+  [dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                  pad, pad)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W.")
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      W[i,j] = old + h
+      [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      W[i,j] = old  # reset
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b.")
+  for (i in 1:nrow(b)) {
+    for (j in 1:ncol(b)) {
+      # Compute numerical derivative
+      old = as.scalar(b[i,j])
+      b[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      b[i,j] = old + h
+      [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      b[i,j] = old  # reset
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+    }
+  }
+}
+
+conv2d_builtin = function() {
+  /*
+   * Gradient check for the 2D convolutional layer using built-in
+   * functions.
+   */
+  print("Grad checking the built-in 2D convolutional layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  F = 2  # num filters
+  Hf = 3  # filter height
+  Wf = 3  # filter width
+  stride = 1
+  pad = 1
+  X = rand(rows=N, cols=C*Hin*Win)
+  y = rand(rows=N, cols=F*Hin*Win)
+
+  # Create layers
+  [W, b] = conv2d_builtin::init(F, C, Hf, Wf)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                              pad, pad)
+  dout = l2_loss::backward(out, y)
+  [dX, dW, db] = conv2d_builtin::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
+                                          stride, stride, pad, pad)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W.")
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      W[i,j] = old + h
+      [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      W[i,j] = old  # reset
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b.")
+  for (i in 1:nrow(b)) {
+    for (j in 1:ncol(b)) {
+      # Compute numerical derivative
+      old = as.scalar(b[i,j])
+      b[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      b[i,j] = old + h
+      [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      b[i,j] = old  # reset
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+    }
+  }
+}
+
+conv2d_simple = function() {
+  /*
+   * Gradient check for the simple reference 2D convolutional layer.
+   */
+  print("Grad checking the simple reference 2D convolutional layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  F = 2  # num filters
+  Hf = 3  # filter height
+  Wf = 3  # filter width
+  stride = 1
+  pad = 1
+  X = rand(rows=N, cols=C*Hin*Win)
+  y = rand(rows=N, cols=F*Hin*Win)
+
+  # Create layers
+  [W, b] = conv2d_simple::init(F, C, Hf, Wf)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  dout = l2_loss::backward(out, y)
+  [dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
+                                         stride, stride, pad, pad)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W.")
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      W[i,j] = old + h
+      [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      W[i,j] = old  # reset
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b.")
+  for (i in 1:nrow(b)) {
+    for (j in 1:ncol(b)) {
+      # Compute numerical derivative
+      old = as.scalar(b[i,j])
+      b[i,j] = old - h
+      [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      b[i,j] = old + h
+      [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+                                                   pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      b[i,j] = old  # reset
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+    }
+  }
+}
+
+cross_entropy_loss = function() {
+  /*
+   * Gradient check for the cross-entropy loss function.
+   */
+  print("Grad checking the cross-entropy loss function.")
+
+  # Generate data
+  N = 3 # num examples
+  K = 10 # num targets
+  pred = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
+  pred = pred / rowSums(pred)  # normalized probs
+  y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
+  y = y / rowSums(y)  # normalized probs
+
+  # Compute analytical gradient
+  dpred = cross_entropy_loss::backward(pred, y)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(pred)) {
+    for (j in 1:ncol(pred)) {
+      # Compute numerical derivative
+      old = as.scalar(pred[i,j])
+      pred[i,j] = old - h
+      lossmh = cross_entropy_loss::forward(pred, y)
+      pred[i,j] = old + h
+      lossph = cross_entropy_loss::forward(pred, y)
+      pred[i,j] = old  # reset pred[i,j]
+      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+    }
+  }
+}
+
+dropout = function() {
+  /*
+   * Gradient check for the (inverted) dropout layer.
+   */
+  print("Grad checking the (inverted) dropout layer with L2 loss.")
+
+  # Generate data
+  N = 3  # num examples
+  M = 100  # num neurons
+  p = 0.5  # probability of dropping neuron output
+  seed = as.integer(floor(as.scalar(rand(rows=1, cols=1, min=1, max=100000))))  # random seed
+  X = rand(rows=N, cols=M)
+  y = rand(rows=N, cols=M)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, mask] = dropout::forward(X, p, seed)
+  dout = l2_loss::backward(out, y)
+  dX = dropout::backward(dout, X, p, mask)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, mask] = dropout::forward(X, p, seed)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      [outph, mask] = dropout::forward(X, p, seed)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
+l1_loss = function() {
+  /*
+   * Gradient check for the L1 loss function.
+   */
+  print("Grad checking the L1 loss function.")
+
+  # Generate data
+  N = 3 # num examples
+  D = 2 # num targets
+  pred = rand(rows=N, cols=D)
+  y = rand(rows=N, cols=D)
+
+  # Compute analytical gradient
+  dpred = l1_loss::backward(pred, y)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(pred)) {
+    for (j in 1:ncol(pred)) {
+      # Compute numerical derivative
+      old = as.scalar(pred[i,j])
+      pred[i,j] = old - h
+      lossmh = l1_loss::forward(pred, y)
+      pred[i,j] = old + h
+      lossph = l1_loss::forward(pred, y)
+      pred[i,j] = old  # reset pred[i,j]
+      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+    }
+  }
+}
+
+l1_reg = function() {
+  /*
+   * Gradient check for the L1 regularization function.
+   */
+  print("Grad checking the L1 regularization function.")
+
+  # Generate data
+  D = 5 # num features
+  M = 3 # num neurons
+  lambda = 0.01
+  W = rand(rows=D, cols=M)
+
+  # Compute analytical gradient
+  dW = l1_reg::backward(W, lambda)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      reg_lossmh = l1_reg::forward(W, lambda)
+      W[i,j] = old + h
+      reg_lossph = l1_reg::forward(W, lambda)
+      W[i,j] = old  # reset W[i,j]
+      dW_num = (reg_lossph-reg_lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
+                                                  reg_lossph, reg_lossmh)
+    }
+  }
+}
+
+l2_loss = function() {
+  /*
+   * Gradient check for the L2 loss function.
+   */
+  print("Grad checking the L2 loss function.")
+
+  # Generate data
+  N = 3 # num examples
+  D = 2 # num targets
+  pred = rand(rows=N, cols=D)
+  y = rand(rows=N, cols=D)
+
+  # Compute analytical gradient
+  dpred = l2_loss::backward(pred, y)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(pred)) {
+    for (j in 1:ncol(pred)) {
+      # Compute numerical derivative
+      old = as.scalar(pred[i,j])
+      pred[i,j] = old - h
+      lossmh = l2_loss::forward(pred, y)
+      pred[i,j] = old + h
+      lossph = l2_loss::forward(pred, y)
+      pred[i,j] = old  # reset pred[i,j]
+      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+    }
+  }
+}
+
+l2_reg = function() {
+  /*
+   * Gradient check for the L2 regularization function.
+   */
+  print("Grad checking the L2 regularization function.")
+
+  # Generate data
+  D = 5 # num features
+  M = 3 # num neurons
+  lambda = 0.01
+  W = rand(rows=D, cols=M)
+
+  # Compute analytical gradient
+  dW = l2_reg::backward(W, lambda)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      reg_lossmh = l2_reg::forward(W, lambda)
+      W[i,j] = old + h
+      reg_lossph = l2_reg::forward(W, lambda)
+      W[i,j] = old  # reset W[i,j]
+      dW_num = (reg_lossph-reg_lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
+                                                  reg_lossph, reg_lossmh)
+    }
+  }
+}
+
+log_loss = function() {
+  /*
+   * Gradient check for the log loss function.
+   */
+  print("Grad checking the log loss function.")
+
+  # Generate data
+  N = 20 # num examples
+  D = 1 # num targets
+  pred = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
+  y = round(rand(rows=N, cols=D, min=0, max=1, pdf="uniform"))
+
+  # Compute analytical gradient
+  dpred = log_loss::backward(pred, y)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(pred)) {
+    for (j in 1:ncol(pred)) {
+      # Compute numerical derivative
+      old = as.scalar(pred[i,j])
+      pred[i,j] = old - h
+      lossmh = log_loss::forward(pred, y)
+      pred[i,j] = old + h
+      lossph = log_loss::forward(pred, y)
+      pred[i,j] = old  # reset pred[i,j]
+      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+    }
+  }
+}
+
+lstm = function() {
+  /*
+   * Gradient check for the LSTM layer.
+   */
+  print("Grad checking the LSTM layer with L2 loss.")
+
+  # Generate data
+  N = 3  # num examples
+  D = 10  # num features
+  T = 15  # num timesteps (sequence length)
+  M = 5 # num neurons
+  return_seq = TRUE
+  X = rand(rows=N, cols=T*D)
+  y = rand(rows=N, cols=T*M)
+  yc = rand(rows=N, cols=M)
+  out0 = rand(rows=N, cols=M)
+  c0 = rand(rows=N, cols=M)
+  [W, b, dummy, dummy2] = lstm::init(N, D, M)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+  dout = l2_loss::backward(out, y)
+  dc = l2_loss::backward(c, yc)
+  [dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, return_seq, out0, c0,
+                                            cache_out, cache_c, cache_ifog)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outmh = l2_loss::forward(outmh, y)
+      loss_cmh = l2_loss::forward(cmh, yc)
+      lossmh = loss_outmh + loss_cmh
+      X[i,j] = old + h
+      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outph = l2_loss::forward(outph, y)
+      loss_cph = l2_loss::forward(cph, yc)
+      lossph = loss_outph + loss_cph
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W.")
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outmh = l2_loss::forward(outmh, y)
+      loss_cmh = l2_loss::forward(cmh, yc)
+      lossmh = loss_outmh + loss_cmh
+      W[i,j] = old + h
+      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outph = l2_loss::forward(outph, y)
+      loss_cph = l2_loss::forward(cph, yc)
+      lossph = loss_outph + loss_cph
+      W[i,j] = old  # reset
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b.")
+  for (i in 1:nrow(b)) {
+    for (j in 1:ncol(b)) {
+      # Compute numerical derivative
+      old = as.scalar(b[i,j])
+      b[i,j] = old - h
+      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outmh = l2_loss::forward(outmh, y)
+      loss_cmh = l2_loss::forward(cmh, yc)
+      lossmh = loss_outmh + loss_cmh
+      b[i,j] = old + h
+      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outph = l2_loss::forward(outph, y)
+      loss_cph = l2_loss::forward(cph, yc)
+      lossph = loss_outph + loss_cph
+      b[i,j] = old  # reset
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking out0.")
+  for (i in 1:nrow(out0)) {
+    for (j in 1:ncol(out0)) {
+      # Compute numerical derivative
+      old = as.scalar(out0[i,j])
+      out0[i,j] = old - h
+      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outmh = l2_loss::forward(outmh, y)
+      loss_cmh = l2_loss::forward(cmh, yc)
+      lossmh = loss_outmh + loss_cmh
+      out0[i,j] = old + h
+      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outph = l2_loss::forward(outph, y)
+      loss_cph = l2_loss::forward(cph, yc)
+      lossph = loss_outph + loss_cph
+      out0[i,j] = old  # reset
+      dout0_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking c0.")
+  for (i in 1:nrow(c0)) {
+    for (j in 1:ncol(c0)) {
+      # Compute numerical derivative
+      old = as.scalar(c0[i,j])
+      c0[i,j] = old - h
+      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outmh = l2_loss::forward(outmh, y)
+      loss_cmh = l2_loss::forward(cmh, yc)
+      lossmh = loss_outmh + loss_cmh
+      c0[i,j] = old + h
+      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+      loss_outph = l2_loss::forward(outph, y)
+      loss_cph = l2_loss::forward(cph, yc)
+      lossph = loss_outph + loss_cph
+      c0[i,j] = old  # reset
+      dc0_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh)
+    }
+  }
+}
+
+max_pool2d = function() {
+  /*
+   * Gradient check for the 2D max pooling layer.
+   */
+  print("Grad checking the 2D max pooling layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 4  # input height
+  Win = 4  # input width
+  Hf = 2  # pool filter height
+  Wf = 2  # pool filter width
+  stride = 2
+  X = rand(rows=N, cols=C*Hin*Win)
+
+  for (pad in 0:1) {
+    print(" - Grad checking w/ pad="+pad+".")
+    Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
+    Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
+    y = rand(rows=N, cols=C*Hout*Wout)
+
+    # Compute analytical gradients of loss wrt parameters
+    [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+    dout = l2_loss::backward(out, y)
+    dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+
+    # Grad check
+    h = 1e-5
+    for (i in 1:nrow(X)) {
+      for (j in 1:ncol(X)) {
+        # Compute numerical derivative
+        old = as.scalar(X[i,j])
+        X[i,j] = old - h
+        [outmh, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+        lossmh = l2_loss::forward(outmh, y)
+        X[i,j] = old + h
+        [outph, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+        lossph = l2_loss::forward(outph, y)
+        X[i,j] = old  # reset
+        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      }
+    }
+  }
+}
+
+max_pool2d_builtin = function() {
+  /*
+   * Gradient check for the 2D max pooling layer using built-in functions.
+   */
+  print("Grad checking the built-in 2D max pooling layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 4  # input height
+  Win = 4  # input width
+  Hf = 2  # pool filter height
+  Wf = 2  # pool filter width
+  stride = 2
+  X = rand(rows=N, cols=C*Hin*Win)
+
+  for (pad in 0:1) {
+    print(" - Grad checking w/ pad="+pad+".")
+    Hout = as.integer(floor((Hin + 2 * pad - Hf) / stride + 1))
+    Wout = as.integer(floor((Win + 2 * pad - Wf) / stride + 1))
+    y = rand(rows=N, cols=C*Hout*Wout)
+
+    # Compute analytical gradients of loss wrt parameters
+    [out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+                                                    pad, pad)
+    dout = l2_loss::backward(out, y)
+    dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
+                                      pad, pad)
+
+    # Grad check
+    h = 1e-5
+    for (i in 1:nrow(X)) {
+      for (j in 1:ncol(X)) {
+        # Compute numerical derivative
+        old = as.scalar(X[i,j])
+        X[i,j] = old - h
+        [outmh, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+                                                          pad, pad)
+        lossmh = l2_loss::forward(outmh, y)
+        X[i,j] = old + h
+        [outph, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+                                                          pad, pad)
+        lossph = l2_loss::forward(outph, y)
+        X[i,j] = old  # reset
+        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      }
+    }
+  }
+}
+
+max_pool2d_simple = function() {
+  /*
+   * Gradient check for the simple reference 2D max pooling layer.
+   */
+  print("Grad checking the simple reference 2D max pooling layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 4  # input height
+  Win = 4  # input width
+  Hf = 2  # pool filter height
+  Wf = 2  # pool filter width
+  stride = 2
+  X = rand(rows=N, cols=C*Hin*Win)
+
+  for (pad in 0:1) {
+    print(" - Grad checking w/ pad="+pad+".")
+    Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
+    Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
+    y = rand(rows=N, cols=C*Hout*Wout)
+
+    # Compute analytical gradients of loss wrt parameters
+    [out, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+    dout = l2_loss::backward(out, y)
+    dX = max_pool2d_simple::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
+                                     pad, pad)
+
+    # Grad check
+    h = 1e-5
+    for (i in 1:nrow(X)) {
+      for (j in 1:ncol(X)) {
+        # Compute numerical derivative
+        old = as.scalar(X[i,j])
+        X[i,j] = old - h
+        [outmh, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+                                                         pad, pad)
+        lossmh = l2_loss::forward(outmh, y)
+        X[i,j] = old + h
+        [outph, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+                                                         pad, pad)
+        lossph = l2_loss::forward(outph, y)
+        X[i,j] = old  # reset
+        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      }
+    }
+  }
+}
+
+relu = function() {
+  /*
+   * Gradient check for the ReLU nonlinearity layer.
+   *
+   * NOTE: This could result in a false-negative in which the test
+   * fails due to a kink being crossed in the nonlinearity.  This
+   * occurs when the tests, f(x-h) and f(x+h), end up on opposite
+   * sides of the zero threshold of max(0, fx).  For now, just run
+   * the tests again.  In the future, we can explicitly check for
+   * this and rerun the test automatically.
+   */
+  print("Grad checking the ReLU nonlinearity layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  M = 10 # num neurons
+  X = rand(rows=N, cols=M, min=-5, max=5)
+  y = rand(rows=N, cols=M)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = relu::forward(X)
+  dout = l2_loss::backward(out, y)
+  dX = relu::backward(dout, X)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = relu::forward(X)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = relu::forward(X)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
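
A hypothetical numeric illustration of the kink issue noted in the doc comment
of `relu` above (not part of the commit): when an entry of X lies within h of
zero, f(x-h) and f(x+h) land on opposite sides of max(0, x), so the centered
difference approximates neither of the two one-sided slopes:

    x = 3e-6                                     # assume a value closer to zero than h
    h = 1e-5
    num = (max(0, x+h) - max(0, x-h)) / (2*h)    # ~0.65, i.e. neither 0 nor 1
    print("centered-difference slope at the kink: " + num)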
+
+rnn = function() {
+  /*
+   * Gradient check for the simple RNN layer.
+   */
+  print("Grad checking the simple RNN layer with L2 loss.")
+
+  # Generate data
+  N = 3  # num examples
+  D = 10  # num features
+  T = 15  # num timesteps (sequence length)
+  M = 5 # num neurons
+  return_seq = TRUE
+  X = rand(rows=N, cols=T*D)
+  y = rand(rows=N, cols=T*M)
+  out0 = rand(rows=N, cols=M)
+  [W, b, dummy] = rnn::init(N, D, M)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+  dout = l2_loss::backward(out, y)
+  [dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, return_seq, out0, cache_out)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W.")
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+      lossmh = l2_loss::forward(outmh, y)
+      W[i,j] = old + h
+      [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+      lossph = l2_loss::forward(outph, y)
+      W[i,j] = old  # reset
+      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b.")
+  for (i in 1:nrow(b)) {
+    for (j in 1:ncol(b)) {
+      # Compute numerical derivative
+      old = as.scalar(b[i,j])
+      b[i,j] = old - h
+      [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+      lossmh = l2_loss::forward(outmh, y)
+      b[i,j] = old + h
+      [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+      lossph = l2_loss::forward(outph, y)
+      b[i,j] = old  # reset
+      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking out0.")
+  for (i in 1:nrow(out0)) {
+    for (j in 1:ncol(out0)) {
+      # Compute numerical derivative
+      old = as.scalar(out0[i,j])
+      out0[i,j] = old - h
+      [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+      lossmh = l2_loss::forward(outmh, y)
+      out0[i,j] = old + h
+      [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+      lossph = l2_loss::forward(outph, y)
+      out0[i,j] = old  # reset
+      dout0_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
+    }
+  }
+}
+
+scale_shift1d = function() {
+  /*
+   * Gradient check for the 1D scale & shift layer.
+   */
+  print("Grad checking the 1D scale & shift layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  D = 100 # num features
+  X = rand(rows=N, cols=D)
+  y = rand(rows=N, cols=D)
+  [gamma, beta] = scale_shift1d::init(D)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = scale_shift1d::forward(X, gamma, beta)
+  dout = l2_loss::backward(out, y)
+  [dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = scale_shift1d::forward(X, gamma, beta)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = scale_shift1d::forward(X, gamma, beta)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking gamma.")
+  for (i in 1:nrow(gamma)) {
+    for (j in 1:ncol(gamma)) {
+      # Compute numerical derivative
+      old = as.scalar(gamma[i,j])
+      gamma[i,j] = old - h
+      outmh = scale_shift1d::forward(X, gamma, beta)
+      lossmh = l2_loss::forward(outmh, y)
+      gamma[i,j] = old + h
+      outph = scale_shift1d::forward(X, gamma, beta)
+      lossph = l2_loss::forward(outph, y)
+      gamma[i,j] = old  # reset
+      dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+                                                  lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking beta.")
+  for (i in 1:nrow(beta)) {
+    for (j in 1:ncol(beta)) {
+      # Compute numerical derivative
+      old = as.scalar(beta[i,j])
+      beta[i,j] = old - h
+      outmh = scale_shift1d::forward(X, gamma, beta)
+      lossmh = l2_loss::forward(outmh, y)
+      beta[i,j] = old + h
+      outph = scale_shift1d::forward(X, gamma, beta)
+      lossph = l2_loss::forward(outph, y)
+      beta[i,j] = old  # reset
+      dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+                                                  lossph, lossmh)
+    }
+  }
+}
+
+scale_shift2d = function() {
+  /*
+   * Gradient check for the 2D scale & shift layer.
+   */
+  print("Grad checking the 2D scale & shift layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  C = 2  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  X = rand(rows=N, cols=C*Hin*Win)
+  y = rand(rows=N, cols=C*Hin*Win)
+  [gamma, beta] = scale_shift2d::init(C)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+  dout = l2_loss::backward(out, y)
+  [dX, dgamma, dbeta] = scale_shift2d::backward(dout, out, X, gamma, beta, C, Hin, Win)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking gamma.")
+  for (i in 1:nrow(gamma)) {
+    for (j in 1:ncol(gamma)) {
+      # Compute numerical derivative
+      old = as.scalar(gamma[i,j])
+      gamma[i,j] = old - h
+      outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+      lossmh = l2_loss::forward(outmh, y)
+      gamma[i,j] = old + h
+      outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+      lossph = l2_loss::forward(outph, y)
+      gamma[i,j] = old  # reset
+      dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+                                                  lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking beta.")
+  for (i in 1:nrow(beta)) {
+    for (j in 1:ncol(beta)) {
+      # Compute numerical derivative
+      old = as.scalar(beta[i,j])
+      beta[i,j] = old - h
+      outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+      lossmh = l2_loss::forward(outmh, y)
+      beta[i,j] = old + h
+      outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+      lossph = l2_loss::forward(outph, y)
+      beta[i,j] = old  # reset
+      dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+                                                  lossph, lossmh)
+    }
+  }
+}
+
+sigmoid = function() {
+  /*
+   * Gradient check for the sigmoid nonlinearity layer.
+   */
+  print("Grad checking the sigmoid nonlinearity layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  M = 10 # num neurons
+  X = rand(rows=N, cols=M)
+  y = rand(rows=N, cols=M)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = sigmoid::forward(X)
+  dout = l2_loss::backward(out, y)
+  dX = sigmoid::backward(dout, X)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = sigmoid::forward(X)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = sigmoid::forward(X)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
+softmax = function() {
+  /*
+   * Gradient check for the softmax layer.
+   */
+  print("Grad checking the softmax layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  D = 10 # num classes
+  X = rand(rows=N, cols=D)
+  y = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
+  y = y / rowSums(y)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = softmax::forward(X)
+  dout = l2_loss::backward(out, y)
+  dX = softmax::backward(dout, X)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = softmax::forward(X)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = softmax::forward(X)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
+tanh = function() {
+  /*
+   * Gradient check for the hyperbolic tangent (tanh) nonlinearity
+   * layer.
+   */
+  print("Grad checking the tanh nonlinearity layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  M = 10 # num neurons
+  X = rand(rows=N, cols=M)
+  y = rand(rows=N, cols=M)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = tanh::forward(X)
+  dout = l2_loss::backward(out, y)
+  dX = tanh::backward(dout, X)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = tanh::forward(X)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = tanh::forward(X)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
+two_layer_affine_l2_net = function() {
+  /*
+   * Gradient check for a two-layer, fully-connected, feed-forward
+   * network with ReLU nonlinearity and L2 loss.
+   *
+   * NOTE: This could result in a false-negative in which the test
+   * fails due to a kink being crossed in the ReLU nonlinearity.  This
+   * occurs when the tests, f(x-h) and f(x+h), end up on opposite
+   * sides of the zero threshold of max(0, fx).  For now, just run
+   * the tests again.  In the future, we can explicitly check for
+   * this and rerun the test automatically.
+   */
+  print("Grad checking a two-layer, fully-connected, feed-forward network with a ReLU " +
+        "nonlinearity, and an L2 loss function.")
+
+  # Generate input data
+  N = 1000 # num examples
+  D = 100 # num features
+  yD = 5 # num targets
+  X = rand(rows=N, cols=D, pdf="normal")
+  y = rand(rows=N, cols=yD)
+
+  # Create 2-layer, fully-connected network
+  M = 10 # number of hidden neurons
+  [W1, b1] = affine::init(D, M)
+  [W2, b2] = affine::init(M, yD)
+
+  # Optimize for short "burn-in" time to move to characteristic
+  # mode of operation and unmask any real issues.
+  print(" - Burn-in:")
+  lr = 0.0001
+  decay = 0.99
+  for(i in 1:5) {
+    # Compute forward and backward passes of net
+    [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
+    print("   - L2 loss: " + loss)
+
+    # Optimize with basic SGD
+    W1 = W1 - lr * dW1
+    b1 = b1 - lr * db1
+    W2 = W2 - lr * dW2
+    b2 = b2 - lr * db2
+    lr = lr * decay
+  }
+
+  # Compute analytical gradients
+  [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:2) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old_x = as.scalar(X[i,j])
+      X[i,j] = old_x - h
+      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      X[i,j] = old_x + h
+      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      X[i,j] = old_x  # reset X[i,j]
+      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W1.")
+  for (i in 1:nrow(W1)) {
+    for (j in 1:ncol(W1)) {
+      # Compute numerical derivative
+      old_w = as.scalar(W1[i,j])
+      W1[i,j] = old_w - h
+      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      W1[i,j] = old_w + h
+      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      W1[i,j] = old_w  # reset W1[i,j]
+      dWij_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW1[i,j]), dWij_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W2.")
+  for (i in 1:nrow(W2)) {
+    for (j in 1:ncol(W2)) {
+      # Compute numerical derivative
+      old_w = as.scalar(W2[i,j])
+      W2[i,j] = old_w - h
+      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      W2[i,j] = old_w + h
+      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      W2[i,j] = old_w  # reset W2[i,j]
+      dWij_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(dW2[i,j]), dWij_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b1.")
+  for (i in 1:nrow(b1)) {
+    for (j in 1:ncol(b1)) {
+      # Compute numerical derivative
+      old_b = as.scalar(b1[i,j])
+      b1[i,j] = old_b - h
+      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      b1[i,j] = old_b + h
+      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      b1[i,j] = old_b  # reset b1[i,j]
+      dbij_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(db1[i,j]), dbij_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b2.")
+  for (i in 1:nrow(b2)) {
+    for (j in 1:ncol(b2)) {
+      # Compute numerical derivative
+      old_b = as.scalar(b2[i,j])
+      b2[i,j] = old_b - h
+      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      b2[i,j] = old_b + h
+      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      b2[i,j] = old_b  # reset b2[i,j]
+      dbij_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+      # Check error
+      rel_error = test_util::check_rel_grad_error(as.scalar(db2[i,j]), dbij_num, lossph, lossmh)
+    }
+  }
+}
+
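As an editorial follow-up to the NOTE above: a kink is crossed exactly when some hidden
pre-activation changes sign between the f(x-h) and f(x+h) evaluations, so one hedged way to
detect it is to compare the signs of the two hout matrices returned by the forward helper.
A minimal sketch (the helper name is hypothetical and not part of this commit):

  # Hypothetical helper: TRUE if any ReLU input changed sign between the two evaluations,
  # in which case the numerical estimate for that entry should not be trusted.
  kink_crossed = function(matrix[double] hout_mh, matrix[double] hout_ph)
      return (boolean crossed) {
    crossed = sum((hout_mh > 0) != (hout_ph > 0)) > 0
  }
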
+/*
+ * Test network with forward/backward functions.
+ */
+two_layer_affine_l2_net_run = function(matrix[double] X, matrix[double] y,
+                                       matrix[double] W1, matrix[double] b1,
+                                       matrix[double] W2, matrix[double] b2)
+    return (matrix[double] pred, double loss,
+            matrix[double] dX,
+            matrix[double] dW1, matrix[double] db1,
+            matrix[double] dW2, matrix[double] db2) {
+  # Compute forward pass
+  [loss, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+
+  # Compute backward pass
+  [dX, dpred, daout, dhout, dW1, db1, dW2, db2] =
+      two_layer_affine_l2_net_backward(X, y, pred, aout, hout, W1, b1, W2, b2)
+}
+
+two_layer_affine_l2_net_forward = function(matrix[double] X, matrix[double] y,
+                                           matrix[double] W1, matrix[double] b1,
+                                           matrix[double] W2, matrix[double] b2)
+    return (double loss, matrix[double] pred, matrix[double] aout, matrix[double] hout) {
+  # Compute forward pass
+  hout = affine::forward(X, W1, b1)
+  aout = relu::forward(hout)
+  pred = affine::forward(aout, W2, b2)
+
+  # Compute loss
+  loss = l2_loss::forward(pred, y)
+}
+
+two_layer_affine_l2_net_backward = function(matrix[double] X, matrix[double] y, matrix[double] pred,
+                                            matrix[double] aout, matrix[double] hout,
+                                            matrix[double] W1, matrix[double] b1,
+                                            matrix[double] W2, matrix[double] b2)
+    return (matrix[double] dX, matrix[double] dpred,
+            matrix[double] daout, matrix[double] dhout,
+            matrix[double] dW1, matrix[double] db1, matrix[double] dW2, matrix[double] db2) {
+  # Compute backward pass
+  dpred = l2_loss::backward(pred, y)
+  [daout, dW2, db2] = affine::backward(dpred, aout, W2, b2)
+  dhout = relu::backward(daout, hout)
+  [dX, dW1, db1] = affine::backward(dhout, X, W1, b1)
+}
+
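As a quick usage sketch (editorial; shapes mirror those used in the grad check above), the
forward helper alone yields the loss, predictions, and the intermediate activations needed
for a numerical probe:

  X = rand(rows=8, cols=100, pdf="normal")
  y = rand(rows=8, cols=5)
  [W1, b1] = affine::init(100, 10)
  [W2, b2] = affine::init(10, 5)
  [loss, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)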

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/max_pool2d_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/max_pool2d_simple.dml b/scripts/nn/test/max_pool2d_simple.dml
new file mode 100644
index 0000000..188bd6e
--- /dev/null
+++ b/scripts/nn/test/max_pool2d_simple.dml
@@ -0,0 +1,172 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Max Pooling layer.
+ *
+ * This implementation is intended to be a simple, reference version.
+ */
+
+forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * This implementation is intended to be a simple, reference version.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+
+  # Create output volume
+  out = matrix(0, rows=N, cols=C*Hout*Wout)
+
+  # Max pooling
+  parfor (n in 1:N, check=0) {  # all examples
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+
+    # Pad image
+    pad_value = -1/0  # -infinity, so padded cells can never win the max
+    Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # pad with -inf
+    parfor (c in 1:C) {
+      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice c reshaped
+      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+    }
+    img = Xn_padded  # shape (C, (Hin+2*padh)*(Win+2*padw))
+
+    parfor (c in 1:C, check=0) {  # all channels
+      img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      parfor (hout in 1:Hout, check=0) {  # all output rows
+        hin = (hout-1) * strideh + 1
+        parfor (wout in 1:Wout, check=0) {  # all output columns
+          win = (wout-1) * stridew + 1
+          out[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout] = max(img_slice[hin:hin+Hf-1,
+                                                               win:win+Wf-1])
+        }
+      }
+    }
+  }
+}
+
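To make the output-shape formula above concrete: with Hin = Win = 4, Hf = Wf = 2, stride 2,
and no padding, Hout = floor((4 + 0 - 2)/2 + 1) = 2 and likewise Wout = 2, so each output
entry is the max of one non-overlapping 2x2 block. A minimal editorial sketch (assumes this
file is sourced under the alias used by the tests):

  source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple

  X = matrix(seq(1, 16), rows=1, cols=16)  # N=1, C=1: a 4x4 image holding 1..16 row-wise
  [out, Hout, Wout] = max_pool2d_simple::forward(X, 1, 4, 4, 2, 2, 2, 2, 0, 0)
  # Hout = Wout = 2, and out = [6, 8, 14, 16]: the max of each 2x2 block.
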
+backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   */
+  N = nrow(X)
+
+  # Create gradient volume
+  dX = matrix(0, rows=N, cols=C*Hin*Win)
+
+  # Gradient of max pooling
+  for (n in 1:N) {  # all examples
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+
+    # Pad image
+    pad_value = -1/0  # -infinity, so padded cells can never win the max
+    Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # pad with -inf
+    parfor (c in 1:C) {
+      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice c reshaped
+      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+    }
+    img = Xn_padded
+
+    dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
+    for (c in 1:C) {  # all channels
+      img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
+      for (hout in 1:Hout) {  # all output rows
+        hin = (hout-1) * strideh + 1
+        for (wout in 1:Wout) {  # all output columns
+          win = (wout-1) * stridew + 1
+          img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
+          max_val_ind = img_slice_patch == max(img_slice_patch)  # max value indicator matrix
+          # gradient passes through only for the max value(s) in this patch
+          dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
+          dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
+                                                   + dimg_slice_patch
+        }
+      }
+      dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
+    }
+
+    # Unpad the gradient wrt the input
+    dXn = matrix(0, rows=C, cols=Hin*Win)
+    parfor (c in 1:C, check=0) {
+      dXn_padded_slice = matrix(dimg[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
+      dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
+      dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
+    }
+    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
+  }
+}
+
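The max-value indicator in the backward pass routes the upstream gradient only to the winning
entries of each patch; on ties, every tied maximum receives the full upstream gradient. A tiny
editorial example of that routing:

  patch = matrix("1 3 2 3", rows=2, cols=2)  # two tied maxima of value 3
  ind = patch == max(patch)                  # indicator matrix: [[0, 1], [0, 1]]
  dpatch = ind * 0.5                         # an upstream gradient of 0.5 flows to both maxima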

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/run_tests.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/run_tests.dml b/scripts/nn/test/run_tests.dml
new file mode 100644
index 0000000..d8173a9
--- /dev/null
+++ b/scripts/nn/test/run_tests.dml
@@ -0,0 +1,90 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Script to run tests.
+ */
+source("nn/test/grad_check.dml") as grad_check
+source("nn/test/test.dml") as test
+
+print("")
+print("Starting grad checks.")
+print("---")
+
+# Loss & loss-related functions
+grad_check::cross_entropy_loss()
+grad_check::l1_loss()
+grad_check::l1_reg()
+grad_check::l2_loss()
+grad_check::l2_reg()
+grad_check::log_loss()
+print("")
+
+# Core layers
+grad_check::affine()
+grad_check::batch_norm1d()
+grad_check::batch_norm2d()
+grad_check::conv2d()
+grad_check::conv2d_builtin()
+grad_check::conv2d_simple()
+grad_check::dropout()
+grad_check::lstm()
+grad_check::max_pool2d()
+grad_check::max_pool2d_builtin()
+grad_check::max_pool2d_simple()
+grad_check::relu()
+grad_check::rnn()
+grad_check::scale_shift1d()
+grad_check::scale_shift2d()
+grad_check::sigmoid()
+grad_check::softmax()
+grad_check::tanh()
+print("")
+
+# Example model
+grad_check::two_layer_affine_l2_net()
+print("")
+
+print("---")
+print("Grad checks complete -- look for any ERRORs or WARNINGs.")
+print("If any tests involving ReLUs failed, try a few times " +
+      "to ensure that they were not false negatives due to " +
+      "kinks being crossed.")
+print("")
+
+print("")
+print("Starting other tests.")
+print("---")
+
+test::batch_norm1d()
+test::batch_norm2d()
+test::conv2d()
+test::cross_entropy_loss()
+test::im2col()
+test::max_pool2d()
+test::padding()
+test::tanh()
+
+print("---")
+print("Other tests complete -- look for any ERRORs or WARNINGs.")
+print("")
+print("")
+
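As a hedged usage note (not part of the commit): the relative source() paths above resolve
when the script is launched from the scripts/ directory, e.g. with something like
spark-submit SystemML.jar -f nn/test/run_tests.dml (the jar name and submission command
depend on your SystemML build and environment).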