Posted to commits@systemml.apache.org by du...@apache.org on 2017/04/26 21:42:30 UTC
[04/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn` library to `scripts/nn`
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/grad_check.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/grad_check.dml b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
deleted file mode 100644
index f3bc9a7..0000000
--- a/scripts/staging/SystemML-NN/nn/test/grad_check.dml
+++ /dev/null
@@ -1,1769 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Gradient checks for various architectures.
- */
-source("nn/layers/affine.dml") as affine
-source("nn/layers/batch_norm1d.dml") as batch_norm1d
-source("nn/layers/batch_norm2d.dml") as batch_norm2d
-source("nn/layers/conv2d.dml") as conv2d
-source("nn/layers/conv2d_builtin.dml") as conv2d_builtin
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/dropout.dml") as dropout
-source("nn/layers/l1_loss.dml") as l1_loss
-source("nn/layers/l1_reg.dml") as l1_reg
-source("nn/layers/l2_loss.dml") as l2_loss
-source("nn/layers/l2_reg.dml") as l2_reg
-source("nn/layers/log_loss.dml") as log_loss
-source("nn/layers/lstm.dml") as lstm
-source("nn/layers/max_pool2d.dml") as max_pool2d
-source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
-source("nn/layers/relu.dml") as relu
-source("nn/layers/rnn.dml") as rnn
-source("nn/layers/scale_shift1d.dml") as scale_shift1d
-source("nn/layers/scale_shift2d.dml") as scale_shift2d
-source("nn/layers/sigmoid.dml") as sigmoid
-source("nn/layers/softmax.dml") as softmax
-source("nn/layers/tanh.dml") as tanh
-source("nn/test/conv2d_simple.dml") as conv2d_simple
-source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
-source("nn/test/util.dml") as test_util
-
-affine = function() {
- /*
- * Gradient check for the affine layer.
- */
- print("Grad checking the affine layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- D = 100 # num features
- M = 10 # num neurons
- X = rand(rows=N, cols=D)
- y = rand(rows=N, cols=M)
- [W, b] = affine::init(D, M)
-
- # Compute analytical gradients of loss wrt parameters
- out = affine::forward(X, W, b)
- dout = l2_loss::backward(out, y)
- [dX, dW, db] = affine::backward(dout, X, W, b)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- outmh = affine::forward(X, W, b)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- outph = affine::forward(X, W, b)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking W.")
- for (i in 1:nrow(W)) {
- for (j in 1:ncol(W)) {
- # Compute numerical derivative
- old = as.scalar(W[i,j])
- W[i,j] = old - h
- outmh = affine::forward(X, W, b)
- lossmh = l2_loss::forward(outmh, y)
- W[i,j] = old + h
- outph = affine::forward(X, W, b)
- lossph = l2_loss::forward(outph, y)
- W[i,j] = old # reset
- dW_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking b.")
- for (i in 1:nrow(b)) {
- for (j in 1:ncol(b)) {
- # Compute numerical derivative
- old = as.scalar(b[i,j])
- b[i,j] = old - h
- outmh = affine::forward(X, W, b)
- lossmh = l2_loss::forward(outmh, y)
- b[i,j] = old + h
- outph = affine::forward(X, W, b)
- lossph = l2_loss::forward(outph, y)
- b[i,j] = old # reset
- db_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
- }
- }
-}
-
-batch_norm1d = function() {
- /*
- * Gradient check for the 1D batch normalization layer.
- */
- print("Grad checking the 1D batch normalization layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- D = 100 # num features
- mu = 0.9 # momentum
- eps = 1e-5 # epsilon
- X = rand(rows=N, cols=D)
- y = rand(rows=N, cols=D)
- gamma = rand(rows=1, cols=D)
- beta = rand(rows=1, cols=D)
- ema_mean = rand(rows=1, cols=D)
- ema_var = rand(rows=1, cols=D)
- #[dummy, dummy, ema_mean, ema_var] = batch_norm1d::init(D)
-
- # Check training & testing modes
- for (i in 1:2) {
- if (i == 1)
- mode = 'train'
- else
- mode = 'test'
- print(" - Grad checking the '"+mode+"' mode.")
-
- # Compute analytical gradients of loss wrt parameters
- [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
- dout = l2_loss::backward(out, y)
- [dX, dgamma, dbeta] = batch_norm1d::backward(dout, out, ema_mean_upd, ema_var_upd,
- cache_mean, cache_var, cache_norm,
- X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking gamma.")
- for (i in 1:nrow(gamma)) {
- for (j in 1:ncol(gamma)) {
- # Compute numerical derivative
- old = as.scalar(gamma[i,j])
- gamma[i,j] = old - h
- [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
- lossmh = l2_loss::forward(outmh, y)
- gamma[i,j] = old + h
- [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
- lossph = l2_loss::forward(outph, y)
- gamma[i,j] = old # reset
- dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
- lossph, lossmh)
- }
- }
-
- print(" - Grad checking beta.")
- for (i in 1:nrow(beta)) {
- for (j in 1:ncol(beta)) {
- # Compute numerical derivative
- old = as.scalar(beta[i,j])
- beta[i,j] = old - h
- [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
- lossmh = l2_loss::forward(outmh, y)
- beta[i,j] = old + h
- [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
- lossph = l2_loss::forward(outph, y)
- beta[i,j] = old # reset
- dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
- lossph, lossmh)
- }
- }
- }
-}
-
-batch_norm2d = function() {
- /*
- * Gradient check for the 2D (spatial) batch normalization layer.
- */
- print("Grad checking the 2D (spatial) batch normalization layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- C = 2 # num channels
- Hin = 5 # input height
- Win = 5 # input width
- mu = 0.9 # momentum
- eps = 1e-5 # epsilon
- X = rand(rows=N, cols=C*Hin*Win)
- y = rand(rows=N, cols=C*Hin*Win)
- gamma = rand(rows=C, cols=1)
- beta = rand(rows=C, cols=1)
- ema_mean = rand(rows=C, cols=1)
- ema_var = rand(rows=C, cols=1)
- #[dummy, dummy, ema_mean, ema_var] = batch_norm2d::init(C)
-
- # Check training & testing modes
- for (i in 1:2) {
- if (i == 1)
- mode = 'train'
- else
- mode = 'test'
- print(" - Grad checking the '"+mode+"' mode.")
-
- # Compute analytical gradients of loss wrt parameters
- [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- dout = l2_loss::backward(out, y)
- [dX, dgamma, dbeta] = batch_norm2d::backward(dout, out, ema_mean_upd, ema_var_upd,
- cache_mean, cache_var, cache_norm,
- X, gamma, beta, C, Hin, Win, mode,
- ema_mean, ema_var, mu, eps)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking gamma.")
- for (i in 1:nrow(gamma)) {
- for (j in 1:ncol(gamma)) {
- # Compute numerical derivative
- old = as.scalar(gamma[i,j])
- gamma[i,j] = old - h
- [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossmh = l2_loss::forward(outmh, y)
- gamma[i,j] = old + h
- [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossph = l2_loss::forward(outph, y)
- gamma[i,j] = old # reset
- dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
- lossph, lossmh)
- }
- }
-
- print(" - Grad checking beta.")
- for (i in 1:nrow(beta)) {
- for (j in 1:ncol(beta)) {
- # Compute numerical derivative
- old = as.scalar(beta[i,j])
- beta[i,j] = old - h
- [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossmh = l2_loss::forward(outmh, y)
- beta[i,j] = old + h
- [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossph = l2_loss::forward(outph, y)
- beta[i,j] = old # reset
- dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
- lossph, lossmh)
- }
- }
- }
-}
-
-conv2d = function() {
- /*
- * Gradient check for the 2D convolutional layer using `im2col`.
- */
- print("Grad checking the `im2col` 2D convolutional layer with L2 loss.")
-
- # Generate data
- N = 2 # num examples
- C = 2 # num channels
- Hin = 5 # input height
- Win = 5 # input width
- F = 2 # num filters
- Hf = 3 # filter height
- Wf = 3 # filter width
- stride = 1
- pad = 1
- X = rand(rows=N, cols=C*Hin*Win)
- y = rand(rows=N, cols=F*Hin*Win)
-
- # Create layers
- [W, b] = conv2d::init(F, C, Hf, Wf)
-
- # Compute analytical gradients of loss wrt parameters
- [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- dout = l2_loss::backward(out, y)
- [dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking W.")
- for (i in 1:nrow(W)) {
- for (j in 1:ncol(W)) {
- # Compute numerical derivative
- old = as.scalar(W[i,j])
- W[i,j] = old - h
- [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- W[i,j] = old + h
- [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- lossph = l2_loss::forward(outph, y)
- W[i,j] = old # reset
- dW_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking b.")
- for (i in 1:nrow(b)) {
- for (j in 1:ncol(b)) {
- # Compute numerical derivative
- old = as.scalar(b[i,j])
- b[i,j] = old - h
- [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- b[i,j] = old + h
- [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- lossph = l2_loss::forward(outph, y)
- b[i,j] = old # reset
- db_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
- }
- }
-}
-
-conv2d_builtin = function() {
- /*
- * Gradient check for the 2D convolutional layer using built-in
- * functions.
- */
- print("Grad checking the built-in 2D convolutional layer with L2 loss.")
-
- # Generate data
- N = 2 # num examples
- C = 2 # num channels
- Hin = 5 # input height
- Win = 5 # input width
- F = 2 # num filters
- Hf = 3 # filter height
- Wf = 3 # filter width
- stride = 1
- pad = 1
- X = rand(rows=N, cols=C*Hin*Win)
- y = rand(rows=N, cols=F*Hin*Win)
-
- # Create layers
- [W, b] = conv2d_builtin::init(F, C, Hf, Wf)
-
- # Compute analytical gradients of loss wrt parameters
- [out, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- dout = l2_loss::backward(out, y)
- [dX, dW, db] = conv2d_builtin::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
- stride, stride, pad, pad)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking W.")
- for (i in 1:nrow(W)) {
- for (j in 1:ncol(W)) {
- # Compute numerical derivative
- old = as.scalar(W[i,j])
- W[i,j] = old - h
- [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- W[i,j] = old + h
- [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossph = l2_loss::forward(outph, y)
- W[i,j] = old # reset
- dW_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking b.")
- for (i in 1:nrow(b)) {
- for (j in 1:ncol(b)) {
- # Compute numerical derivative
- old = as.scalar(b[i,j])
- b[i,j] = old - h
- [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- b[i,j] = old + h
- [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossph = l2_loss::forward(outph, y)
- b[i,j] = old # reset
- db_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
- }
- }
-}
-
-conv2d_simple = function() {
- /*
- * Gradient check for the simple reference 2D convolutional layer.
- */
- print("Grad checking the simple reference 2D convolutional layer with L2 loss.")
-
- # Generate data
- N = 2 # num examples
- C = 2 # num channels
- Hin = 5 # input height
- Win = 5 # input width
- F = 2 # num filters
- Hf = 3 # filter height
- Wf = 3 # filter width
- stride = 1
- pad = 1
- X = rand(rows=N, cols=C*Hin*Win)
- y = rand(rows=N, cols=F*Hin*Win)
-
- # Create layers
- [W, b] = conv2d_simple::init(F, C, Hf, Wf)
-
- # Compute analytical gradients of loss wrt parameters
- [out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- dout = l2_loss::backward(out, y)
- [dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
- stride, stride, pad, pad)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking W.")
- for (i in 1:nrow(W)) {
- for (j in 1:ncol(W)) {
- # Compute numerical derivative
- old = as.scalar(W[i,j])
- W[i,j] = old - h
- [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- W[i,j] = old + h
- [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossph = l2_loss::forward(outph, y)
- W[i,j] = old # reset
- dW_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking b.")
- for (i in 1:nrow(b)) {
- for (j in 1:ncol(b)) {
- # Compute numerical derivative
- old = as.scalar(b[i,j])
- b[i,j] = old - h
- [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- b[i,j] = old + h
- [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossph = l2_loss::forward(outph, y)
- b[i,j] = old # reset
- db_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
- }
- }
-}
-
-cross_entropy_loss = function() {
- /*
- * Gradient check for the cross-entropy loss function.
- */
- print("Grad checking the cross-entropy loss function.")
-
- # Generate data
- N = 3 # num examples
- K = 10 # num targets
- pred = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
- pred = pred / rowSums(pred) # normalized probs
- y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
- y = y / rowSums(y) # normalized probs
-
- # Compute analytical gradient
- dpred = cross_entropy_loss::backward(pred, y)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(pred)) {
- for (j in 1:ncol(pred)) {
- # Compute numerical derivative
- old = as.scalar(pred[i,j])
- pred[i,j] = old - h
- lossmh = cross_entropy_loss::forward(pred, y)
- pred[i,j] = old + h
- lossph = cross_entropy_loss::forward(pred, y)
- pred[i,j] = old # reset pred[i,j]
- dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
- }
- }
-}
-
-dropout = function() {
- /*
- * Gradient check for the (inverted) dropout layer.
- */
- print("Grad checking the (inverted) dropout layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- M = 100 # num neurons
- p = 0.5 # probability of dropping neuron output
- seed = as.integer(floor(as.scalar(rand(rows=1, cols=1, min=1, max=100000)))) # random seed
- X = rand(rows=N, cols=M)
- y = rand(rows=N, cols=M)
-
- # Compute analytical gradients of loss wrt parameters
- [out, mask] = dropout::forward(X, p, seed)
- dout = l2_loss::backward(out, y)
- dX = dropout::backward(dout, X, p, mask)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, mask] = dropout::forward(X, p, seed)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, mask] = dropout::forward(X, p, seed)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-}
-
-l1_loss = function() {
- /*
- * Gradient check for the L1 loss function.
- */
- print("Grad checking the L1 loss function.")
-
- # Generate data
- N = 3 # num examples
- D = 2 # num targets
- pred = rand(rows=N, cols=D)
- y = rand(rows=N, cols=D)
-
- # Compute analytical gradient
- dpred = l1_loss::backward(pred, y)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(pred)) {
- for (j in 1:ncol(pred)) {
- # Compute numerical derivative
- old = as.scalar(pred[i,j])
- pred[i,j] = old - h
- lossmh = l1_loss::forward(pred, y)
- pred[i,j] = old + h
- lossph = l1_loss::forward(pred, y)
- pred[i,j] = old # reset pred[i,j]
- dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
- }
- }
-}
-
-l1_reg = function() {
- /*
- * Gradient check for the L1 regularization function.
- */
- print("Grad checking the L1 regularization function.")
-
- # Generate data
- D = 5 # num features
- M = 3 # num neurons
- lambda = 0.01
- W = rand(rows=D, cols=M)
-
- # Compute analytical gradient
- dW = l1_reg::backward(W, lambda)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(W)) {
- for (j in 1:ncol(W)) {
- # Compute numerical derivative
- old = as.scalar(W[i,j])
- W[i,j] = old - h
- reg_lossmh = l1_reg::forward(W, lambda)
- W[i,j] = old + h
- reg_lossph = l1_reg::forward(W, lambda)
- W[i,j] = old # reset W[i,j]
- dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
- reg_lossph, reg_lossmh)
- }
- }
-}
-
-l2_loss = function() {
- /*
- * Gradient check for the L2 loss function.
- */
- print("Grad checking the L2 loss function.")
-
- # Generate data
- N = 3 # num examples
- D = 2 # num targets
- pred = rand(rows=N, cols=D)
- y = rand(rows=N, cols=D)
-
- # Compute analytical gradient
- dpred = l2_loss::backward(pred, y)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(pred)) {
- for (j in 1:ncol(pred)) {
- # Compute numerical derivative
- old = as.scalar(pred[i,j])
- pred[i,j] = old - h
- lossmh = l2_loss::forward(pred, y)
- pred[i,j] = old + h
- lossph = l2_loss::forward(pred, y)
- pred[i,j] = old # reset pred[i,j]
- dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
- }
- }
-}
-
-l2_reg = function() {
- /*
- * Gradient check for the L2 regularization function.
- */
- print("Grad checking the L2 regularization function.")
-
- # Generate data
- D = 5 # num features
- M = 3 # num neurons
- lambda = 0.01
- W = rand(rows=D, cols=M)
-
- # Compute analytical gradient
- dW = l2_reg::backward(W, lambda)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(W)) {
- for (j in 1:ncol(W)) {
- # Compute numerical derivative
- old = as.scalar(W[i,j])
- W[i,j] = old - h
- reg_lossmh = l2_reg::forward(W, lambda)
- W[i,j] = old + h
- reg_lossph = l2_reg::forward(W, lambda)
- W[i,j] = old # reset W[i,j]
- dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
- reg_lossph, reg_lossmh)
- }
- }
-}
-
-log_loss = function() {
- /*
- * Gradient check for the log loss function.
- */
- print("Grad checking the log loss function.")
-
- # Generate data
- N = 20 # num examples
- D = 1 # num targets
- pred = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
- y = round(rand(rows=N, cols=D, min=0, max=1, pdf="uniform"))
-
- # Compute analytical gradient
- dpred = log_loss::backward(pred, y)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(pred)) {
- for (j in 1:ncol(pred)) {
- # Compute numerical derivative
- old = as.scalar(pred[i,j])
- pred[i,j] = old - h
- lossmh = log_loss::forward(pred, y)
- pred[i,j] = old + h
- lossph = log_loss::forward(pred, y)
- pred[i,j] = old # reset pred[i,j]
- dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
- }
- }
-}
-
-lstm = function() {
- /*
- * Gradient check for the LSTM layer.
- */
- print("Grad checking the LSTM layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- D = 10 # num features
- T = 15 # num timesteps (sequence length)
- M = 5 # num neurons
- return_seq = TRUE
- X = rand(rows=N, cols=T*D)
- y = rand(rows=N, cols=T*M)
- yc = rand(rows=N, cols=M)
- out0 = rand(rows=N, cols=M)
- c0 = rand(rows=N, cols=M)
- [W, b, dummy, dummy2] = lstm::init(N, D, M)
-
- # Compute analytical gradients of loss wrt parameters
- [out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- dout = l2_loss::backward(out, y)
- dc = l2_loss::backward(c, yc)
- [dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, return_seq, out0, c0,
- cache_out, cache_c, cache_ifog)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outmh = l2_loss::forward(outmh, y)
- loss_cmh = l2_loss::forward(cmh, yc)
- lossmh = loss_outmh + loss_cmh
- X[i,j] = old + h
- [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outph = l2_loss::forward(outph, y)
- loss_cph = l2_loss::forward(cph, yc)
- lossph = loss_outph + loss_cph
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking W.")
- for (i in 1:nrow(W)) {
- for (j in 1:ncol(W)) {
- # Compute numerical derivative
- old = as.scalar(W[i,j])
- W[i,j] = old - h
- [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outmh = l2_loss::forward(outmh, y)
- loss_cmh = l2_loss::forward(cmh, yc)
- lossmh = loss_outmh + loss_cmh
- W[i,j] = old + h
- [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outph = l2_loss::forward(outph, y)
- loss_cph = l2_loss::forward(cph, yc)
- lossph = loss_outph + loss_cph
- W[i,j] = old # reset
- dW_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking b.")
- for (i in 1:nrow(b)) {
- for (j in 1:ncol(b)) {
- # Compute numerical derivative
- old = as.scalar(b[i,j])
- b[i,j] = old - h
- [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outmh = l2_loss::forward(outmh, y)
- loss_cmh = l2_loss::forward(cmh, yc)
- lossmh = loss_outmh + loss_cmh
- b[i,j] = old + h
- [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outph = l2_loss::forward(outph, y)
- loss_cph = l2_loss::forward(cph, yc)
- lossph = loss_outph + loss_cph
- b[i,j] = old # reset
- db_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking out0.")
- for (i in 1:nrow(out0)) {
- for (j in 1:ncol(out0)) {
- # Compute numerical derivative
- old = as.scalar(out0[i,j])
- out0[i,j] = old - h
- [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outmh = l2_loss::forward(outmh, y)
- loss_cmh = l2_loss::forward(cmh, yc)
- lossmh = loss_outmh + loss_cmh
- out0[i,j] = old + h
- [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outph = l2_loss::forward(outph, y)
- loss_cph = l2_loss::forward(cph, yc)
- lossph = loss_outph + loss_cph
- out0[i,j] = old # reset
- dout0_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking c0.")
- for (i in 1:nrow(c0)) {
- for (j in 1:ncol(c0)) {
- # Compute numerical derivative
- old = as.scalar(c0[i,j])
- c0[i,j] = old - h
- [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outmh = l2_loss::forward(outmh, y)
- loss_cmh = l2_loss::forward(cmh, yc)
- lossmh = loss_outmh + loss_cmh
- c0[i,j] = old + h
- [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outph = l2_loss::forward(outph, y)
- loss_cph = l2_loss::forward(cph, yc)
- lossph = loss_outph + loss_cph
- c0[i,j] = old # reset
- dc0_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh)
- }
- }
-}
-
-max_pool2d = function() {
- /*
- * Gradient check for the 2D max pooling layer.
- */
- print("Grad checking the 2D max pooling layer with L2 loss.")
-
- # Generate data
- N = 2 # num examples
- C = 2 # num channels
- Hin = 4 # input height
- Win = 4 # input width
- Hf = 2 # pool filter height
- Wf = 2 # pool filter width
- stride = 2
- X = rand(rows=N, cols=C*Hin*Win)
-
- for (pad in 0:1) {
- print(" - Grad checking w/ pad="+pad+".")
- Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
- Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
- y = rand(rows=N, cols=C*Hout*Wout)
-
- # Compute analytical gradients of loss wrt parameters
- [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- dout = l2_loss::backward(out, y)
- dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
- }
-}
-
-max_pool2d_builtin = function() {
- /*
- * Gradient check for the 2D max pooling layer.
- */
- print("Grad checking the built-in 2D max pooling layer with L2 loss.")
-
- # Generate data
- N = 2 # num examples
- C = 2 # num channels
- Hin = 4 # input height
- Win = 4 # input width
- Hf = 2 # pool filter height
- Wf = 2 # pool filter width
- stride = 2
- X = rand(rows=N, cols=C*Hin*Win)
-
- for (pad in 0:1) {
- print(" - Grad checking w/ pad="+pad+".")
- Hout = as.integer(floor((Hin + 2 * pad - Hf) / stride + 1))
- Wout = as.integer(floor((Win + 2 * pad - Wf) / stride + 1))
- y = rand(rows=N, cols=C*Hout*Wout)
-
- # Compute analytical gradients of loss wrt parameters
- [out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- dout = l2_loss::backward(out, y)
- dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
- }
-}
-
-max_pool2d_simple = function() {
- /*
- * Gradient check for the simple reference 2D max pooling layer.
- */
- print("Grad checking the simple reference 2D max pooling layer with L2 loss.")
-
- # Generate data
- N = 2 # num examples
- C = 2 # num channels
- Hin = 4 # input height
- Win = 4 # input width
- Hf = 2 # pool filter height
- Wf = 2 # pool filter width
- stride = 2
- X = rand(rows=N, cols=C*Hin*Win)
-
- for (pad in 0:1) {
- print(" - Grad checking w/ pad="+pad+".")
- Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
- Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
- y = rand(rows=N, cols=C*Hout*Wout)
-
- # Compute analytical gradients of loss wrt parameters
- [out, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- dout = l2_loss::backward(out, y)
- dX = max_pool2d_simple::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
- }
-}
-
-relu = function() {
- /*
- * Gradient check for the ReLU nonlinearity layer.
- *
- * NOTE: This could result in a false-negative in which the test
- * fails due to a kink being crossed in the nonlinearity. This
- * occurs when the tests, f(x-h) and f(x+h), end up on opposite
- * sides of the zero threshold of max(0, fx). For now, just run
- * the tests again. In the future, we can explicitly check for
- * this and rerun the test automatically.
- */
- print("Grad checking the ReLU nonlinearity layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- M = 10 # num neurons
- X = rand(rows=N, cols=M, min=-5, max=5)
- y = rand(rows=N, cols=M)
-
- # Compute analytical gradients of loss wrt parameters
- out = relu::forward(X)
- dout = l2_loss::backward(out, y)
- dX = relu::backward(dout, X)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- outmh = relu::forward(X)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- outph = relu::forward(X)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-}
-
-rnn = function() {
- /*
- * Gradient check for the simple RNN layer.
- */
- print("Grad checking the simple RNN layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- D = 10 # num features
- T = 15 # num timesteps (sequence length)
- M = 5 # num neurons
- return_seq = TRUE
- X = rand(rows=N, cols=T*D)
- y = rand(rows=N, cols=T*M)
- out0 = rand(rows=N, cols=M)
- [W, b, dummy] = rnn::init(N, D, M)
-
- # Compute analytical gradients of loss wrt parameters
- [out, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- dout = l2_loss::backward(out, y)
- [dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, return_seq, out0, cache_out)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking W.")
- for (i in 1:nrow(W)) {
- for (j in 1:ncol(W)) {
- # Compute numerical derivative
- old = as.scalar(W[i,j])
- W[i,j] = old - h
- [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- lossmh = l2_loss::forward(outmh, y)
- W[i,j] = old + h
- [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- lossph = l2_loss::forward(outph, y)
- W[i,j] = old # reset
- dW_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking b.")
- for (i in 1:nrow(b)) {
- for (j in 1:ncol(b)) {
- # Compute numerical derivative
- old = as.scalar(b[i,j])
- b[i,j] = old - h
- [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- lossmh = l2_loss::forward(outmh, y)
- b[i,j] = old + h
- [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- lossph = l2_loss::forward(outph, y)
- b[i,j] = old # reset
- db_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking out0.")
- for (i in 1:nrow(out0)) {
- for (j in 1:ncol(out0)) {
- # Compute numerical derivative
- old = as.scalar(out0[i,j])
- out0[i,j] = old - h
- [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- lossmh = l2_loss::forward(outmh, y)
- out0[i,j] = old + h
- [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- lossph = l2_loss::forward(outph, y)
- out0[i,j] = old # reset
- dout0_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
- }
- }
-}
-
-scale_shift1d = function() {
- /*
- * Gradient check for the 1D scale & shift layer.
- */
- print("Grad checking the 1D scale & shift layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- D = 100 # num features
- X = rand(rows=N, cols=D)
- y = rand(rows=N, cols=D)
- [gamma, beta] = scale_shift1d::init(D)
-
- # Compute analytical gradients of loss wrt parameters
- out = scale_shift1d::forward(X, gamma, beta)
- dout = l2_loss::backward(out, y)
- [dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- outmh = scale_shift1d::forward(X, gamma, beta)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- outph = scale_shift1d::forward(X, gamma, beta)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking gamma.")
- for (i in 1:nrow(gamma)) {
- for (j in 1:ncol(gamma)) {
- # Compute numerical derivative
- old = as.scalar(gamma[i,j])
- gamma[i,j] = old - h
- outmh = scale_shift1d::forward(X, gamma, beta)
- lossmh = l2_loss::forward(outmh, y)
- gamma[i,j] = old + h
- outph = scale_shift1d::forward(X, gamma, beta)
- lossph = l2_loss::forward(outph, y)
- gamma[i,j] = old # reset
- dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
- lossph, lossmh)
- }
- }
-
- print(" - Grad checking beta.")
- for (i in 1:nrow(beta)) {
- for (j in 1:ncol(beta)) {
- # Compute numerical derivative
- old = as.scalar(beta[i,j])
- beta[i,j] = old - h
- outmh = scale_shift1d::forward(X, gamma, beta)
- lossmh = l2_loss::forward(outmh, y)
- beta[i,j] = old + h
- outph = scale_shift1d::forward(X, gamma, beta)
- lossph = l2_loss::forward(outph, y)
- beta[i,j] = old # reset
- dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
- lossph, lossmh)
- }
- }
-}
-
-scale_shift2d = function() {
- /*
- * Gradient check for the 2D scale & shift layer.
- */
- print("Grad checking the 2D scale & shift layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- C = 2 # num channels
- Hin = 5 # input height
- Win = 5 # input width
- X = rand(rows=N, cols=C*Hin*Win)
- y = rand(rows=N, cols=C*Hin*Win)
- [gamma, beta] = scale_shift2d::init(C)
-
- # Compute analytical gradients of loss wrt parameters
- out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
- dout = l2_loss::backward(out, y)
- [dX, dgamma, dbeta] = scale_shift2d::backward(dout, out, X, gamma, beta, C, Hin, Win)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking gamma.")
- for (i in 1:nrow(gamma)) {
- for (j in 1:ncol(gamma)) {
- # Compute numerical derivative
- old = as.scalar(gamma[i,j])
- gamma[i,j] = old - h
- outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
- lossmh = l2_loss::forward(outmh, y)
- gamma[i,j] = old + h
- outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
- lossph = l2_loss::forward(outph, y)
- gamma[i,j] = old # reset
- dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
- lossph, lossmh)
- }
- }
-
- print(" - Grad checking beta.")
- for (i in 1:nrow(beta)) {
- for (j in 1:ncol(beta)) {
- # Compute numerical derivative
- old = as.scalar(beta[i,j])
- beta[i,j] = old - h
- outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
- lossmh = l2_loss::forward(outmh, y)
- beta[i,j] = old + h
- outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
- lossph = l2_loss::forward(outph, y)
- beta[i,j] = old # reset
- dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
- lossph, lossmh)
- }
- }
-}
-
-sigmoid = function() {
- /*
- * Gradient check for the sigmoid nonlinearity layer.
- */
- print("Grad checking the sigmoid nonlinearity layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- M = 10 # num neurons
- X = rand(rows=N, cols=M)
- y = rand(rows=N, cols=M)
-
- # Compute analytical gradients of loss wrt parameters
- out = sigmoid::forward(X)
- dout = l2_loss::backward(out, y)
- dX = sigmoid::backward(dout, X)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- outmh = sigmoid::forward(X)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- outph = sigmoid::forward(X)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-}
-
-softmax = function() {
- /*
- * Gradient check for the softmax layer.
- */
- print("Grad checking the softmax layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- D = 10 # num classes
- X = rand(rows=N, cols=D)
- y = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
- y = y / rowSums(y)
-
- # Compute analytical gradients of loss wrt parameters
- out = softmax::forward(X)
- dout = l2_loss::backward(out, y)
- dX = softmax::backward(dout, X)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- outmh = softmax::forward(X)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- outph = softmax::forward(X)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-}
-
-tanh = function() {
- /*
- * Gradient check for the hyperbolic tangent (tanh) nonlinearity
- * layer.
- */
- print("Grad checking the tanh nonlinearity layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- M = 10 # num neurons
- X = rand(rows=N, cols=M)
- y = rand(rows=N, cols=M)
-
- # Compute analytical gradients of loss wrt parameters
- out = tanh::forward(X)
- dout = l2_loss::backward(out, y)
- dX = tanh::backward(dout, X)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- outmh = tanh::forward(X)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- outph = tanh::forward(X)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-}
-
-two_layer_affine_l2_net = function() {
- /*
- * Gradient check for a two-layer, fully-connected, feed-forward
- * network with ReLU nonlinearity and L2 loss.
- *
- * NOTE: This could result in a false-negative in which the test
- * fails due to a kink being crossed in the ReLU nonlinearity. This
- * occurs when the tests, f(x-h) and f(x+h), end up on opposite
- * sides of the zero threshold of max(0, fx). For now, just run
- * the tests again. In the future, we can explicitly check for
- * this and rerun the test automatically.
- */
- print("Grad checking a two-layer, fully-connected, feed-forward network with a ReLU " +
- "nonlinearity, and an L2 loss function.")
-
- # Generate input data
- N = 1000 # num examples
- D = 100 # num features
- yD = 5 # num targets
- X = rand(rows=N, cols=D, pdf="normal")
- y = rand(rows=N, cols=yD)
-
- # Create 2-layer, fully-connected network
- M = 10 # number of hidden neurons
- [W1, b1] = affine::init(D, M)
- [W2, b2] = affine::init(M, yD)
-
- # Optimize for short "burn-in" time to move to characteristic
- # mode of operation and unmask any real issues.
- print(" - Burn-in:")
- lr = 0.0001
- decay = 0.99
- for(i in 1:5) {
- # Compute forward and backward passes of net
- [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
- print(" - L2 loss: " + loss)
-
- # Optimize with basic SGD
- W1 = W1 - lr * dW1
- b1 = b1 - lr * db1
- W2 = W2 - lr * dW2
- b2 = b2 - lr * db2
- lr = lr * decay
- }
-
- # Compute analytical gradients
- [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:2) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old_x = as.scalar(X[i,j])
- X[i,j] = old_x - h
- [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- X[i,j] = old_x + h
- [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- X[i,j] = old_x # reset X[i,j]
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking W1.")
- for (i in 1:nrow(W1)) {
- for (j in 1:ncol(W1)) {
- # Compute numerical derivative
- old_w = as.scalar(W1[i,j])
- W1[i,j] = old_w - h
- [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- W1[i,j] = old_w + h
- [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- W1[i,j] = old_w # reset W1[i,j]
- dWij_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW1[i,j]), dWij_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking W2.")
- for (i in 1:nrow(W2)) {
- for (j in 1:ncol(W2)) {
- # Compute numerical derivative
- old_w = as.scalar(W2[i,j])
- W2[i,j] = old_w - h
- [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- W2[i,j] = old_w + h
- [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- W2[i,j] = old_w # reset W2[i,j]
- dWij_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW2[i,j]), dWij_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking b1.")
- for (i in 1:nrow(b1)) {
- for (j in 1:ncol(b1)) {
- # Compute numerical derivative
- old_b = as.scalar(b1[i,j])
- b1[i,j] = old_b - h
- [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- b1[i,j] = old_b + h
- [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- b1[i,j] = old_b # reset b1[i,j]
- dbij_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(db1[i,j]), dbij_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking b2.")
- for (i in 1:nrow(b2)) {
- for (j in 1:ncol(b2)) {
- # Compute numerical derivative
- old_b = as.scalar(b2[i,j])
- b2[i,j] = old_b - h
- [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- b2[i,j] = old_b + h
- [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- b2[i,j] = old_b # reset b2[i,j]
- dbij_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(db2[i,j]), dbij_num, lossph, lossmh)
- }
- }
-}
-
-/*
- * Test network with forward/backward functions.
- */
-two_layer_affine_l2_net_run = function(matrix[double] X, matrix[double] y,
- matrix[double] W1, matrix[double] b1,
- matrix[double] W2, matrix[double] b2)
- return (matrix[double] pred, double loss,
- matrix[double] dX,
- matrix[double] dW1, matrix[double] db1,
- matrix[double] dW2, matrix[double] db2) {
- # Compute forward pass
- [loss, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
-
- # Compute backward pass
- [dX, dpred, daout, dhout, dW1, db1, dW2, db2] =
- two_layer_affine_l2_net_backward(X, y, pred, aout, hout, W1, b1, W2, b2)
-}
-
-two_layer_affine_l2_net_forward = function(matrix[double] X, matrix[double] y,
- matrix[double] W1, matrix[double] b1,
- matrix[double] W2, matrix[double] b2)
- return (double loss, matrix[double] pred, matrix[double] aout, matrix[double] hout) {
- # Compute forward pass
- hout = affine::forward(X, W1, b1)
- aout = relu::forward(hout)
- pred = affine::forward(aout, W2, b2)
-
- # Compute loss
- loss = l2_loss::forward(pred, y)
-}
-
-two_layer_affine_l2_net_backward = function(matrix[double] X, matrix[double] y, matrix[double] pred,
- matrix[double] aout, matrix[double] hout,
- matrix[double] W1, matrix[double] b1,
- matrix[double] W2, matrix[double] b2)
- return (matrix[double] dX, matrix[double] dpred,
- matrix[double] daout, matrix[double] dhout,
- matrix[double] dW1, matrix[double] db1, matrix[double] dW2, matrix[double] db2) {
- # Compute backward pass
- dpred = l2_loss::backward(pred, y)
- [daout, dW2, db2] = affine::backward(dpred, aout, W2, b2)
- dhout = relu::backward(daout, hout)
- [dX, dW1, db1] = affine::backward(dhout, X, W1, b1)
-}
-
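For readers skimming the deleted grad_check.dml above: every check in the file repeats the same central-difference pattern. Below is a minimal DML sketch of that pattern, based on the W-loop of affine(); nothing here is new API, since `affine`, `l2_loss`, and `test_util::check_rel_grad_error` are the modules sourced at the top of the file, and the exact error thresholds live in nn/test/util.dml.

  # Sketch: numerical gradient check for one parameter matrix W.
  # Perturb each entry by +/- h, re-run the forward pass and loss,
  # and compare the analytical gradient dW against the central difference.
  h = 1e-5
  for (i in 1:nrow(W)) {
    for (j in 1:ncol(W)) {
      old = as.scalar(W[i,j])
      W[i,j] = old - h
      outmh = affine::forward(X, W, b)
      lossmh = l2_loss::forward(outmh, y)   # f(w - h)
      W[i,j] = old + h
      outph = affine::forward(X, W, b)
      lossph = l2_loss::forward(outph, y)   # f(w + h)
      W[i,j] = old                          # restore the entry
      dW_num = (lossph-lossmh) / (2*h)      # central difference
      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
    }
  }

The reported relative error is typically of the form |dW_analytical - dW_numerical| / max(|dW_analytical|, |dW_numerical|), relaxed when the surrounding losses are very small; see check_rel_grad_error in nn/test/util.dml for the exact behavior. The kink caveat noted in relu() and two_layer_affine_l2_net() applies to this pattern as well: if old-h and old+h land on opposite sides of the zero threshold of max(0, x), the central difference mixes two linear regimes and can disagree with a correct analytical gradient, so such a failure warrants a rerun before being treated as real.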
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml b/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml
deleted file mode 100644
index 188bd6e..0000000
--- a/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml
+++ /dev/null
@@ -1,172 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Max Pooling layer.
- *
- * This implementation is intended to be a simple, reference version.
- */
-
-forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] out, int Hout, int Wout) {
- /*
- * Computes the forward pass for a 2D spatial max pooling layer.
- * The input data has N examples, each represented as a 3D volume
- * unrolled into a single vector.
- *
- * This implementation is intended to be a simple, reference version.
- *
- * Inputs:
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * A typical value is 0.
- * - padw: Padding for left and right sides.
- * A typical value is 0.
- *
- * Outputs:
- * - out: Outputs, of shape (N, C*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- */
- N = nrow(X)
- Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
- Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
- # Create output volume
- out = matrix(0, rows=N, cols=C*Hout*Wout)
-
- # Max pooling
- parfor (n in 1:N, check=0) { # all examples
- Xn = matrix(X[n,], rows=C, cols=Hin*Win)
-
- # Pad image
- pad_value = -1/0  # -infinity, so padded cells can never win the max
- Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # filled with -inf
- parfor (c in 1:C) {
- Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice c reshaped
- Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
- Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
- Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape
- }
- img = Xn_padded # shape (C, (Hin+2*padh)*(Win+2*padw))
-
- parfor (c in 1:C, check=0) { # all channels
- img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
- parfor (hout in 1:Hout, check=0) { # all output rows
- hin = (hout-1) * strideh + 1
- parfor (wout in 1:Wout, check=0) { # all output columns
- win = (wout-1) * stridew + 1
- out[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout] = max(img_slice[hin:hin+Hf-1, win:win+Wf-1])
- }
- }
- }
- }
-}
-
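As a concrete check of the output-shape formula in `forward` (the numbers are
illustrative, not from the file): with Hin = Win = 4, Hf = Wf = 2,
strideh = stridew = 2, and padh = padw = 0, we get
Hout = floor((4 + 0 - 2)/2 + 1) = 2 and likewise Wout = 2, so each channel's
4x4 slice is reduced to a 2x2 grid of patch maxima and `out` has C*2*2 columns
per example.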
-backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
- int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] dX) {
- /*
- * Computes the backward pass for a 2D spatial max pooling layer.
- * The input data has N examples, each represented as a 3D volume
- * unrolled into a single vector.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of
- * shape (N, C*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * A typical value is 0.
- * - padw: Padding for left and right sides.
- * A typical value is 0.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
- */
- N = nrow(X)
-
- # Create gradient volume
- dX = matrix(0, rows=N, cols=C*Hin*Win)
-
- # Gradient of max pooling
- for (n in 1:N) { # all examples
- Xn = matrix(X[n,], rows=C, cols=Hin*Win)
-
- # Pad image
- pad_value = -1/0  # -infinity, so padded cells can never win the max
- Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # filled with -inf
- parfor (c in 1:C) {
- Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice c reshaped
- Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
- Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
- Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape
- }
- img = Xn_padded
-
- dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
- for (c in 1:C) { # all channels
- img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
- dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
- for (hout in 1:Hout) { # all output rows; plain `for`, since patches may overlap and writes accumulate
- hin = (hout-1) * strideh + 1
- for (wout in 1:Wout) { # all output columns
- win = (wout-1) * stridew + 1
- img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
- max_val_ind = img_slice_patch == max(img_slice_patch) # max value indicator matrix
- # gradient passes through only for the max value(s) in this patch
- dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
- dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1] + dimg_slice_patch
- }
- }
- dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
- }
-
- # Unpad derivs on input
- dXn = matrix(0, rows=C, cols=Hin*Win)
- parfor (c in 1:C, check=0) {
- dXn_padded_slice = matrix(dimg[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
- dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
- dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
- }
- dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
- }
-}
-
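The gradient routing in `backward` hinges on the indicator matrix
`max_val_ind`: within each pooled patch, only the cell(s) equal to the patch
maximum receive the upstream gradient. A minimal illustration with made-up
numbers (not part of the file):

    patch = matrix("1 3 2 4", rows=2, cols=2)  # one 2x2 pooling window
    max_val_ind = patch == max(patch)          # flags only the cell holding 4
    dpatch = max_val_ind * 0.5                 # an upstream gradient of 0.5 flows to that cell alone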
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/run_tests.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/run_tests.dml b/scripts/staging/SystemML-NN/nn/test/run_tests.dml
deleted file mode 100644
index d8173a9..0000000
--- a/scripts/staging/SystemML-NN/nn/test/run_tests.dml
+++ /dev/null
@@ -1,90 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Script to run tests.
- */
-source("nn/test/grad_check.dml") as grad_check
-source("nn/test/test.dml") as test
-
-print("")
-print("Starting grad checks.")
-print("---")
-
-# Loss & loss-related functions
-grad_check::cross_entropy_loss()
-grad_check::l1_loss()
-grad_check::l1_reg()
-grad_check::l2_loss()
-grad_check::l2_reg()
-grad_check::log_loss()
-print("")
-
-# Core layers
-grad_check::affine()
-grad_check::batch_norm1d()
-grad_check::batch_norm2d()
-grad_check::conv2d()
-grad_check::conv2d_builtin()
-grad_check::conv2d_simple()
-grad_check::dropout()
-grad_check::lstm()
-grad_check::max_pool2d()
-grad_check::max_pool2d_builtin()
-grad_check::max_pool2d_simple()
-grad_check::relu()
-grad_check::rnn()
-grad_check::scale_shift1d()
-grad_check::scale_shift2d()
-grad_check::sigmoid()
-grad_check::softmax()
-grad_check::tanh()
-print("")
-
-# Example model
-grad_check::two_layer_affine_l2_net()
-print("")
-
-print("---")
-print("Grad checks complete -- look for any ERRORs or WARNINGs.")
-print("If any tests involving ReLUs failed, try a few times " +
- "to ensure that they were not false negatives due to " +
- "kinks being crossed.")
-print("")
-
-print("")
-print("Starting other tests.")
-print("---")
-
-test::batch_norm1d()
-test::batch_norm2d()
-test::conv2d()
-test::cross_entropy_loss()
-test::im2col()
-test::max_pool2d()
-test::padding()
-test::tanh()
-
-print("---")
-print("Other tests complete -- look for any ERRORs or WARNINGs.")
-print("")
-print("")
-
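When iterating on a single layer, it can be handy to run just one check rather
than the full suite above; a minimal sketch, relying only on the same source
statement this script already uses:

    source("nn/test/grad_check.dml") as grad_check
    grad_check::max_pool2d_simple()

The full script itself is normally handed to the SystemML command line (e.g.
via the `-f` option), though the exact launch command depends on the
deployment.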