Posted to commits@systemml.apache.org by du...@apache.org on 2017/09/20 20:31:29 UTC

systemml git commit: [SYSTEMML-1921] Bug in backward of recurrent layers when only returning final output

Repository: systemml
Updated Branches:
  refs/heads/master a134997e6 -> 3acf786d4


[SYSTEMML-1921] Bug in backward of recurrent layers when only returning final output

This fixes a typo in the `backward` functions of the recurrent layers
that caused an error when `given_sequences` is false.  Specifically,
when only the final time-step's output is requested from the forward
functions, `out` and `dout` have shape `(N, M)`, and within the
`backward` functions, `dout` is padded with zeros for the earlier
time-steps to enlarge it to shape `(N, T*M)`.  While the intent of the
logic and the associated comments was correct, the padding mistakenly
used `D` columns per time-step instead of `M`, yielding shape
`(N, (T-1)*D + M)` instead of `(N, T*M)`.
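
As a minimal sketch of the corrected padding (hypothetical toy sizes,
not taken from the repo), the enlargement performed in `backward` when
only the final time-step's gradient is given looks like:

    N = 2  # num examples (hypothetical)
    T = 3  # num timesteps (hypothetical)
    M = 4  # num neurons (hypothetical)
    dout = rand(rows=N, cols=M)  # gradient wrt the final time-step's output, shape (N, M)
    # prepend zero gradients for the earlier time-steps; the padding must use M columns
    # per time-step (the fix), not D, so that the result has shape (N, T*M)
    dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout)
    print("ncol(dout) = " + ncol(dout) + ", T*M = " + (T*M))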

This also updates the associated gradient checks to run the same tests
with `return_sequences = FALSE`.
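
As a rough sketch of the second case (a standalone snippet assuming the
`nn/` source paths used elsewhere by grad_check.dml; the lstm check is
analogous but also carries the cell state `c`/`dc`):

    source("nn/layers/rnn.dml") as rnn
    source("nn/layers/l2_loss.dml") as l2_loss
    N = 3   # num examples
    D = 5   # num features
    T = 15  # num timesteps
    M = 10  # num neurons
    X = rand(rows=N, cols=T*D)
    out0 = rand(rows=N, cols=M)
    [W, b, dummy] = rnn::init(N, D, M)
    return_seq = FALSE
    y = rand(rows=N, cols=M)  # target for the final time-step's output only
    [out, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)  # out: (N, M)
    dout = l2_loss::backward(out, y)  # dout: (N, M)
    [dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, return_seq, out0, cache_out)

Each analytical gradient is then compared against the usual
central-difference estimate, (loss(x+h) - loss(x-h)) / (2*h) with
h = 1e-5, via `test_util::check_rel_grad_error`.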


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/3acf786d
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/3acf786d
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/3acf786d

Branch: refs/heads/master
Commit: 3acf786d4ccee15882950738e5259440de7b9a51
Parents: a134997
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Wed Sep 20 13:29:52 2017 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Wed Sep 20 13:29:52 2017 -0700

----------------------------------------------------------------------
 scripts/nn/layers/lstm.dml     |   2 +-
 scripts/nn/layers/rnn.dml      |   2 +-
 scripts/nn/test/grad_check.dml | 396 +++++++++++++++++++-----------------
 3 files changed, 212 insertions(+), 188 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/3acf786d/scripts/nn/layers/lstm.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/lstm.dml b/scripts/nn/layers/lstm.dml
index a75add4..7c21f2b 100644
--- a/scripts/nn/layers/lstm.dml
+++ b/scripts/nn/layers/lstm.dml
@@ -169,7 +169,7 @@ backward = function(matrix[double] dout, matrix[double] dc,
   dct = dc
   if (!given_sequences) {
     # only given dout for output at final timestep, so prepend empty douts for all other timesteps
-    dout = cbind(matrix(0, rows=N, cols=(T-1)*D), dout)  # shape (N, T*M)
+    dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout)  # shape (N, T*M)
   }
 
   t = T

http://git-wip-us.apache.org/repos/asf/systemml/blob/3acf786d/scripts/nn/layers/rnn.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/rnn.dml b/scripts/nn/layers/rnn.dml
index 3c6faae..a1e0bb4 100644
--- a/scripts/nn/layers/rnn.dml
+++ b/scripts/nn/layers/rnn.dml
@@ -119,7 +119,7 @@ backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, mat
   dout0 = matrix(0, rows=N, cols=M)
   if (!given_sequences) {
     # only given dout for output at final timestep, so prepend empty douts for all other timesteps
-    dout = cbind(matrix(0, rows=N, cols=(T-1)*D), dout)  # shape (N, T*M)
+    dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout)  # shape (N, T*M)
   }
 
   t = T

http://git-wip-us.apache.org/repos/asf/systemml/blob/3acf786d/scripts/nn/test/grad_check.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/grad_check.dml b/scripts/nn/test/grad_check.dml
index c969a98..41a8bc6 100644
--- a/scripts/nn/test/grad_check.dml
+++ b/scripts/nn/test/grad_check.dml
@@ -1211,138 +1211,150 @@ lstm = function() {
 
   # Generate data
   N = 3  # num examples
-  D = 10  # num features
-  T = 15  # num timesteps (sequence length)
-  M = 5 # num neurons
-  return_seq = TRUE
+  D = 5  # num features
+  T = 15 # num timesteps (sequence length)
+  M = 10 # num neurons
   X = rand(rows=N, cols=T*D)
-  y = rand(rows=N, cols=T*M)
   yc = rand(rows=N, cols=M)
   out0 = rand(rows=N, cols=M)
   c0 = rand(rows=N, cols=M)
   [W, b, dummy, dummy2] = lstm::init(N, D, M)
 
-  # Compute analytical gradients of loss wrt parameters
-  [out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-  dout = l2_loss::backward(out, y)
-  dc = l2_loss::backward(c, yc)
-  [dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, return_seq, out0, c0,
-                                            cache_out, cache_c, cache_ifog)
+  # test with (1) outputs from all timesteps, and (2) output from the final timestep
+  for (i in 1:2) {
+    if (i == 1) {
+      return_seq = TRUE
+      y = rand(rows=N, cols=T*M)
+    }
+    else {
+      return_seq = FALSE
+      y = rand(rows=N, cols=M)
+    }
 
-  # Grad check
-  h = 1e-5
-  print(" - Grad checking X.")
-  for (i in 1:nrow(X)) {
-    for (j in 1:ncol(X)) {
-      # Compute numerical derivative
-      old = as.scalar(X[i,j])
-      X[i,j] = old - h
-      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outmh = l2_loss::forward(outmh, y)
-      loss_cmh = l2_loss::forward(cmh, yc)
-      lossmh = loss_outmh + loss_cmh
-      X[i,j] = old + h
-      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outph = l2_loss::forward(outph, y)
-      loss_cph = l2_loss::forward(cph, yc)
-      lossph = loss_outph + loss_cph
-      X[i,j] = old  # reset
-      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+    print(" - Grad checking with return_seq = " + return_seq)
 
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    # Compute analytical gradients of loss wrt parameters
+    [out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+    dout = l2_loss::backward(out, y)
+    dc = l2_loss::backward(c, yc)
+    [dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, return_seq, out0, c0,
+                                              cache_out, cache_c, cache_ifog)
+
+    # Grad check
+    h = 1e-5
+    print("   - Grad checking X.")
+    for (i in 1:nrow(X)) {
+      for (j in 1:ncol(X)) {
+        # Compute numerical derivative
+        old = as.scalar(X[i,j])
+        X[i,j] = old - h
+        [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+        loss_outmh = l2_loss::forward(outmh, y)
+        loss_cmh = l2_loss::forward(cmh, yc)
+        lossmh = loss_outmh + loss_cmh
+        X[i,j] = old + h
+        [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+        loss_outph = l2_loss::forward(outph, y)
+        loss_cph = l2_loss::forward(cph, yc)
+        lossph = loss_outph + loss_cph
+        X[i,j] = old  # reset
+        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      }
     }
-  }
 
-  print(" - Grad checking W.")
-  for (i in 1:nrow(W)) {
-    for (j in 1:ncol(W)) {
-      # Compute numerical derivative
-      old = as.scalar(W[i,j])
-      W[i,j] = old - h
-      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outmh = l2_loss::forward(outmh, y)
-      loss_cmh = l2_loss::forward(cmh, yc)
-      lossmh = loss_outmh + loss_cmh
-      W[i,j] = old + h
-      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outph = l2_loss::forward(outph, y)
-      loss_cph = l2_loss::forward(cph, yc)
-      lossph = loss_outph + loss_cph
-      W[i,j] = old  # reset
-      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
+    print("   - Grad checking W.")
+    for (i in 1:nrow(W)) {
+      for (j in 1:ncol(W)) {
+        # Compute numerical derivative
+        old = as.scalar(W[i,j])
+        W[i,j] = old - h
+        [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+        loss_outmh = l2_loss::forward(outmh, y)
+        loss_cmh = l2_loss::forward(cmh, yc)
+        lossmh = loss_outmh + loss_cmh
+        W[i,j] = old + h
+        [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+        loss_outph = l2_loss::forward(outph, y)
+        loss_cph = l2_loss::forward(cph, yc)
+        lossph = loss_outph + loss_cph
+        W[i,j] = old  # reset
+        dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+      }
     }
-  }
 
-  print(" - Grad checking b.")
-  for (i in 1:nrow(b)) {
-    for (j in 1:ncol(b)) {
-      # Compute numerical derivative
-      old = as.scalar(b[i,j])
-      b[i,j] = old - h
-      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outmh = l2_loss::forward(outmh, y)
-      loss_cmh = l2_loss::forward(cmh, yc)
-      lossmh = loss_outmh + loss_cmh
-      b[i,j] = old + h
-      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outph = l2_loss::forward(outph, y)
-      loss_cph = l2_loss::forward(cph, yc)
-      lossph = loss_outph + loss_cph
-      b[i,j] = old  # reset
-      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
+    print("   - Grad checking b.")
+    for (i in 1:nrow(b)) {
+      for (j in 1:ncol(b)) {
+        # Compute numerical derivative
+        old = as.scalar(b[i,j])
+        b[i,j] = old - h
+        [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+        loss_outmh = l2_loss::forward(outmh, y)
+        loss_cmh = l2_loss::forward(cmh, yc)
+        lossmh = loss_outmh + loss_cmh
+        b[i,j] = old + h
+        [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+        loss_outph = l2_loss::forward(outph, y)
+        loss_cph = l2_loss::forward(cph, yc)
+        lossph = loss_outph + loss_cph
+        b[i,j] = old  # reset
+        db_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+      }
     }
-  }
 
-  print(" - Grad checking out0.")
-  for (i in 1:nrow(out0)) {
-    for (j in 1:ncol(out0)) {
-      # Compute numerical derivative
-      old = as.scalar(out0[i,j])
-      out0[i,j] = old - h
-      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outmh = l2_loss::forward(outmh, y)
-      loss_cmh = l2_loss::forward(cmh, yc)
-      lossmh = loss_outmh + loss_cmh
-      out0[i,j] = old + h
-      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outph = l2_loss::forward(outph, y)
-      loss_cph = l2_loss::forward(cph, yc)
-      lossph = loss_outph + loss_cph
-      out0[i,j] = old  # reset
-      dout0_num = (lossph-lossmh) / (2*h)  # numerical derivative
+    print("   - Grad checking out0.")
+    for (i in 1:nrow(out0)) {
+      for (j in 1:ncol(out0)) {
+        # Compute numerical derivative
+        old = as.scalar(out0[i,j])
+        out0[i,j] = old - h
+        [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+        loss_outmh = l2_loss::forward(outmh, y)
+        loss_cmh = l2_loss::forward(cmh, yc)
+        lossmh = loss_outmh + loss_cmh
+        out0[i,j] = old + h
+        [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+        loss_outph = l2_loss::forward(outph, y)
+        loss_cph = l2_loss::forward(cph, yc)
+        lossph = loss_outph + loss_cph
+        out0[i,j] = old  # reset
+        dout0_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
+      }
     }
-  }
 
-  print(" - Grad checking c0.")
-  for (i in 1:nrow(c0)) {
-    for (j in 1:ncol(c0)) {
-      # Compute numerical derivative
-      old = as.scalar(c0[i,j])
-      c0[i,j] = old - h
-      [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outmh = l2_loss::forward(outmh, y)
-      loss_cmh = l2_loss::forward(cmh, yc)
-      lossmh = loss_outmh + loss_cmh
-      c0[i,j] = old + h
-      [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
-      loss_outph = l2_loss::forward(outph, y)
-      loss_cph = l2_loss::forward(cph, yc)
-      lossph = loss_outph + loss_cph
-      c0[i,j] = old  # reset
-      dc0_num = (lossph-lossmh) / (2*h)  # numerical derivative
+    print("   - Grad checking c0.")
+    for (i in 1:nrow(c0)) {
+      for (j in 1:ncol(c0)) {
+        # Compute numerical derivative
+        old = as.scalar(c0[i,j])
+        c0[i,j] = old - h
+        [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+        loss_outmh = l2_loss::forward(outmh, y)
+        loss_cmh = l2_loss::forward(cmh, yc)
+        lossmh = loss_outmh + loss_cmh
+        c0[i,j] = old + h
+        [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+        loss_outph = l2_loss::forward(outph, y)
+        loss_cph = l2_loss::forward(cph, yc)
+        lossph = loss_outph + loss_cph
+        c0[i,j] = old  # reset
+        dc0_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh)
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh)
+      }
     }
   }
 }
@@ -1554,95 +1566,107 @@ rnn = function() {
 
   # Generate data
   N = 3  # num examples
-  D = 10  # num features
-  T = 15  # num timesteps (sequence length)
-  M = 5 # num neurons
-  return_seq = TRUE
+  D = 5  # num features
+  T = 15 # num timesteps (sequence length)
+  M = 10 # num neurons
   X = rand(rows=N, cols=T*D)
-  y = rand(rows=N, cols=T*M)
   out0 = rand(rows=N, cols=M)
   [W, b, dummy] = rnn::init(N, D, M)
 
-  # Compute analytical gradients of loss wrt parameters
-  [out, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-  dout = l2_loss::backward(out, y)
-  [dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, return_seq, out0, cache_out)
+  # test with (1) outputs from all timesteps, and (2) output from the final timestep
+  for (i in 1:2) {
+    if (i == 1) {
+      return_seq = TRUE
+      y = rand(rows=N, cols=T*M)
+    }
+    else {
+      return_seq = FALSE
+      y = rand(rows=N, cols=M)
+    }
 
-  # Grad check
-  h = 1e-5
-  print(" - Grad checking X.")
-  for (i in 1:nrow(X)) {
-    for (j in 1:ncol(X)) {
-      # Compute numerical derivative
-      old = as.scalar(X[i,j])
-      X[i,j] = old - h
-      [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-      lossmh = l2_loss::forward(outmh, y)
-      X[i,j] = old + h
-      [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-      lossph = l2_loss::forward(outph, y)
-      X[i,j] = old  # reset
-      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+    print(" - Grad checking with return_seq = " + return_seq)
 
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    # Compute analytical gradients of loss wrt parameters
+    [out, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+    dout = l2_loss::backward(out, y)
+    [dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, return_seq, out0, cache_out)
+
+    # Grad check
+    h = 1e-5
+    print("   - Grad checking X.")
+    for (i in 1:nrow(X)) {
+      for (j in 1:ncol(X)) {
+        # Compute numerical derivative
+        old = as.scalar(X[i,j])
+        X[i,j] = old - h
+        [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+        lossmh = l2_loss::forward(outmh, y)
+        X[i,j] = old + h
+        [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+        lossph = l2_loss::forward(outph, y)
+        X[i,j] = old  # reset
+        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative
+
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+      }
     }
-  }
 
-  print(" - Grad checking W.")
-  for (i in 1:nrow(W)) {
-    for (j in 1:ncol(W)) {
-      # Compute numerical derivative
-      old = as.scalar(W[i,j])
-      W[i,j] = old - h
-      [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-      lossmh = l2_loss::forward(outmh, y)
-      W[i,j] = old + h
-      [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-      lossph = l2_loss::forward(outph, y)
-      W[i,j] = old  # reset
-      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
+    print("   - Grad checking W.")
+    for (i in 1:nrow(W)) {
+      for (j in 1:ncol(W)) {
+        # Compute numerical derivative
+        old = as.scalar(W[i,j])
+        W[i,j] = old - h
+        [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+        lossmh = l2_loss::forward(outmh, y)
+        W[i,j] = old + h
+        [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+        lossph = l2_loss::forward(outph, y)
+        W[i,j] = old  # reset
+        dW_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+      }
     }
-  }
 
-  print(" - Grad checking b.")
-  for (i in 1:nrow(b)) {
-    for (j in 1:ncol(b)) {
-      # Compute numerical derivative
-      old = as.scalar(b[i,j])
-      b[i,j] = old - h
-      [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-      lossmh = l2_loss::forward(outmh, y)
-      b[i,j] = old + h
-      [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-      lossph = l2_loss::forward(outph, y)
-      b[i,j] = old  # reset
-      db_num = (lossph-lossmh) / (2*h)  # numerical derivative
+    print("   - Grad checking b.")
+    for (i in 1:nrow(b)) {
+      for (j in 1:ncol(b)) {
+        # Compute numerical derivative
+        old = as.scalar(b[i,j])
+        b[i,j] = old - h
+        [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+        lossmh = l2_loss::forward(outmh, y)
+        b[i,j] = old + h
+        [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+        lossph = l2_loss::forward(outph, y)
+        b[i,j] = old  # reset
+        db_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+      }
     }
-  }
 
-  print(" - Grad checking out0.")
-  for (i in 1:nrow(out0)) {
-    for (j in 1:ncol(out0)) {
-      # Compute numerical derivative
-      old = as.scalar(out0[i,j])
-      out0[i,j] = old - h
-      [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-      lossmh = l2_loss::forward(outmh, y)
-      out0[i,j] = old + h
-      [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
-      lossph = l2_loss::forward(outph, y)
-      out0[i,j] = old  # reset
-      dout0_num = (lossph-lossmh) / (2*h)  # numerical derivative
+    print("   - Grad checking out0.")
+    for (i in 1:nrow(out0)) {
+      for (j in 1:ncol(out0)) {
+        # Compute numerical derivative
+        old = as.scalar(out0[i,j])
+        out0[i,j] = old - h
+        [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+        lossmh = l2_loss::forward(outmh, y)
+        out0[i,j] = old + h
+        [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+        lossph = l2_loss::forward(outph, y)
+        out0[i,j] = old  # reset
+        dout0_num = (lossph-lossmh) / (2*h)  # numerical derivative
 
-      # Check error
-      rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
+        # Check error
+        rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
+      }
     }
   }
 }