Posted to commits@systemml.apache.org by du...@apache.org on 2017/04/26 21:42:34 UTC
[08/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn` library to `scripts/nn`
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/grad_check.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/grad_check.dml b/scripts/nn/test/grad_check.dml
new file mode 100644
index 0000000..f3bc9a7
--- /dev/null
+++ b/scripts/nn/test/grad_check.dml
@@ -0,0 +1,1769 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Gradient checks for various architectures.
+ */
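+
+/*
+ * Each check below compares the analytical gradient of a loss with respect
+ * to an input or parameter against a numerical, centered-difference
+ * approximation,
+ *
+ *   df/dx ~= (f(x+h) - f(x-h)) / (2*h),  with h = 1e-5,
+ *
+ * and reports the relative error between the two via
+ * test_util::check_rel_grad_error.
+ */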
+source("nn/layers/affine.dml") as affine
+source("nn/layers/batch_norm1d.dml") as batch_norm1d
+source("nn/layers/batch_norm2d.dml") as batch_norm2d
+source("nn/layers/conv2d.dml") as conv2d
+source("nn/layers/conv2d_builtin.dml") as conv2d_builtin
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/dropout.dml") as dropout
+source("nn/layers/l1_loss.dml") as l1_loss
+source("nn/layers/l1_reg.dml") as l1_reg
+source("nn/layers/l2_loss.dml") as l2_loss
+source("nn/layers/l2_reg.dml") as l2_reg
+source("nn/layers/log_loss.dml") as log_loss
+source("nn/layers/lstm.dml") as lstm
+source("nn/layers/max_pool2d.dml") as max_pool2d
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
+source("nn/layers/relu.dml") as relu
+source("nn/layers/rnn.dml") as rnn
+source("nn/layers/scale_shift1d.dml") as scale_shift1d
+source("nn/layers/scale_shift2d.dml") as scale_shift2d
+source("nn/layers/sigmoid.dml") as sigmoid
+source("nn/layers/softmax.dml") as softmax
+source("nn/layers/tanh.dml") as tanh
+source("nn/test/conv2d_simple.dml") as conv2d_simple
+source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
+source("nn/test/util.dml") as test_util
+
+affine = function() {
+ /*
+ * Gradient check for the affine layer.
+ */
+ print("Grad checking the affine layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ D = 100 # num features
+ M = 10 # num neurons
+ X = rand(rows=N, cols=D)
+ y = rand(rows=N, cols=M)
+ [W, b] = affine::init(D, M)
+
+ # Compute analytical gradients of loss wrt parameters
+ out = affine::forward(X, W, b)
+ dout = l2_loss::backward(out, y)
+ [dX, dW, db] = affine::backward(dout, X, W, b)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ outmh = affine::forward(X, W, b)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ outph = affine::forward(X, W, b)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking W.")
+ for (i in 1:nrow(W)) {
+ for (j in 1:ncol(W)) {
+ # Compute numerical derivative
+ old = as.scalar(W[i,j])
+ W[i,j] = old - h
+ outmh = affine::forward(X, W, b)
+ lossmh = l2_loss::forward(outmh, y)
+ W[i,j] = old + h
+ outph = affine::forward(X, W, b)
+ lossph = l2_loss::forward(outph, y)
+ W[i,j] = old # reset
+ dW_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking b.")
+ for (i in 1:nrow(b)) {
+ for (j in 1:ncol(b)) {
+ # Compute numerical derivative
+ old = as.scalar(b[i,j])
+ b[i,j] = old - h
+ outmh = affine::forward(X, W, b)
+ lossmh = l2_loss::forward(outmh, y)
+ b[i,j] = old + h
+ outph = affine::forward(X, W, b)
+ lossph = l2_loss::forward(outph, y)
+ b[i,j] = old # reset
+ db_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+ }
+ }
+}
+
+batch_norm1d = function() {
+ /*
+ * Gradient check for the 1D batch normalization layer.
+ */
+ print("Grad checking the 1D batch normalization layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ D = 100 # num features
+ mu = 0.9 # momentum
+ eps = 1e-5 # epsilon
+ X = rand(rows=N, cols=D)
+ y = rand(rows=N, cols=D)
+ gamma = rand(rows=1, cols=D)
+ beta = rand(rows=1, cols=D)
+ ema_mean = rand(rows=1, cols=D)
+ ema_var = rand(rows=1, cols=D)
+ #[dummy, dummy, ema_mean, ema_var] = batch_norm1d::init(D)
+
+ # Check training & testing modes
+ for (i in 1:2) {
+ if (i == 1)
+ mode = 'train'
+ else
+ mode = 'test'
+ print(" - Grad checking the '"+mode+"' mode.")
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ dout = l2_loss::backward(out, y)
+ [dX, dgamma, dbeta] = batch_norm1d::backward(dout, out, ema_mean_upd, ema_var_upd,
+ cache_mean, cache_var, cache_norm,
+ X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking gamma.")
+ for (i in 1:nrow(gamma)) {
+ for (j in 1:ncol(gamma)) {
+ # Compute numerical derivative
+ old = as.scalar(gamma[i,j])
+ gamma[i,j] = old - h
+ [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ lossmh = l2_loss::forward(outmh, y)
+ gamma[i,j] = old + h
+ [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ lossph = l2_loss::forward(outph, y)
+ gamma[i,j] = old # reset
+ dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+ lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking beta.")
+ for (i in 1:nrow(beta)) {
+ for (j in 1:ncol(beta)) {
+ # Compute numerical derivative
+ old = as.scalar(beta[i,j])
+ beta[i,j] = old - h
+ [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ lossmh = l2_loss::forward(outmh, y)
+ beta[i,j] = old + h
+ [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ lossph = l2_loss::forward(outph, y)
+ beta[i,j] = old # reset
+ dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+ lossph, lossmh)
+ }
+ }
+ }
+}
+
+batch_norm2d = function() {
+ /*
+ * Gradient check for the 2D (spatial) batch normalization layer.
+ */
+ print("Grad checking the 2D (spatial) batch normalization layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ C = 2 # num channels
+ Hin = 5 # input height
+ Win = 5 # input width
+ mu = 0.9 # momentum
+ eps = 1e-5 # epsilon
+ X = rand(rows=N, cols=C*Hin*Win)
+ y = rand(rows=N, cols=C*Hin*Win)
+ gamma = rand(rows=C, cols=1)
+ beta = rand(rows=C, cols=1)
+ ema_mean = rand(rows=C, cols=1)
+ ema_var = rand(rows=C, cols=1)
+ #[dummy, dummy, ema_mean, ema_var] = batch_norm2d::init(C)
+
+ # Check training & testing modes
+ for (i in 1:2) {
+ if (i == 1)
+ mode = 'train'
+ else
+ mode = 'test'
+ print(" - Grad checking the '"+mode+"' mode.")
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ dout = l2_loss::backward(out, y)
+ [dX, dgamma, dbeta] = batch_norm2d::backward(dout, out, ema_mean_upd, ema_var_upd,
+ cache_mean, cache_var, cache_norm,
+ X, gamma, beta, C, Hin, Win, mode,
+ ema_mean, ema_var, mu, eps)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking gamma.")
+ for (i in 1:nrow(gamma)) {
+ for (j in 1:ncol(gamma)) {
+ # Compute numerical derivative
+ old = as.scalar(gamma[i,j])
+ gamma[i,j] = old - h
+ [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossmh = l2_loss::forward(outmh, y)
+ gamma[i,j] = old + h
+ [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossph = l2_loss::forward(outph, y)
+ gamma[i,j] = old # reset
+ dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+ lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking beta.")
+ for (i in 1:nrow(beta)) {
+ for (j in 1:ncol(beta)) {
+ # Compute numerical derivative
+ old = as.scalar(beta[i,j])
+ beta[i,j] = old - h
+ [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossmh = l2_loss::forward(outmh, y)
+ beta[i,j] = old + h
+ [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossph = l2_loss::forward(outph, y)
+ beta[i,j] = old # reset
+ dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+ lossph, lossmh)
+ }
+ }
+ }
+}
+
+conv2d = function() {
+ /*
+ * Gradient check for the 2D convolutional layer using `im2col`.
+ */
+ print("Grad checking the `im2col` 2D convolutional layer with L2 loss.")
+
+ # Generate data
+ N = 2 # num examples
+ C = 2 # num channels
+ Hin = 5 # input height
+ Win = 5 # input width
+ F = 2 # num filters
+ Hf = 3 # filter height
+ Wf = 3 # filter width
+ stride = 1
+ pad = 1
+ X = rand(rows=N, cols=C*Hin*Win)
+ y = rand(rows=N, cols=F*Hin*Win)
+
+ # Create layers
+ [W, b] = conv2d::init(F, C, Hf, Wf)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ dout = l2_loss::backward(out, y)
+ [dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking W.")
+ for (i in 1:nrow(W)) {
+ for (j in 1:ncol(W)) {
+ # Compute numerical derivative
+ old = as.scalar(W[i,j])
+ W[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ W[i,j] = old + h
+ [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ W[i,j] = old # reset
+ dW_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking b.")
+ for (i in 1:nrow(b)) {
+ for (j in 1:ncol(b)) {
+ # Compute numerical derivative
+ old = as.scalar(b[i,j])
+ b[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ b[i,j] = old + h
+ [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ b[i,j] = old # reset
+ db_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+ }
+ }
+}
+
+conv2d_builtin = function() {
+ /*
+ * Gradient check for the 2D convolutional layer using built-in
+ * functions.
+ */
+ print("Grad checking the built-in 2D convolutional layer with L2 loss.")
+
+ # Generate data
+ N = 2 # num examples
+ C = 2 # num channels
+ Hin = 5 # input height
+ Win = 5 # input width
+ F = 2 # num filters
+ Hf = 3 # filter height
+ Wf = 3 # filter width
+ stride = 1
+ pad = 1
+ X = rand(rows=N, cols=C*Hin*Win)
+ y = rand(rows=N, cols=F*Hin*Win)
+
+ # Create layers
+ [W, b] = conv2d_builtin::init(F, C, Hf, Wf)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ dout = l2_loss::backward(out, y)
+ [dX, dW, db] = conv2d_builtin::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking W.")
+ for (i in 1:nrow(W)) {
+ for (j in 1:ncol(W)) {
+ # Compute numerical derivative
+ old = as.scalar(W[i,j])
+ W[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ W[i,j] = old + h
+ [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ W[i,j] = old # reset
+ dW_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking b.")
+ for (i in 1:nrow(b)) {
+ for (j in 1:ncol(b)) {
+ # Compute numerical derivative
+ old = as.scalar(b[i,j])
+ b[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ b[i,j] = old + h
+ [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ b[i,j] = old # reset
+ db_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+ }
+ }
+}
+
+conv2d_simple = function() {
+ /*
+ * Gradient check for the simple reference 2D convolutional layer.
+ */
+ print("Grad checking the simple reference 2D convolutional layer with L2 loss.")
+
+ # Generate data
+ N = 2 # num examples
+ C = 2 # num channels
+ Hin = 5 # input height
+ Win = 5 # input width
+ F = 2 # num filters
+ Hf = 3 # filter height
+ Wf = 3 # filter width
+ stride = 1
+ pad = 1
+ X = rand(rows=N, cols=C*Hin*Win)
+ y = rand(rows=N, cols=F*Hin*Win)
+
+ # Create layers
+ [W, b] = conv2d_simple::init(F, C, Hf, Wf)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ dout = l2_loss::backward(out, y)
+ [dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking W.")
+ for (i in 1:nrow(W)) {
+ for (j in 1:ncol(W)) {
+ # Compute numerical derivative
+ old = as.scalar(W[i,j])
+ W[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ W[i,j] = old + h
+ [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ W[i,j] = old # reset
+ dW_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking b.")
+ for (i in 1:nrow(b)) {
+ for (j in 1:ncol(b)) {
+ # Compute numerical derivative
+ old = as.scalar(b[i,j])
+ b[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ b[i,j] = old + h
+ [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ b[i,j] = old # reset
+ db_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+ }
+ }
+}
+
+cross_entropy_loss = function() {
+ /*
+ * Gradient check for the cross-entropy loss function.
+ */
+ print("Grad checking the cross-entropy loss function.")
+
+ # Generate data
+ N = 3 # num examples
+ K = 10 # num targets
+ pred = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
+ pred = pred / rowSums(pred) # normalized probs
+ y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
+ y = y / rowSums(y) # normalized probs
+
+ # Compute analytical gradient
+ dpred = cross_entropy_loss::backward(pred, y)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(pred)) {
+ for (j in 1:ncol(pred)) {
+ # Compute numerical derivative
+ old = as.scalar(pred[i,j])
+ pred[i,j] = old - h
+ lossmh = cross_entropy_loss::forward(pred, y)
+ pred[i,j] = old + h
+ lossph = cross_entropy_loss::forward(pred, y)
+ pred[i,j] = old # reset pred[i,j]
+ dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+ }
+ }
+}
+
+dropout = function() {
+ /*
+ * Gradient check for the (inverted) dropout layer.
+ */
+ print("Grad checking the (inverted) dropout layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ M = 100 # num neurons
+ p = 0.5 # probability of dropping neuron output
+ seed = as.integer(floor(as.scalar(rand(rows=1, cols=1, min=1, max=100000)))) # random seed
+ X = rand(rows=N, cols=M)
+ y = rand(rows=N, cols=M)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, mask] = dropout::forward(X, p, seed)
+ dout = l2_loss::backward(out, y)
+ dX = dropout::backward(dout, X, p, mask)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, mask] = dropout::forward(X, p, seed)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, mask] = dropout::forward(X, p, seed)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+}
+
+l1_loss = function() {
+ /*
+ * Gradient check for the L1 loss function.
+ */
+ print("Grad checking the L1 loss function.")
+
+ # Generate data
+ N = 3 # num examples
+ D = 2 # num targets
+ pred = rand(rows=N, cols=D)
+ y = rand(rows=N, cols=D)
+
+ # Compute analytical gradient
+ dpred = l1_loss::backward(pred, y)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(pred)) {
+ for (j in 1:ncol(pred)) {
+ # Compute numerical derivative
+ old = as.scalar(pred[i,j])
+ pred[i,j] = old - h
+ lossmh = l1_loss::forward(pred, y)
+ pred[i,j] = old + h
+ lossph = l1_loss::forward(pred, y)
+ pred[i,j] = old # reset pred[i,j]
+ dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+ }
+ }
+}
+
+l1_reg = function() {
+ /*
+ * Gradient check for the L1 regularization function.
+ */
+ print("Grad checking the L1 regularization function.")
+
+ # Generate data
+ D = 5 # num features
+ M = 3 # num neurons
+ lambda = 0.01
+ W = rand(rows=D, cols=M)
+
+ # Compute analytical gradient
+ dW = l1_reg::backward(W, lambda)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(W)) {
+ for (j in 1:ncol(W)) {
+ # Compute numerical derivative
+ old = as.scalar(W[i,j])
+ W[i,j] = old - h
+ reg_lossmh = l1_reg::forward(W, lambda)
+ W[i,j] = old + h
+ reg_lossph = l1_reg::forward(W, lambda)
+ W[i,j] = old # reset W[i,j]
+ dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
+ reg_lossph, reg_lossmh)
+ }
+ }
+}
+
+l2_loss = function() {
+ /*
+ * Gradient check for the L2 loss function.
+ */
+ print("Grad checking the L2 loss function.")
+
+ # Generate data
+ N = 3 # num examples
+ D = 2 # num targets
+ pred = rand(rows=N, cols=D)
+ y = rand(rows=N, cols=D)
+
+ # Compute analytical gradient
+ dpred = l2_loss::backward(pred, y)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(pred)) {
+ for (j in 1:ncol(pred)) {
+ # Compute numerical derivative
+ old = as.scalar(pred[i,j])
+ pred[i,j] = old - h
+ lossmh = l2_loss::forward(pred, y)
+ pred[i,j] = old + h
+ lossph = l2_loss::forward(pred, y)
+ pred[i,j] = old # reset pred[i,j]
+ dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+ }
+ }
+}
+
+l2_reg = function() {
+ /*
+ * Gradient check for the L2 regularization function.
+ */
+ print("Grad checking the L2 regularization function.")
+
+ # Generate data
+ D = 5 # num features
+ M = 3 # num neurons
+ lambda = 0.01
+ W = rand(rows=D, cols=M)
+
+ # Compute analytical gradient
+ dW = l2_reg::backward(W, lambda)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(W)) {
+ for (j in 1:ncol(W)) {
+ # Compute numerical derivative
+ old = as.scalar(W[i,j])
+ W[i,j] = old - h
+ reg_lossmh = l2_reg::forward(W, lambda)
+ W[i,j] = old + h
+ reg_lossph = l2_reg::forward(W, lambda)
+ W[i,j] = old # reset W[i,j]
+ dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
+ reg_lossph, reg_lossmh)
+ }
+ }
+}
+
+log_loss = function() {
+ /*
+ * Gradient check for the log loss function.
+ */
+ print("Grad checking the log loss function.")
+
+ # Generate data
+ N = 20 # num examples
+ D = 1 # num targets
+ pred = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
+ y = round(rand(rows=N, cols=D, min=0, max=1, pdf="uniform"))
+
+ # Compute analytical gradient
+ dpred = log_loss::backward(pred, y)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(pred)) {
+ for (j in 1:ncol(pred)) {
+ # Compute numerical derivative
+ old = as.scalar(pred[i,j])
+ pred[i,j] = old - h
+ lossmh = log_loss::forward(pred, y)
+ pred[i,j] = old + h
+ lossph = log_loss::forward(pred, y)
+ pred[i,j] = old # reset pred[i,j]
+ dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+ }
+ }
+}
+
+lstm = function() {
+ /*
+ * Gradient check for the LSTM layer.
+ */
+ print("Grad checking the LSTM layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ D = 10 # num features
+ T = 15 # num timesteps (sequence length)
+ M = 5 # num neurons
+ return_seq = TRUE
+ X = rand(rows=N, cols=T*D)
+ y = rand(rows=N, cols=T*M)
+ yc = rand(rows=N, cols=M)
+ out0 = rand(rows=N, cols=M)
+ c0 = rand(rows=N, cols=M)
+ [W, b, dummy, dummy2] = lstm::init(N, D, M)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ dout = l2_loss::backward(out, y)
+ dc = l2_loss::backward(c, yc)
+ [dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, return_seq, out0, c0,
+ cache_out, cache_c, cache_ifog)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outmh = l2_loss::forward(outmh, y)
+ loss_cmh = l2_loss::forward(cmh, yc)
+ lossmh = loss_outmh + loss_cmh
+ X[i,j] = old + h
+ [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outph = l2_loss::forward(outph, y)
+ loss_cph = l2_loss::forward(cph, yc)
+ lossph = loss_outph + loss_cph
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking W.")
+ for (i in 1:nrow(W)) {
+ for (j in 1:ncol(W)) {
+ # Compute numerical derivative
+ old = as.scalar(W[i,j])
+ W[i,j] = old - h
+ [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outmh = l2_loss::forward(outmh, y)
+ loss_cmh = l2_loss::forward(cmh, yc)
+ lossmh = loss_outmh + loss_cmh
+ W[i,j] = old + h
+ [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outph = l2_loss::forward(outph, y)
+ loss_cph = l2_loss::forward(cph, yc)
+ lossph = loss_outph + loss_cph
+ W[i,j] = old # reset
+ dW_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking b.")
+ for (i in 1:nrow(b)) {
+ for (j in 1:ncol(b)) {
+ # Compute numerical derivative
+ old = as.scalar(b[i,j])
+ b[i,j] = old - h
+ [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outmh = l2_loss::forward(outmh, y)
+ loss_cmh = l2_loss::forward(cmh, yc)
+ lossmh = loss_outmh + loss_cmh
+ b[i,j] = old + h
+ [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outph = l2_loss::forward(outph, y)
+ loss_cph = l2_loss::forward(cph, yc)
+ lossph = loss_outph + loss_cph
+ b[i,j] = old # reset
+ db_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking out0.")
+ for (i in 1:nrow(out0)) {
+ for (j in 1:ncol(out0)) {
+ # Compute numerical derivative
+ old = as.scalar(out0[i,j])
+ out0[i,j] = old - h
+ [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outmh = l2_loss::forward(outmh, y)
+ loss_cmh = l2_loss::forward(cmh, yc)
+ lossmh = loss_outmh + loss_cmh
+ out0[i,j] = old + h
+ [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outph = l2_loss::forward(outph, y)
+ loss_cph = l2_loss::forward(cph, yc)
+ lossph = loss_outph + loss_cph
+ out0[i,j] = old # reset
+ dout0_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking c0.")
+ for (i in 1:nrow(c0)) {
+ for (j in 1:ncol(c0)) {
+ # Compute numerical derivative
+ old = as.scalar(c0[i,j])
+ c0[i,j] = old - h
+ [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outmh = l2_loss::forward(outmh, y)
+ loss_cmh = l2_loss::forward(cmh, yc)
+ lossmh = loss_outmh + loss_cmh
+ c0[i,j] = old + h
+ [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outph = l2_loss::forward(outph, y)
+ loss_cph = l2_loss::forward(cph, yc)
+ lossph = loss_outph + loss_cph
+ c0[i,j] = old # reset
+ dc0_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh)
+ }
+ }
+}
+
+max_pool2d = function() {
+ /*
+ * Gradient check for the 2D max pooling layer.
+ */
+ print("Grad checking the 2D max pooling layer with L2 loss.")
+
+ # Generate data
+ N = 2 # num examples
+ C = 2 # num channels
+ Hin = 4 # input height
+ Win = 4 # input width
+ Hf = 2 # pool filter height
+ Wf = 2 # pool filter width
+ stride = 2
+ X = rand(rows=N, cols=C*Hin*Win)
+
+ for (pad in 0:1) {
+ print(" - Grad checking w/ pad="+pad+".")
+ Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
+ Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
+ y = rand(rows=N, cols=C*Hout*Wout)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ dout = l2_loss::backward(out, y)
+ dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+ }
+}
+
+max_pool2d_builtin = function() {
+ /*
+ * Gradient check for the built-in 2D max pooling layer.
+ */
+ print("Grad checking the built-in 2D max pooling layer with L2 loss.")
+
+ # Generate data
+ N = 2 # num examples
+ C = 2 # num channels
+ Hin = 4 # input height
+ Win = 4 # input width
+ Hf = 2 # pool filter height
+ Wf = 2 # pool filter width
+ stride = 2
+ X = rand(rows=N, cols=C*Hin*Win)
+
+ for (pad in 0:1) {
+ print(" - Grad checking w/ pad="+pad+".")
+ Hout = as.integer(floor((Hin + 2 * pad - Hf) / stride + 1))
+ Wout = as.integer(floor((Win + 2 * pad - Wf) / stride + 1))
+ y = rand(rows=N, cols=C*Hout*Wout)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ dout = l2_loss::backward(out, y)
+ dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+ }
+}
+
+max_pool2d_simple = function() {
+ /*
+ * Gradient check for the simple reference 2D max pooling layer.
+ */
+ print("Grad checking the simple reference 2D max pooling layer with L2 loss.")
+
+ # Generate data
+ N = 2 # num examples
+ C = 2 # num channels
+ Hin = 4 # input height
+ Win = 4 # input width
+ Hf = 2 # pool filter height
+ Wf = 2 # pool filter width
+ stride = 2
+ X = rand(rows=N, cols=C*Hin*Win)
+
+ for (pad in 0:1) {
+ print(" - Grad checking w/ pad="+pad+".")
+ Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
+ Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
+ y = rand(rows=N, cols=C*Hout*Wout)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ dout = l2_loss::backward(out, y)
+ dX = max_pool2d_simple::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+ }
+}
+
+relu = function() {
+ /*
+ * Gradient check for the ReLU nonlinearity layer.
+ *
+ * NOTE: This could result in a false-negative in which the test
+ * fails due to a kink being crossed in the nonlinearity. This
+ * occurs when the tests, f(x-h) and f(x+h), end up on opposite
+ * sides of the zero threshold of max(0, x). For now, just run
+ * the tests again. In the future, we can explicitly check for
+ * this and rerun the test automatically.
+ */
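+ # For example, if an entry of X is x = 1e-6 with h = 1e-5, then f(x-h) is
+ # evaluated in the flat region of max(0, x) while f(x+h) is in the linear
+ # region, so the centered difference no longer matches the analytical
+ # gradient at x and the check can fail even though the layer is correct.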
+ print("Grad checking the ReLU nonlinearity layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ M = 10 # num neurons
+ X = rand(rows=N, cols=M, min=-5, max=5)
+ y = rand(rows=N, cols=M)
+
+ # Compute analytical gradients of loss wrt parameters
+ out = relu::forward(X)
+ dout = l2_loss::backward(out, y)
+ dX = relu::backward(dout, X)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ outmh = relu::forward(X)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ outph = relu::forward(X)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+}
+
+rnn = function() {
+ /*
+ * Gradient check for the simple RNN layer.
+ */
+ print("Grad checking the simple RNN layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ D = 10 # num features
+ T = 15 # num timesteps (sequence length)
+ M = 5 # num neurons
+ return_seq = TRUE
+ X = rand(rows=N, cols=T*D)
+ y = rand(rows=N, cols=T*M)
+ out0 = rand(rows=N, cols=M)
+ [W, b, dummy] = rnn::init(N, D, M)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ dout = l2_loss::backward(out, y)
+ [dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, return_seq, out0, cache_out)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking W.")
+ for (i in 1:nrow(W)) {
+ for (j in 1:ncol(W)) {
+ # Compute numerical derivative
+ old = as.scalar(W[i,j])
+ W[i,j] = old - h
+ [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ lossmh = l2_loss::forward(outmh, y)
+ W[i,j] = old + h
+ [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ lossph = l2_loss::forward(outph, y)
+ W[i,j] = old # reset
+ dW_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking b.")
+ for (i in 1:nrow(b)) {
+ for (j in 1:ncol(b)) {
+ # Compute numerical derivative
+ old = as.scalar(b[i,j])
+ b[i,j] = old - h
+ [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ lossmh = l2_loss::forward(outmh, y)
+ b[i,j] = old + h
+ [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ lossph = l2_loss::forward(outph, y)
+ b[i,j] = old # reset
+ db_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking out0.")
+ for (i in 1:nrow(out0)) {
+ for (j in 1:ncol(out0)) {
+ # Compute numerical derivative
+ old = as.scalar(out0[i,j])
+ out0[i,j] = old - h
+ [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ lossmh = l2_loss::forward(outmh, y)
+ out0[i,j] = old + h
+ [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ lossph = l2_loss::forward(outph, y)
+ out0[i,j] = old # reset
+ dout0_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
+ }
+ }
+}
+
+scale_shift1d = function() {
+ /*
+ * Gradient check for the 1D scale & shift layer.
+ */
+ print("Grad checking the 1D scale & shift layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ D = 100 # num features
+ X = rand(rows=N, cols=D)
+ y = rand(rows=N, cols=D)
+ [gamma, beta] = scale_shift1d::init(D)
+
+ # Compute analytical gradients of loss wrt parameters
+ out = scale_shift1d::forward(X, gamma, beta)
+ dout = l2_loss::backward(out, y)
+ [dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ outmh = scale_shift1d::forward(X, gamma, beta)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ outph = scale_shift1d::forward(X, gamma, beta)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking gamma.")
+ for (i in 1:nrow(gamma)) {
+ for (j in 1:ncol(gamma)) {
+ # Compute numerical derivative
+ old = as.scalar(gamma[i,j])
+ gamma[i,j] = old - h
+ outmh = scale_shift1d::forward(X, gamma, beta)
+ lossmh = l2_loss::forward(outmh, y)
+ gamma[i,j] = old + h
+ outph = scale_shift1d::forward(X, gamma, beta)
+ lossph = l2_loss::forward(outph, y)
+ gamma[i,j] = old # reset
+ dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+ lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking beta.")
+ for (i in 1:nrow(beta)) {
+ for (j in 1:ncol(beta)) {
+ # Compute numerical derivative
+ old = as.scalar(beta[i,j])
+ beta[i,j] = old - h
+ outmh = scale_shift1d::forward(X, gamma, beta)
+ lossmh = l2_loss::forward(outmh, y)
+ beta[i,j] = old + h
+ outph = scale_shift1d::forward(X, gamma, beta)
+ lossph = l2_loss::forward(outph, y)
+ beta[i,j] = old # reset
+ dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+ lossph, lossmh)
+ }
+ }
+}
+
+scale_shift2d = function() {
+ /*
+ * Gradient check for the 2D scale & shift layer.
+ */
+ print("Grad checking the 2D scale & shift layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ C = 2 # num channels
+ Hin = 5 # input height
+ Win = 5 # input width
+ X = rand(rows=N, cols=C*Hin*Win)
+ y = rand(rows=N, cols=C*Hin*Win)
+ [gamma, beta] = scale_shift2d::init(C)
+
+ # Compute analytical gradients of loss wrt parameters
+ out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ dout = l2_loss::backward(out, y)
+ [dX, dgamma, dbeta] = scale_shift2d::backward(dout, out, X, gamma, beta, C, Hin, Win)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking gamma.")
+ for (i in 1:nrow(gamma)) {
+ for (j in 1:ncol(gamma)) {
+ # Compute numerical derivative
+ old = as.scalar(gamma[i,j])
+ gamma[i,j] = old - h
+ outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossmh = l2_loss::forward(outmh, y)
+ gamma[i,j] = old + h
+ outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossph = l2_loss::forward(outph, y)
+ gamma[i,j] = old # reset
+ dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+ lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking beta.")
+ for (i in 1:nrow(beta)) {
+ for (j in 1:ncol(beta)) {
+ # Compute numerical derivative
+ old = as.scalar(beta[i,j])
+ beta[i,j] = old - h
+ outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossmh = l2_loss::forward(outmh, y)
+ beta[i,j] = old + h
+ outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossph = l2_loss::forward(outph, y)
+ beta[i,j] = old # reset
+ dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+ lossph, lossmh)
+ }
+ }
+}
+
+sigmoid = function() {
+ /*
+ * Gradient check for the sigmoid nonlinearity layer.
+ */
+ print("Grad checking the sigmoid nonlinearity layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ M = 10 # num neurons
+ X = rand(rows=N, cols=M)
+ y = rand(rows=N, cols=M)
+
+ # Compute analytical gradients of loss wrt parameters
+ out = sigmoid::forward(X)
+ dout = l2_loss::backward(out, y)
+ dX = sigmoid::backward(dout, X)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ outmh = sigmoid::forward(X)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ outph = sigmoid::forward(X)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+}
+
+softmax = function() {
+ /*
+ * Gradient check for the softmax layer.
+ */
+ print("Grad checking the softmax layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ D = 10 # num classes
+ X = rand(rows=N, cols=D)
+ y = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
+ y = y / rowSums(y)
+
+ # Compute analytical gradients of loss wrt parameters
+ out = softmax::forward(X)
+ dout = l2_loss::backward(out, y)
+ dX = softmax::backward(dout, X)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ outmh = softmax::forward(X)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ outph = softmax::forward(X)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+}
+
+tanh = function() {
+ /*
+ * Gradient check for the hyperbolic tangent (tanh) nonlinearity
+ * layer.
+ */
+ print("Grad checking the tanh nonlinearity layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ M = 10 # num neurons
+ X = rand(rows=N, cols=M)
+ y = rand(rows=N, cols=M)
+
+ # Compute analytical gradients of loss wrt parameters
+ out = tanh::forward(X)
+ dout = l2_loss::backward(out, y)
+ dX = tanh::backward(dout, X)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ outmh = tanh::forward(X)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ outph = tanh::forward(X)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+}
+
+two_layer_affine_l2_net = function() {
+ /*
+ * Gradient check for a two-layer, fully-connected, feed-forward
+ * network with ReLU nonlinearity and L2 loss.
+ *
+ * NOTE: This could result in a false-negative in which the test
+ * fails due to a kink being crossed in the ReLU nonlinearity. This
+ * occurs when the tests, f(x-h) and f(x+h), end up on opposite
+ * sides of the zero threshold of max(0, x). For now, just run
+ * the tests again. In the future, we can explicitly check for
+ * this and rerun the test automatically.
+ */
+ print("Grad checking a two-layer, fully-connected, feed-forward network with a ReLU " +
+ "nonlinearity, and an L2 loss function.")
+
+ # Generate input data
+ N = 1000 # num examples
+ D = 100 # num features
+ yD = 5 # num targets
+ X = rand(rows=N, cols=D, pdf="normal")
+ y = rand(rows=N, cols=yD)
+
+ # Create 2-layer, fully-connected network
+ M = 10 # number of hidden neurons
+ [W1, b1] = affine::init(D, M)
+ [W2, b2] = affine::init(M, yD)
+
+ # Optimize for a short "burn-in" period to move the parameters into a
+ # characteristic operating regime and unmask any real issues.
+ print(" - Burn-in:")
+ lr = 0.0001
+ decay = 0.99
+ for(i in 1:5) {
+ # Compute forward and backward passes of net
+ [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
+ print(" - L2 loss: " + loss)
+
+ # Optimize with basic SGD
+ W1 = W1 - lr * dW1
+ b1 = b1 - lr * db1
+ W2 = W2 - lr * dW2
+ b2 = b2 - lr * db2
+ lr = lr * decay
+ }
+
+ # Compute analytical gradients
+ [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:2) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old_x = as.scalar(X[i,j])
+ X[i,j] = old_x - h
+ [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ X[i,j] = old_x + h
+ [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ X[i,j] = old_x # reset X[i,j]
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking W1.")
+ for (i in 1:nrow(W1)) {
+ for (j in 1:ncol(W1)) {
+ # Compute numerical derivative
+ old_w = as.scalar(W1[i,j])
+ W1[i,j] = old_w - h
+ [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ W1[i,j] = old_w + h
+ [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ W1[i,j] = old_w # reset W1[i,j]
+ dWij_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW1[i,j]), dWij_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking W2.")
+ for (i in 1:nrow(W2)) {
+ for (j in 1:ncol(W2)) {
+ # Compute numerical derivative
+ old_w = as.scalar(W2[i,j])
+ W2[i,j] = old_w - h
+ [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ W2[i,j] = old_w + h
+ [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ W2[i,j] = old_w # reset W2[i,j]
+ dWij_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW2[i,j]), dWij_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking b1.")
+ for (i in 1:nrow(b1)) {
+ for (j in 1:ncol(b1)) {
+ # Compute numerical derivative
+ old_b = as.scalar(b1[i,j])
+ b1[i,j] = old_b - h
+ [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ b1[i,j] = old_b + h
+ [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ b1[i,j] = old_b # reset b1[i,j]
+ dbij_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(db1[i,j]), dbij_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking b2.")
+ for (i in 1:nrow(b2)) {
+ for (j in 1:ncol(b2)) {
+ # Compute numerical derivative
+ old_b = as.scalar(b2[i,j])
+ b2[i,j] = old_b - h
+ [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ b2[i,j] = old_b + h
+ [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ b2[i,j] = old_b # reset b2[i,j]
+ dbij_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(db2[i,j]), dbij_num, lossph, lossmh)
+ }
+ }
+}
+
+/*
+ * Test network with forward/backward functions.
+ */
+two_layer_affine_l2_net_run = function(matrix[double] X, matrix[double] y,
+ matrix[double] W1, matrix[double] b1,
+ matrix[double] W2, matrix[double] b2)
+ return (matrix[double] pred, double loss,
+ matrix[double] dX,
+ matrix[double] dW1, matrix[double] db1,
+ matrix[double] dW2, matrix[double] db2) {
+ # Compute forward pass
+ [loss, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+
+ # Compute backward pass
+ [dX, dpred, daout, dhout, dW1, db1, dW2, db2] =
+ two_layer_affine_l2_net_backward(X, y, pred, aout, hout, W1, b1, W2, b2)
+}
+
+two_layer_affine_l2_net_forward = function(matrix[double] X, matrix[double] y,
+ matrix[double] W1, matrix[double] b1,
+ matrix[double] W2, matrix[double] b2)
+ return (double loss, matrix[double] pred, matrix[double] aout, matrix[double] hout) {
+ # Compute forward pass
+ hout = affine::forward(X, W1, b1)
+ aout = relu::forward(hout)
+ pred = affine::forward(aout, W2, b2)
+
+ # Compute loss
+ loss = l2_loss::forward(pred, y)
+}
+
+two_layer_affine_l2_net_backward = function(matrix[double] X, matrix[double] y, matrix[double] pred,
+ matrix[double] aout, matrix[double] hout,
+ matrix[double] W1, matrix[double] b1,
+ matrix[double] W2, matrix[double] b2)
+ return (matrix[double] dX, matrix[double] dpred,
+ matrix[double] daout, matrix[double] dhout,
+ matrix[double] dW1, matrix[double] db1, matrix[double] dW2, matrix[double] db2) {
+ # Compute backward pass
+ dpred = l2_loss::backward(pred, y)
+ [daout, dW2, db2] = affine::backward(dpred, aout, W2, b2)
+ dhout = relu::backward(daout, hout)
+ [dX, dW1, db1] = affine::backward(dhout, X, W1, b1)
+}
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/max_pool2d_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/max_pool2d_simple.dml b/scripts/nn/test/max_pool2d_simple.dml
new file mode 100644
index 0000000..188bd6e
--- /dev/null
+++ b/scripts/nn/test/max_pool2d_simple.dml
@@ -0,0 +1,172 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Max Pooling layer.
+ *
+ * This implementation is intended to be a simple, reference version.
+ */
+
+forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] out, int Hout, int Wout) {
+ /*
+ * Computes the forward pass for a 2D spatial max pooling layer.
+ * The input data has N examples, each represented as a 3D volume
+ * unrolled into a single vector.
+ *
+ * This implementation is intended to be a simple, reference version.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * A typical value is 0.
+ * - padw: Padding for left and right sides.
+ * A typical value is 0.
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, C*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ */
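+ # For example, with Hin = Win = 4, Hf = Wf = 2, strideh = stridew = 2, and no
+ # padding, Hout = floor((4 + 2*0 - 2)/2 + 1) = 2 and Wout = 2, so each example
+ # shrinks from C*4*4 to C*2*2 values.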
+ N = nrow(X)
+ Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+ Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+
+ # Create output volume
+ out = matrix(0, rows=N, cols=C*Hout*Wout)
+
+ # Max pooling
+ parfor (n in 1:N, check=0) { # all examples
+ Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+
+ # Pad image
+ pad_value = -1/0 # -infinity
+ Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # pad with -inf so padding never wins the max
+ parfor (c in 1:C) {
+ Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice C reshaped
+ Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+ Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+ Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape
+ }
+ img = Xn_padded # shape (C, (Hin+2*padh)*(Win+2*padw))
+
+ parfor (c in 1:C, check=0) { # all channels
+ img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
+ parfor (hout in 1:Hout, check=0) { # all output rows
+ hin = (hout-1) * strideh + 1
+ parfor (wout in 1:Wout, check=0) { # all output columns
+ win = (wout-1) * stridew + 1
+ out[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout] = max(img_slice[hin:hin+Hf-1,
+ win:win+Wf-1])
+ }
+ }
+ }
+ }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
+ int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for a 2D spatial max pooling layer.
+ * The input data has N examples, each represented as a 3D volume
+ * unrolled into a single vector.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of
+ * shape (N, C*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * A typical value is 0.
+ * - padw: Padding for left and right sides.
+ * A typical value is 0.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ */
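+ # The upstream gradient for each output cell is routed only to the position(s)
+ # that attained the max within the corresponding pooling window (tied maxima
+ # each receive the gradient), and overlapping windows accumulate their
+ # contributions.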
+ N = nrow(X)
+
+ # Create gradient volume
+ dX = matrix(0, rows=N, cols=C*Hin*Win)
+
+ # Gradient of max pooling
+ for (n in 1:N) { # all examples
+ Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+
+ # Pad image
+ pad_value = -1/0 # -infinity
+ Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # pad with -inf so padding never wins the max
+ parfor (c in 1:C) {
+ Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice C reshaped
+ Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+ Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+ Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape
+ }
+ img = Xn_padded
+
+ dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
+ for (c in 1:C) { # all channels
+ img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
+ dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
+ for (hout in 1:Hout, check=0) { # all output rows
+ hin = (hout-1) * strideh + 1
+ for (wout in 1:Wout) { # all output columns
+ win = (wout-1) * stridew + 1
+ img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
+ max_val_ind = img_slice_patch == max(img_slice_patch) # max value indicator matrix
+ # gradient passes through only for the max value(s) in this patch
+ dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
+ dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
+ + dimg_slice_patch
+ }
+ }
+ dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
+ }
+
+ # Unpad derivs on input
+ dXn = matrix(0, rows=C, cols=Hin*Win)
+ parfor (c in 1:C, check=0) {
+ dXn_padded_slice = matrix(dimg[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
+ dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
+ dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
+ }
+ dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
+ }
+}
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/run_tests.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/run_tests.dml b/scripts/nn/test/run_tests.dml
new file mode 100644
index 0000000..d8173a9
--- /dev/null
+++ b/scripts/nn/test/run_tests.dml
@@ -0,0 +1,90 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Script to run tests.
+ */
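+
+# A typical invocation, assuming a SystemML build (SystemML.jar) and that this
+# script is launched from the `scripts` directory so that the `nn` sources
+# below resolve, might look like:
+#
+#   spark-submit SystemML.jar -f nn/test/run_tests.dml
+#
+# The exact jar path and Spark options depend on the local setup.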
+source("nn/test/grad_check.dml") as grad_check
+source("nn/test/test.dml") as test
+
+print("")
+print("Starting grad checks.")
+print("---")
+
+# Loss & loss-related functions
+grad_check::cross_entropy_loss()
+grad_check::l1_loss()
+grad_check::l1_reg()
+grad_check::l2_loss()
+grad_check::l2_reg()
+grad_check::log_loss()
+print("")
+
+# Core layers
+grad_check::affine()
+grad_check::batch_norm1d()
+grad_check::batch_norm2d()
+grad_check::conv2d()
+grad_check::conv2d_builtin()
+grad_check::conv2d_simple()
+grad_check::dropout()
+grad_check::lstm()
+grad_check::max_pool2d()
+grad_check::max_pool2d_builtin()
+grad_check::max_pool2d_simple()
+grad_check::relu()
+grad_check::rnn()
+grad_check::scale_shift1d()
+grad_check::scale_shift2d()
+grad_check::sigmoid()
+grad_check::softmax()
+grad_check::tanh()
+print("")
+
+# Example model
+grad_check::two_layer_affine_l2_net()
+print("")
+
+print("---")
+print("Grad checks complete -- look for any ERRORs or WARNINGs.")
+print("If any tests involving ReLUs failed, try a few times " +
+ "to ensure that they were not false negatives due to " +
+ "kinks being crossed.")
+print("")
+
+print("")
+print("Starting other tests.")
+print("---")
+
+test::batch_norm1d()
+test::batch_norm2d()
+test::conv2d()
+test::cross_entropy_loss()
+test::im2col()
+test::max_pool2d()
+test::padding()
+test::tanh()
+
+print("---")
+print("Other tests complete -- look for any ERRORs or WARNINGs.")
+print("")
+print("")
+