Posted to commits@systemml.apache.org by du...@apache.org on 2017/04/26 21:42:27 UTC
[01/11] incubator-systemml git commit: [SYSTEMML-1524] Move `examples` into `nn`
Repository: incubator-systemml
Updated Branches:
refs/heads/master aa2211ac0 -> 43c321d18
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/mnist_lenet.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet.dml
new file mode 100644
index 0000000..e5755c4
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet.dml
@@ -0,0 +1,331 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * MNIST LeNet Example
+ */
+# Imports
+source("nn/layers/affine.dml") as affine
+source("nn/layers/conv2d_builtin.dml") as conv2d
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/dropout.dml") as dropout
+source("nn/layers/l2_reg.dml") as l2_reg
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
+source("nn/layers/relu.dml") as relu
+source("nn/layers/softmax.dml") as softmax
+source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
+
+train = function(matrix[double] X, matrix[double] y,
+ matrix[double] X_val, matrix[double] y_val,
+ int C, int Hin, int Win, int epochs)
+ return (matrix[double] W1, matrix[double] b1,
+ matrix[double] W2, matrix[double] b2,
+ matrix[double] W3, matrix[double] b3,
+ matrix[double] W4, matrix[double] b4) {
+ /*
+ * Trains a convolutional net using the "LeNet" architecture.
+ *
+ * The input matrix, X, has N examples, each represented as a 3D
+ * volume unrolled into a single vector. The targets, y, have K
+ * classes, and are one-hot encoded.
+ *
+ * Inputs:
+ * - X: Input data matrix, of shape (N, C*Hin*Win).
+ * - y: Target matrix, of shape (N, K).
+ * - X_val: Input validation data matrix, of shape (N, C*Hin*Win).
+ * - y_val: Target validation matrix, of shape (N, K).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - epochs: Total number of full training loops over the full data set.
+ *
+ * Outputs:
+ * - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
+ * - b1: 1st layer biases vector, of shape (F1, 1).
+ * - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
+ * - b2: 2nd layer biases vector, of shape (F2, 1).
+ * - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
+ * - b3: 3rd layer biases vector, of shape (1, N3).
+ * - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
+ * - b4: 4th layer biases vector, of shape (1, K).
+ */
+ N = nrow(X)
+ K = ncol(y)
+
+ # Create network:
+ # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
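+ # e.g., for MNIST-sized inputs (C=1, Hin=Win=28, K=10), shapes flow as:
+ # (N, 1*28*28) -> conv1/pool1 -> (N, 32*14*14) -> conv2/pool2 -> (N, 64*7*7) -> affine3 -> (N, 512) -> affine4 -> (N, 10)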
+ Hf = 5 # filter height
+ Wf = 5 # filter width
+ stride = 1
+ pad = 2 # For same dimensions, (Hf - stride) / 2
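+ # e.g., Hf=5 with stride=1 gives pad = (5-1)/2 = 2, so conv outputs remain Hin x Win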
+
+ F1 = 32 # num conv filters in conv1
+ F2 = 64 # num conv filters in conv2
+ N3 = 512 # num nodes in affine3
+ # Note: affine4 has K nodes, which is equal to the number of target dimensions (num classes)
+
+ [W1, b1] = conv2d::init(F1, C, Hf, Wf) # inputs: (N, C*Hin*Win)
+ [W2, b2] = conv2d::init(F2, F1, Hf, Wf) # inputs: (N, F1*(Hin/2)*(Win/2))
+ [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3) # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
+ [W4, b4] = affine::init(N3, K) # inputs: (N, N3)
+ W4 = W4 / sqrt(2) # different initialization, since this layer feeds into softmax instead of relu
+
+ # Initialize SGD w/ Nesterov momentum optimizer
+ lr = 0.01 # learning rate
+ mu = 0.9 # momentum
+ decay = 0.95 # learning rate decay constant
+ vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
+ vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
+ vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
+ vW4 = sgd_nesterov::init(W4); vb4 = sgd_nesterov::init(b4)
+
+ # Regularization
+ lambda = 5e-04
+
+ # Optimize
+ print("Starting optimization")
+ batch_size = 64
+ iters = ceil(N / batch_size)
+ for (e in 1:epochs) {
+ for(i in 1:iters) {
+ # Get next batch
+ beg = ((i-1) * batch_size) %% N + 1
+ end = min(N, beg + batch_size - 1)
+ X_batch = X[beg:end,]
+ y_batch = y[beg:end,]
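+ # e.g., with batch_size=64: i=1 selects rows 1:64, i=2 rows 65:128, and so on;
+ # the %% N wrap-around only matters once i exceeds N/batch_size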
+
+ # Compute forward pass
+ ## layer 1: conv1 -> relu1 -> pool1
+ [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ outr1 = relu::forward(outc1)
+ [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+ strideh=2, stridew=2, padh=0, padw=0)
+ ## layer 2: conv2 -> relu2 -> pool2
+ [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
+ stride, stride, pad, pad)
+ outr2 = relu::forward(outc2)
+ [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+ strideh=2, stridew=2, padh=0, padw=0)
+ ## layer 3: affine3 -> relu3 -> dropout
+ outa3 = affine::forward(outp2, W3, b3)
+ outr3 = relu::forward(outa3)
+ [outd3, maskd3] = dropout::forward(outr3, 0.5, -1)
+ ## layer 4: affine4 -> softmax
+ outa4 = affine::forward(outd3, W4, b4)
+ probs = softmax::forward(outa4)
+
+ # Compute loss & accuracy for training & validation data every 100 iterations.
+ if (i %% 100 == 0) {
+ # Compute training loss & accuracy
+ loss_data = cross_entropy_loss::forward(probs, y_batch)
+ loss_reg_W1 = l2_reg::forward(W1, lambda)
+ loss_reg_W2 = l2_reg::forward(W2, lambda)
+ loss_reg_W3 = l2_reg::forward(W3, lambda)
+ loss_reg_W4 = l2_reg::forward(W4, lambda)
+ loss = loss_data + loss_reg_W1 + loss_reg_W2 + loss_reg_W3 + loss_reg_W4
+ accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
+
+ # Compute validation loss & accuracy
+ probs_val = predict(X_val, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
+ loss_val = cross_entropy_loss::forward(probs_val, y_val)
+ accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
+
+ # Output results
+ print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
+ + accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
+ }
+
+ # Compute data backward pass
+ ## loss:
+ dprobs = cross_entropy_loss::backward(probs, y_batch)
+ ## layer 4: affine4 -> softmax
+ douta4 = softmax::backward(dprobs, outa4)
+ [doutd3, dW4, db4] = affine::backward(douta4, outr3, W4, b4)
+ ## layer 3: affine3 -> relu3 -> dropout
+ doutr3 = dropout::backward(doutd3, outr3, 0.5, maskd3)
+ douta3 = relu::backward(doutr3, outa3)
+ [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
+ ## layer 2: conv2 -> relu2 -> pool2
+ doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+ strideh=2, stridew=2, padh=0, padw=0)
+ doutc2 = relu::backward(doutr2, outc2)
+ [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
+ Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
+ ## layer 1: conv1 -> relu1 -> pool1
+ doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+ strideh=2, stridew=2, padh=0, padw=0)
+ doutc1 = relu::backward(doutr1, outc1)
+ [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win,
+ Hf, Wf, stride, stride, pad, pad)
+
+ # Compute regularization backward pass
+ dW1_reg = l2_reg::backward(W1, lambda)
+ dW2_reg = l2_reg::backward(W2, lambda)
+ dW3_reg = l2_reg::backward(W3, lambda)
+ dW4_reg = l2_reg::backward(W4, lambda)
+ dW1 = dW1 + dW1_reg
+ dW2 = dW2 + dW2_reg
+ dW3 = dW3 + dW3_reg
+ dW4 = dW4 + dW4_reg
+
+ # Optimize with SGD w/ Nesterov momentum
+ [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1)
+ [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1)
+ [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2)
+ [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2)
+ [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3)
+ [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3)
+ [W4, vW4] = sgd_nesterov::update(W4, dW4, lr, mu, vW4)
+ [b4, vb4] = sgd_nesterov::update(b4, db4, lr, mu, vb4)
+ }
+ # Anneal momentum towards 0.999
+ #mu = mu + (0.999 - mu)/(1+epochs-e)
+ # Decay learning rate
+ lr = lr * decay
+ }
+}
+
+predict = function(matrix[double] X, int C, int Hin, int Win,
+ matrix[double] W1, matrix[double] b1,
+ matrix[double] W2, matrix[double] b2,
+ matrix[double] W3, matrix[double] b3,
+ matrix[double] W4, matrix[double] b4)
+ return (matrix[double] probs) {
+ /*
+ * Computes the class probability predictions of a convolutional
+ * net using the "LeNet" architecture.
+ *
+ * The input matrix, X, has N examples, each represented as a 3D
+ * volume unrolled into a single vector.
+ *
+ * Inputs:
+ * - X: Input data matrix, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
+ * - b1: 1st layer biases vector, of shape (F1, 1).
+ * - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
+ * - b2: 2nd layer biases vector, of shape (F2, 1).
+ * - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
+ * - b3: 3rd layer biases vector, of shape (1, N3).
+ * - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
+ * - b4: 4th layer biases vector, of shape (1, K).
+ *
+ * Outputs:
+ * - probs: Class probabilities, of shape (N, K).
+ */
+ N = nrow(X)
+
+ # Network:
+ # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
+ Hf = 5 # filter height
+ Wf = 5 # filter width
+ stride = 1
+ pad = 2 # For same dimensions, (Hf - stride) / 2
+
+ F1 = nrow(W1) # num conv filters in conv1
+ F2 = nrow(W2) # num conv filters in conv2
+ N3 = ncol(W3) # num nodes in affine3
+ K = ncol(W4) # num nodes in affine4, equal to number of target dimensions (num classes)
+
+ # Compute predictions over mini-batches
+ probs = matrix(0, rows=N, cols=K)
+ batch_size = 64
+ iters = ceil(N / batch_size)
+ for(i in 1:iters) {
+ # Get next batch
+ beg = ((i-1) * batch_size) %% N + 1
+ end = min(N, beg + batch_size - 1)
+ X_batch = X[beg:end,]
+
+ # Compute forward pass
+ ## layer 1: conv1 -> relu1 -> pool1
+ [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ outr1 = relu::forward(outc1)
+ [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+ strideh=2, stridew=2, padh=0, padw=0)
+ ## layer 2: conv2 -> relu2 -> pool2
+ [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
+ stride, stride, pad, pad)
+ outr2 = relu::forward(outc2)
+ [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+ strideh=2, stridew=2, padh=0, padw=0)
+ ## layer 3: affine3 -> relu3
+ outa3 = affine::forward(outp2, W3, b3)
+ outr3 = relu::forward(outa3)
+ ## layer 4: affine4 -> softmax
+ outa4 = affine::forward(outr3, W4, b4)
+ probs_batch = softmax::forward(outa4)
+
+ # Store predictions
+ probs[beg:end,] = probs_batch
+ }
+}
+
+eval = function(matrix[double] probs, matrix[double] y)
+ return (double loss, double accuracy) {
+ /*
+ * Evaluates a convolutional net using the "LeNet" architecture.
+ *
+ * The probs matrix contains the class probability predictions
+ * of K classes over N examples. The targets, y, have K classes,
+ * and are one-hot encoded.
+ *
+ * Inputs:
+ * - probs: Class probabilities, of shape (N, K).
+ * - y: Target matrix, of shape (N, K).
+ *
+ * Outputs:
+ * - loss: Scalar loss, of shape (1).
+ * - accuracy: Scalar accuracy, of shape (1).
+ */
+ # Compute loss & accuracy
+ loss = cross_entropy_loss::forward(probs, y)
+ correct_pred = rowIndexMax(probs) == rowIndexMax(y)
+ accuracy = mean(correct_pred)
+}
+
+generate_dummy_data = function()
+ return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
+ /*
+ * Generate a dummy dataset similar to the MNIST dataset.
+ *
+ * Outputs:
+ * - X: Input data matrix, of shape (N, D).
+ * - y: Target matrix, of shape (N, K).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ */
+ # Generate dummy input data
+ N = 1024 # num examples
+ C = 1 # num input channels
+ Hin = 28 # input height
+ Win = 28 # input width
+ K = 10 # num target classes
+ X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
+ classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
+ y = table(seq(1, N), classes) # one-hot encoding
+}
+
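A minimal end-to-end driver for the functions above might look as follows (a sketch, not part of this commit; it assumes the script is run from the directory containing `nn/` and uses the dummy-data generator in place of real MNIST images):

```
source("nn/examples/mnist_lenet.dml") as mnist_lenet

# Generate dummy train and validation sets
[X, y, C, Hin, Win] = mnist_lenet::generate_dummy_data()
[X_val, y_val, C, Hin, Win] = mnist_lenet::generate_dummy_data()

# Train for one epoch, then score the training data itself
[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, 1)
probs = mnist_lenet::predict(X, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
[loss, accuracy] = mnist_lenet::eval(probs, y)
print("Dummy-data loss: " + loss + ", accuracy: " + accuracy)
```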
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-predict.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-predict.dml
new file mode 100644
index 0000000..4c8c434
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-predict.dml
@@ -0,0 +1,77 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# MNIST Softmax - Predict
+#
+# This script computes the class probability predictions of a
+# trained softmax classifier on images of handwritten digits.
+#
+# Inputs:
+# - X: File containing training images.
+# The format is "pixel_1, pixel_2, ..., pixel_n".
+# - model_dir: Directory containing the trained weights and biases
+# of the model.
+# - out_dir: Directory to store class probability predictions for
+# each image.
+# - fmt: [DEFAULT: "csv"] File format of `X` and output predictions.
+# Options include: "csv", "mm", "text", and "binary".
+#
+# Outputs:
+# - probs: File containing class probability predictions for each
+# image.
+#
+# Data:
+# The X file should contain images of handwritten digits,
+# where each example is a 28x28 pixel image of grayscale values in
+# the range [0,255] stretched out as 784 pixels.
+#
+# Sample Invocation:
+# 1. Download images.
+#
+# For example, save images to `nn/examples/data/mnist/images.csv`.
+#
+# 2. Execute using Spark
+# ```
+# spark-submit --master local[*] --driver-memory 5G
+# --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
+# $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_softmax-predict.dml
+# -nvargs X=nn/examples/data/mnist/images.csv
+# model_dir=nn/examples/model/mnist_softmax out_dir=nn/examples/data/mnist
+# ```
+#
+source("nn/examples/mnist_softmax.dml") as mnist_softmax
+
+# Read training data
+fmt = ifdef($fmt, "csv")
+X = read($X, format=fmt)
+
+# Scale images to [0,1]
+X = X / 255.0
+
+# Read model coefficients
+W = read($model_dir+"/W")
+b = read($model_dir+"/b")
+
+# Predict classes
+probs = mnist_softmax::predict(X, W, b)
+
+# Output results
+write(probs, $out_dir+"/probs."+fmt, format=fmt)
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-train.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-train.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-train.dml
new file mode 100644
index 0000000..09970f0
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-train.dml
@@ -0,0 +1,110 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# MNIST Softmax - Train
+#
+# This script trains a softmax classifier on images of handwritten
+# digits.
+#
+# Inputs:
+# - train: File containing labeled MNIST training images.
+# The format is "label, pixel_1, pixel_2, ..., pixel_n".
+# - test: File containing labeled MNIST test images.
+# The format is "label, pixel_1, pixel_2, ..., pixel_n".
+# - out_dir: Directory to store weights and bias matrices of
+# trained model, as well as final test accuracy.
+# - fmt: [DEFAULT: "csv"] File format of `train` and `test` data.
+# Options include: "csv", "mm", "text", and "binary".
+#
+# Outputs:
+# - W: File containing the trained weights of the model.
+# - b: File containing the trained biases of the model.
+# - accuracy: File containing the final accuracy on the test data.
+#
+# Data:
+# The MNIST dataset contains labeled images of handwritten digits,
+# where each example is a 28x28 pixel image of grayscale values in
+# the range [0,255] stretched out as 784 pixels, and each label is
+# one of 10 possible digits in [0,9].
+#
+# Sample Invocation (running from the directory containing the `nn` folder):
+# 1. Download data (60,000 training examples, and 10,000 test examples)
+# ```
+# nn/examples/get_mnist_data.sh
+# ```
+#
+# 2. Execute using Spark
+# ```
+# spark-submit --master local[*] --driver-memory 10G
+# --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
+# $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_softmax-train.dml
+# -nvargs train=nn/examples/data/mnist/mnist_train.csv test=nn/examples/data/mnist/mnist_test.csv
+# epochs=1 out_dir=nn/examples/model/mnist_softmax
+# ```
+#
+source("nn/examples/mnist_softmax.dml") as mnist_softmax
+
+# Read training data
+fmt = ifdef($fmt, "csv")
+train = read($train, format=fmt)
+test = read($test, format=fmt)
+epochs = ifdef($epochs, 1)
+out_dir = ifdef($out_dir, ".")
+
+# Extract images and labels
+images = train[,2:ncol(train)]
+labels = train[,1]
+X_test = test[,2:ncol(test)]
+y_test = test[,1]
+
+# Scale images to [0,1], and one-hot encode the labels
+n = nrow(train)
+n_test = nrow(test)
+classes = 10
+images = images / 255.0
+labels = table(seq(1, n), labels+1, n, classes)
+X_test = X_test / 255.0
+y_test = table(seq(1, n_test), y_test+1, n_test, classes)
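+# e.g., a label of 3 in row i becomes a 1 in column 4 of row i of the one-hot
+# matrix, since labels lie in [0,9] but table() columns are 1-indexed (hence labels+1)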
+
+# Split into training (55,000 examples) and validation (5,000 examples)
+X = images[5001:nrow(images),]
+X_val = images[1:5000,]
+y = labels[5001:nrow(images),]
+y_val = labels[1:5000,]
+
+# Train
+[W, b] = mnist_softmax::train(X, y, X_val, y_val, epochs)
+
+# Write model out
+write(W, out_dir+"/W")
+write(b, out_dir+"/b")
+
+# Eval on test set
+probs = mnist_softmax::predict(X_test, W, b)
+[loss, accuracy] = mnist_softmax::eval(probs, y_test)
+
+# Output results
+print("Test Accuracy: " + accuracy)
+write(accuracy, out_dir+"/accuracy")
+
+print("")
+print("")
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
new file mode 100644
index 0000000..a529a12
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
@@ -0,0 +1,178 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * MNIST Softmax Example
+ */
+# Imports
+source("nn/layers/affine.dml") as affine
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/softmax.dml") as softmax
+source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
+
+train = function(matrix[double] X, matrix[double] y,
+ matrix[double] X_val, matrix[double] y_val,
+ int epochs)
+ return (matrix[double] W, matrix[double] b) {
+ /*
+ * Trains a softmax classifier.
+ *
+ * The input matrix, X, has N examples, each with D features.
+ * The targets, y, have K classes, and are one-hot encoded.
+ *
+ * Inputs:
+ * - X: Input data matrix, of shape (N, D).
+ * - y: Target matrix, of shape (N, K).
+ * - X_val: Input validation data matrix, of shape (N, D).
+ * - y_val: Target validation matrix, of shape (N, K).
+ * - epochs: Total number of full training loops over the full data set.
+ *
+ * Outputs:
+ * - W: Weights (parameters) matrix, of shape (D, K).
+ * - b: Biases vector, of shape (1, K).
+ */
+ N = nrow(X) # num examples
+ D = ncol(X) # num features
+ K = ncol(y) # num classes
+
+ # Create softmax classifier:
+ # affine -> softmax
+ [W, b] = affine::init(D, K)
+ W = W / sqrt(2.0/(D)) * sqrt(1/(D))
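+ # net effect is W / sqrt(2): rescales the affine init's variance from 2/D to 1/D,
+ # mirroring the LeNet example's adjustment for a layer feeding into softmax instead of relu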
+
+ # Initialize SGD w/ Nesterov momentum optimizer
+ lr = 0.2 # learning rate
+ mu = 0 # momentum
+ decay = 0.99 # learning rate decay constant
+ vW = sgd_nesterov::init(W) # optimizer momentum state for W
+ vb = sgd_nesterov::init(b) # optimizer momentum state for b
+
+ # Optimize
+ print("Starting optimization")
+ batch_size = 50
+ iters = 1000 #ceil(N / batch_size)
+ for (e in 1:epochs) {
+ for(i in 1:iters) {
+ # Get next batch
+ beg = ((i-1) * batch_size) %% N + 1
+ end = min(N, beg + batch_size - 1)
+ X_batch = X[beg:end,]
+ y_batch = y[beg:end,]
+
+ # Compute forward pass
+ ## affine & softmax:
+ out = affine::forward(X_batch, W, b)
+ probs = softmax::forward(out)
+
+ # Compute loss & accuracy for training & validation data
+ loss = cross_entropy_loss::forward(probs, y_batch)
+ accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
+ probs_val = predict(X_val, W, b)
+ loss_val = cross_entropy_loss::forward(probs_val, y_val)
+ accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
+ print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: " +
+ accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
+
+ # Compute backward pass
+ ## loss:
+ dprobs = cross_entropy_loss::backward(probs, y_batch)
+ ## affine & softmax:
+ dout = softmax::backward(dprobs, out)
+ [dX_batch, dW, db] = affine::backward(dout, X_batch, W, b)
+
+ # Optimize with SGD w/ Nesterov momentum
+ [W, vW] = sgd_nesterov::update(W, dW, lr, mu, vW)
+ [b, vb] = sgd_nesterov::update(b, db, lr, mu, vb)
+ }
+ # Anneal momentum towards 0.999
+ mu = mu + (0.999 - mu)/(1+epochs-e)
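+ # e.g., with epochs=3: mu goes 0 -> 0.333 -> 0.666 -> 0.999 across the three epochs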
+ # Decay learning rate
+ lr = lr * decay
+ }
+}
+
+predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
+ return (matrix[double] probs) {
+ /*
+ * Computes the class probability predictions of a softmax classifier.
+ *
+ * The input matrix, X, has N examples, each with D features.
+ *
+ * Inputs:
+ * - X: Input data matrix, of shape (N, D).
+ * - W: Weights (parameters) matrix, of shape (D, K).
+ * - b: Biases vector, of shape (1, K).
+ *
+ * Outputs:
+ * - probs: Class probabilities, of shape (N, K).
+ */
+ # Compute forward pass
+ ## affine & softmax:
+ out = affine::forward(X, W, b)
+ probs = softmax::forward(out)
+}
+
+eval = function(matrix[double] probs, matrix[double] y)
+ return (double loss, double accuracy) {
+ /*
+ * Evaluates a softmax classifier.
+ *
+ * The probs matrix contains the class probability predictions
+ * of K classes over N examples. The targets, y, have K classes,
+ * and are one-hot encoded.
+ *
+ * Inputs:
+ * - probs: Class probabilities, of shape (N, K).
+ * - y: Target matrix, of shape (N, K).
+ *
+ * Outputs:
+ * - loss: Scalar loss, of shape (1).
+ * - accuracy: Scalar accuracy, of shape (1).
+ */
+ # Compute loss & accuracy
+ loss = cross_entropy_loss::forward(probs, y)
+ correct_pred = rowIndexMax(probs) == rowIndexMax(y)
+ accuracy = mean(correct_pred)
+}
+
+generate_dummy_data = function()
+ return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
+ /*
+ * Generate a dummy dataset similar to the MNIST dataset.
+ *
+ * Outputs:
+ * - X: Input data matrix, of shape (N, D).
+ * - y: Target matrix, of shape (N, K).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ */
+ # Generate dummy input data
+ N = 1024 # num examples
+ C = 1 # num input channels
+ Hin = 28 # input height
+ Win = 28 # input width
+ T = 10 # num targets
+ X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
+ classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))
+ y = table(seq(1, N), classes) # one-hot encoding
+}
+
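As with the LeNet example, a minimal driver sketch (not part of this commit, same working-directory assumption) showing how these functions compose:

```
source("nn/examples/mnist_softmax.dml") as mnist_softmax

[X, y, C, Hin, Win] = mnist_softmax::generate_dummy_data()
[X_val, y_val, C, Hin, Win] = mnist_softmax::generate_dummy_data()

[W, b] = mnist_softmax::train(X, y, X_val, y_val, 1)
probs = mnist_softmax::predict(X, W, b)
[loss, accuracy] = mnist_softmax::eval(probs, y)
print("Dummy-data loss: " + loss + ", accuracy: " + accuracy)
```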
[03/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn` library to `scripts/nn`
Posted by du...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/test.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/test.dml b/scripts/staging/SystemML-NN/nn/test/test.dml
deleted file mode 100644
index a5cb497..0000000
--- a/scripts/staging/SystemML-NN/nn/test/test.dml
+++ /dev/null
@@ -1,549 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Various tests, not including gradient checks.
- */
-source("nn/layers/batch_norm1d.dml") as batch_norm1d
-source("nn/layers/batch_norm2d.dml") as batch_norm2d
-source("nn/layers/conv2d.dml") as conv2d
-source("nn/layers/conv2d_builtin.dml") as conv2d_builtin
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/max_pool2d.dml") as max_pool2d
-source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
-source("nn/layers/tanh.dml") as tanh
-source("nn/test/conv2d_simple.dml") as conv2d_simple
-source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
-source("nn/test/util.dml") as test_util
-source("nn/util.dml") as util
-
-batch_norm1d = function() {
- /*
- * Test for the 1D batch normalization function.
- */
- print("Testing the 1D batch normalization function.")
-
- # Generate data
- N = 4 # Number of examples
- D = 4 # Number of features
- mode = 'train' # execution mode
- mu = 0.9 # momentum of moving averages
- eps = 1e-5 # smoothing term
- X = matrix(seq(1,16), rows=N, cols=D)
-
- # Create layer
- [gamma, beta, ema_mean, ema_var] = batch_norm1d::init(D)
-
- # Forward
- [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
-
- # Equivalency check
- target = matrix("-1.34160721 -1.34160721 -1.34160733 -1.34160709
- -0.44720244 -0.44720244 -0.44720244 -0.44720232
- 0.44720244 0.44720232 0.44720244 0.44720244
- 1.34160733 1.34160721 1.34160733 1.34160733", rows=1, cols=N*D)
- out = matrix(out, rows=1, cols=N*D)
- for (i in 1:length(out)) {
- rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
- as.scalar(target[1,i]), 1e-3, 1e-4)
- }
-}
-
-conv2d = function() {
- /*
- * Test for the 2D convolution functions.
- */
- print("Testing the 2D convolution functions.")
-
- # Generate data
- N = 2 # num examples
- C = 3 # num channels
- Hin = 5 # input height
- Win = 5 # input width
- F = 2 # num filters
- Hf = 3 # filter height
- Wf = 3 # filter width
- stride = 1
- pad = 1
- X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
-
- # Create layer
- [W, b] = conv2d::init(F, C, Hf, Wf)
-
- # Forward
- [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- [out_simple, Hout_simple, Wout_simple] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf,
- stride, stride, pad, pad)
- [out_builtin, Hout_builtin, Wout_builtin] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf,
- stride, stride, pad, pad)
-
- # Equivalency check
- out = matrix(out, rows=1, cols=N*F*Hout*Wout)
- out_simple = matrix(out_simple, rows=1, cols=N*F*Hout*Wout)
- out_builtin = matrix(out_builtin, rows=1, cols=N*F*Hout*Wout)
- for (i in 1:length(out)) {
- rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
- as.scalar(out_simple[1,i]), 1e-10, 1e-12)
- rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
- as.scalar(out_builtin[1,i]), 1e-10, 1e-12)
- }
-}
-
-cross_entropy_loss = function() {
- /*
- * Test for the cross-entropy loss function.
- *
- * Here we make sure that the cross-entropy loss function does
- * not propagate `infinity` values in the case that a prediction is
- * exactly equal to 0.
- */
- print("Testing the cross-entropy loss function with zero-valued predictions.")
-
- # Generate data
- N = 3 # num examples
- K = 10 # num targets
- pred = matrix(0, rows=N, cols=K)
- y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
- y = y / rowSums(y) # normalized probs
-
- loss = cross_entropy_loss::forward(pred, y)
-
- inf = 1/0
- if (loss == inf) {
- print("ERROR: The cross-entropy loss function ouptuts infinity for all-zero predictions.")
- }
-}
-
-im2col = function() {
- /*
- * Test for the `im2col` and `col2im` functions.
- */
- print("Testing the im2col and col2im functions.")
-
- # Generate data
- C = 3 # num channels
- Hin = 5 # input height
- Win = 5 # input width
- Hf = 3 # filter height
- Wf = 3 # filter width
- stride = 2
- pad = (Hin * stride - Hin + Hf - stride) / 2
- Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
- Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
- x = rand(rows=C, cols=Hin*Win)
-
- # pad
- x_pad = util::pad_image(x, Hin, Win, pad, pad, 0)
-
- # im2col
- x_cols = util::im2col(x_pad, Hin+2*pad, Win+2*pad, Hf, Wf, stride, stride)
-
- if (ncol(x_cols) != Hout*Wout) {
- print("ERROR: im2col does not yield the correct output size: "
- + ncol(x_cols)+" (actual) vs. "+Hout*Wout+" (correct).")
- }
-
- # col2im
- x_pad2 = util::col2im(x_cols, C, Hin+2*pad, Win+2*pad, Hf, Wf, stride, stride, "none")
-
- # Equivalency check
- equivalent = test_util::all_equal(x_pad, x_pad2)
- if (!equivalent) {
- print("ERROR: im2col and then col2im does not yield the original image.")
- }
-}
-
-padding = function() {
- /*
- * Test for the `pad_image` and `unpad_image` functions.
- */
- print("Testing the padding and unpadding functions.")
-
- # Generate data
- C = 3 # num channels
- Hin = 5 # input height
- Win = 5 # input width
- pad = 3 # padding
- x = rand(rows=C, cols=Hin*Win)
-
- # Pad image
- x_pad = util::pad_image(x, Hin, Win, pad, pad, 0)
-
- # Check for padded rows & columns
- for (c in 1:C) {
- x_pad_slice = matrix(x_pad[c,], rows=Hin+2*pad, cols=Win+2*pad)
- for (i in 1:pad) {
- rowsum = sum(x_pad_slice[i,])
- colsum = sum(x_pad_slice[,i])
- if (rowsum != 0)
- print("ERROR: Padding was not applied to row " + i + ".")
- if (colsum != 0)
- print("ERROR: Padding was not applied to column " + i + ".")
- }
- }
-
- # Unpad image
- x1 = util::unpad_image(x_pad, Hin, Win, pad, pad)
-
- # Equivalency check
- equivalent = test_util::all_equal(x, x1)
- if (!equivalent) {
- print("ERROR: Padding and then unpadding does not yield the original image.")
- }
-}
-
-max_pool2d = function() {
- /*
- * Test for the 2D max pooling functions.
- */
- print("Testing the 2D max pooling functions.")
-
- # Generate data
- N = 2 # num examples
- C = 3 # num channels
- Hin = 8 # input height
- Win = 8 # input width
- Hf = 2 # filter height
- Wf = 2 # filter width
- stride = 2
- X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
-
- for (padh in 0:3) {
- for (padw in 0:3) {
- print(" - Testing w/ padh="+padh+" & padw="+padw+".")
- #if (1==1) {} # force correct printing
- #print(" - Testing forward")
- [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, padh, padw)
- [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
- stride, stride,
- padh, padw)
- [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win,
- Hf, Wf,
- stride, stride,
- padh, padw)
-
- # Equivalency check
- out = matrix(out, rows=1, cols=N*C*Hout*Wout)
- out_simple = matrix(out_simple, rows=1, cols=N*C*Hout*Wout)
- out_builtin = matrix(out_builtin, rows=1, cols=N*C*Hout*Wout)
- for (i in 1:length(out)) {
- rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
- as.scalar(out_simple[1,i]), 1e-10, 1e-12)
- rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
- as.scalar(out_builtin[1,i]), 1e-10, 1e-12)
- }
-
- #print(" - Testing backward")
- dout = rand(rows=N, cols=C*Hout*Wout, pdf="normal")
- dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
- padh, padw)
- dX_simple = max_pool2d_simple::backward(dout, Hout_simple, Wout_simple, X, C, Hin, Win,
- Hf, Wf, stride, stride, padh, padw)
- dX_builtin = max_pool2d_builtin::backward(dout, Hout_builtin, Wout_builtin, X, C, Hin, Win,
- Hf, Wf, stride, stride, padh, padw)
-
- # Equivalency check
- dX = matrix(dX, rows=1, cols=N*C*Hin*Win)
- dX_simple = matrix(dX_simple, rows=1, cols=N*C*Hin*Win)
- dX_builtin = matrix(dX_builtin, rows=1, cols=N*C*Hin*Win)
- for (i in 1:length(dX)) {
- rel_error = test_util::check_rel_error(as.scalar(dX[1,i]),
- as.scalar(dX_simple[1,i]), 1e-10, 1e-12)
- rel_error = test_util::check_rel_error(as.scalar(dX[1,i]),
- as.scalar(dX_builtin[1,i]), 1e-10, 1e-12)
- }
- }
- }
-
- # ---
- print(" - Testing for correct behavior against known answer w/ pad=0.")
- # generate data
- # -- channel 1
- # 1 2 3 4
- # 5 6 7 8
- # 9 10 11 12
- # 13 14 15 16
- # -- channel 2
- # 1 5 9 13
- # 2 6 10 14
- # 3 7 11 15
- # 4 8 12 16
- C = 2 # num channels
- Hin = 4 # input height
- Win = 4 # input width
- X = matrix(seq(1,16,1), rows=Hin, cols=Win)
- X = matrix(rbind(X, t(X)), rows=1, cols=C*Hin*Win) # C=2
- X = rbind(X, X) # n=2
- pad = 0
-
- # forward
- [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
- stride, stride, pad, pad)
- [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
- stride, stride, pad, pad)
-
- # equivalency check
- # -- channel 1
- # 6 8
- # 14 16
- # -- channel 2
- # 6 14
- # 8 16
- target = matrix("6 8 14 16 6 14 8 16", rows=1, cols=C*Hout*Wout)
- target = rbind(target, target) # n=2
- tmp = test_util::check_all_equal(out, target)
- tmp = test_util::check_all_equal(out_simple, target)
- tmp = test_util::check_all_equal(out_builtin, target)
-
- print(" - Testing for correct behavior against known answer w/ pad=1.")
- # generate data
- # -- channel 1
- # 0 0 0 0 0 0
- # 0 1 2 3 4 0
- # 0 5 6 7 8 0
- # 0 9 10 11 12 0
- # 0 13 14 15 16 0
- # 0 0 0 0 0 0
- # -- channel 2
- # 0 0 0 0 0 0
- # 0 1 5 9 13 0
- # 0 2 6 10 14 0
- # 0 3 7 11 15 0
- # 0 4 8 12 16 0
- # 0 0 0 0 0 0
- pad = 1
-
- # forward
- [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
- stride, stride, pad, pad)
- [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
- stride, stride, pad, pad)
-
- # equivalency check
- # -- channel 1
- # 1 3 4
- # 9 11 12
- # 13 15 16
- # -- channel 2
- # 1 9 13
- # 3 11 15
- # 4 12 16
- target = matrix("1 3 4 9 11 12 13 15 16 1 9 13 3 11 15 4 12 16", rows=1, cols=C*Hout*Wout)
- target = rbind(target, target) # n=2
- tmp = test_util::check_all_equal(out, target)
- tmp = test_util::check_all_equal(out_simple, target)
- tmp = test_util::check_all_equal(out_builtin, target)
-
- print(" - Testing for correct behavior against known answer w/ all negative matrix w/ pad=0.")
- # generate data
- # -- channel 1
- # -1 -2 -3 -4
- # -5 -6 -7 -8
- # -9 -10 -11 -12
- # -13 -14 -15 -16
- # -- channel 2
- # -1 -5 -9 -13
- # -2 -6 -10 -14
- # -3 -7 -11 -15
- # -4 -8 -12 -16
- X = X * -1
- pad = 0
-
- # forward
- [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
- stride, stride, pad, pad)
- [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
- stride, stride, pad, pad)
-
- # equivalency check
- # -- channel 1
- # -1 -3
- # -9 -11
- # -- channel 2
- # -1 -9
- # -3 -11
- target = matrix("-1 -3 -9 -11 -1 -9 -3 -11", rows=1, cols=C*Hout*Wout)
- target = rbind(target, target) # n=2
- tmp = test_util::check_all_equal(out, target)
- tmp = test_util::check_all_equal(out_simple, target)
- tmp = test_util::check_all_equal(out_builtin, target)
-
-
- print(" - Testing for correct behavior against known answer w/ all negative matrix w/ pad=1.")
- # generate data
- # -- channel 1
- # 0 0 0 0 0 0
- # 0 -1 -2 -3 -4 0
- # 0 -5 -6 -7 -8 0
- # 0 -9 -10 -11 -12 0
- # 0 -13 -14 -15 -16 0
- # 0 0 0 0 0 0
- # -- channel 2
- # 0 0 0 0 0 0
- # 0 -1 -5 -9 -13 0
- # 0 -2 -6 -10 -14 0
- # 0 -3 -7 -11 -15 0
- # 0 -4 -8 -12 -16 0
- # 0 0 0 0 0 0
- pad = 1
-
- # forward
- [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
- stride, stride, pad, pad)
- [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
- stride, stride, pad, pad)
-
- # equivalency check
- # -- channel 1
- #  -1  -2  -4
- #  -5  -6  -8
- # -13 -14 -16
- # -- channel 2
- #  -1  -5 -13
- #  -2  -6 -14
- #  -4  -8 -16
- target = matrix("-1 -2 -4 -5 -6 -8 -13 -14 -16 -1 -5 -13 -2 -6 -14 -4 -8 -16",
- rows=1, cols=C*Hout*Wout)
- target = rbind(target, target) # n=2
- tmp = test_util::check_all_equal(out, target)
- tmp = test_util::check_all_equal(out_simple, target)
- tmp = test_util::check_all_equal(out_builtin, target)
-}
-
-batch_norm2d = function() {
- /*
- * Test for the 2D (spatial) batch normalization function.
- */
- print("Testing the 2D (spatial) batch normalization function.")
-
- # Generate data
- N = 2 # Number of examples
- C = 3 # num channels
- Hin = 4 # input height
- Win = 5 # input width
- mode = 'train' # execution mode
- mu = 0.9 # momentum of moving averages
- eps = 1e-5 # smoothing term
- X = matrix("70 29 23 55 72
- 42 98 68 48 39
- 34 73 44 6 40
- 74 18 18 53 53
-
- 63 85 72 61 72
- 32 36 23 29 63
- 9 43 43 49 43
- 31 43 89 94 50
-
- 62 12 32 41 87
- 25 48 99 52 61
- 12 83 60 55 34
- 30 42 68 88 51
-
-
- 67 59 62 67 84
- 8 76 24 19 57
- 10 89 63 72 2
- 59 56 16 15 70
-
- 32 69 55 39 93
- 84 36 4 30 40
- 70 100 36 76 59
- 69 15 40 24 34
-
- 51 67 11 13 32
- 66 85 55 85 38
- 32 35 17 83 34
- 55 58 52 0 99", rows=N, cols=C*Hin*Win)
-
- # Create layer
- [gamma, beta, ema_mean, ema_var] = batch_norm2d::init(C)
-
- # Forward
- [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
-
- # Equivalency check
- target = matrix("0.86215019 -0.76679718 -1.00517964 0.26619387 0.94161105
- -0.25030172 1.97460198 0.78268933 -0.01191914 -0.36949289
- -0.56814504 0.98134136 -0.17084086 -1.68059683 -0.32976246
- 1.02107191 -1.20383179 -1.20383179 0.18673301 0.18673301
-
- 0.50426388 1.41921711 0.87856293 0.42108631 0.87856293
- -0.78498828 -0.61863315 -1.15928721 -0.90975463 0.50426388
- -1.74153018 -0.32751167 -0.32751167 -0.07797909 -0.32751167
- -0.82657707 -0.32751167 1.58557224 1.79351616 -0.0363903
-
- 0.4607178 -1.49978399 -0.71558321 -0.36269283 1.44096887
- -0.99005347 -0.08822262 1.91148913 0.06861746 0.42150795
- -1.49978399 1.28412855 0.38229787 0.18624771 -0.63716316
- -0.79400325 -0.32348287 0.69597805 1.48017895 0.0294075
-
-
- 0.74295878 0.42511559 0.54430676 0.74295878 1.41837597
- -1.60113597 1.10053277 -0.96544927 -1.16410136 0.34565473
- -1.52167511 1.61702824 0.5840373 0.94161105 -1.83951855
- 0.42511559 0.30592418 -1.28329265 -1.32302308 0.86215019
-
- -0.78498828 0.75379658 0.17155361 -0.4938668 1.75192738
- 1.37762833 -0.61863315 -1.9494741 -0.86816585 -0.45227802
- 0.79538536 2.04304862 -0.61863315 1.04491806 0.33790874
- 0.75379658 -1.49199748 -0.45227802 -1.11769855 -0.70181072
-
- 0.0294075 0.65676796 -1.53899395 -1.46057391 -0.71558321
- 0.61755812 1.36254871 0.18624771 1.36254871 -0.48032296
- -0.71558321 -0.59795308 -1.30373383 1.28412855 -0.63716316
- 0.18624771 0.30387771 0.06861746 -1.97030437 1.91148913",
- rows=1, cols=N*C*Hin*Win)
- out = matrix(out, rows=1, cols=N*C*Hin*Win)
- for (i in 1:length(out)) {
- rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
- as.scalar(target[1,i]), 1e-3, 1e-4)
- }
-}
-
-tanh = function() {
- /*
- * Test for the `tanh` forward function.
- */
- print("Testing the tanh forward function.")
-
- # Generate data
- N = 2 # num examples
- C = 3 # num channels
- X = rand(rows=N, cols=C, pdf="normal")
-
- out = tanh::forward(X)
- out_ref = (exp(X) - exp(-X)) / (exp(X) + exp(-X))
-
- # Equivalency check
- for (i in 1:nrow(out)) {
- for (j in 1:ncol(out)) {
- rel_error = test_util::check_rel_error(as.scalar(out[i,j]), as.scalar(out_ref[i,j]),
- 1e-10, 1e-12)
- }
- }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/util.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/util.dml b/scripts/staging/SystemML-NN/nn/test/util.dml
deleted file mode 100644
index e32a885..0000000
--- a/scripts/staging/SystemML-NN/nn/test/util.dml
+++ /dev/null
@@ -1,155 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Test utility functions.
- */
-
-all_equal = function(matrix[double] X1, matrix[double] X2)
- return(boolean equivalent) {
- /*
- * Determine if two matrices are equivalent.
- *
- * Inputs:
- * - X1: Inputs, of shape (any, any).
- * - X2: Inputs, of same shape as X1.
- *
- * Outputs:
- * - equivalent: Whether or not the two matrices are equivalent.
- */
- equivalent = as.logical(prod(X1 == X2))
-}
-
-check_all_equal = function(matrix[double] X1, matrix[double] X2)
- return(boolean equivalent) {
- /*
- * Check if two matrices are equivalent, and report any issues.
- *
- * Issues an "ERROR" statement if elements of the two matrices are
- * not equal.
- *
- * Inputs:
- * - X1: Inputs, of shape (any, any).
- * - X2: Inputs, of same shape as X1.
- *
- * Outputs:
- * - equivalent: Whether or not the two matrices are equivalent.
- */
- # Determine if matrices are equivalent
- equivalent = all_equal(X1, X2)
-
- # Evaluate relative error
- if (!equivalent) {
- print("ERROR: The two matrices are not equivalent.")
- }
-}
-
-compute_rel_error = function(double x1, double x2)
- return (double rel_error) {
- /*
- * Relative error measure between two values.
- *
- * Uses smoothing to avoid divide-by-zero errors.
- *
- * Inputs:
- * - x1: First value.
- * - x2: Second value.
- *
- * Outputs:
- * - rel_error: Relative error measure between the two values.
- */
- rel_error = abs(x1-x2) / max(1e-8, abs(x1)+abs(x2))
-}
-
-check_rel_error = function(double x1, double x2, double thresh_error, double thresh_warn)
- return (double rel_error) {
- /*
- * Check and report any issues with the relative error measure between
- * two values.
- *
- * Issues an "ERROR" statement for relative errors > thresh_error,
- * indicating that the implementation is likely incorrect.
- *
- * Issues a "WARNING" statement for relative errors < thresh_error
- * but > thresh_warn, indicating that the implementation may be
- * incorrect.
- *
- * Inputs:
- * - x1: First value.
- * - x2: Second value.
- * - thresh_error: Error threshold.
- * - thresh_warn: Warning threshold.
- *
- * Outputs:
- * - rel_error: Relative error measure between the two values.
- */
- # Compute relative error
- rel_error = compute_rel_error(x1, x2)
-
- # Evaluate relative error
- if (rel_error > thresh_error) {
- print("ERROR: Relative error " + rel_error + " > " + thresh_error + " with " + x1 +
- " vs " + x2 + ".")
- }
- else if (rel_error > thresh_warn & rel_error <= thresh_error) {
- print("WARNING: Relative error " + rel_error + " > " + thresh_warn + " & <= " + thresh_error +
- " with " + x1 + " vs " + x2 + ".")
- }
-}
-
-check_rel_grad_error = function(double dw_a, double dw_n, double lossph, double lossmh)
- return (double rel_error) {
- /*
- * Check and report any issues with the relative error measure between
- * the analytical and numerical partial derivatives.
- *
- * - Issues an "ERROR" statement for relative errors > 1e-2,
- * indicating that the gradient is likely incorrect.
- * - Issues a "WARNING" statement for relative errors < 1e-2
- * but > 1e-4, indicating that the gradient may be incorrect.
- *
- * Inputs:
- * - dw_a: Analytical partial derivative wrt w.
- * - dw_n: Numerical partial derivative wrt w.
- * - lossph: Loss evaluated with w set to w+h.
- * - lossmh: Loss evaluated with w set to w-h.
- *
- * Outputs:
- * - rel_error: Relative error measure between the two derivatives.
- */
- # Compute relative error
- rel_error = compute_rel_error(dw_a, dw_n)
-
- # Evaluate relative error
- thresh_error = 1e-2
- thresh_warn = 1e-4
- if (rel_error > thresh_error) {
- print("ERROR: Relative error " + rel_error + " > " + thresh_error + " with " + dw_a +
- " analytical vs " + dw_n + " numerical, with lossph " + lossph +
- " and lossmh " + lossmh)
- }
- else if (rel_error > thresh_warn & rel_error <= thresh_error) {
- print("WARNING: Relative error " + rel_error + " > " + thresh_warn + " & <= " + thresh_error +
- " with " + dw_a + " analytical vs " + dw_n + " numerical, with lossph " + lossph +
- " and lossmh " + lossmh)
- }
-}
-
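For reference, a small usage sketch of the relative-error helpers deleted above (made-up values; assumes the file is sourced as `test_util`, as the tests do):

```
source("nn/test/util.dml") as test_util

out     = matrix("1.0 2.0 3.0",       rows=1, cols=3)
out_ref = matrix("1.0 2.0 3.0000001", rows=1, cols=3)
for (i in 1:length(out)) {
  # prints ERROR for rel_error > 1e-3, WARNING for 1e-4 < rel_error <= 1e-3, else stays silent
  rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
                                         as.scalar(out_ref[1,i]), 1e-3, 1e-4)
}
```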
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/util.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/util.dml b/scripts/staging/SystemML-NN/nn/util.dml
deleted file mode 100644
index 3a73f08..0000000
--- a/scripts/staging/SystemML-NN/nn/util.dml
+++ /dev/null
@@ -1,202 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Utility functions.
- */
-
-channel_sums = function(matrix[double] X, int C, int Hin, int Win)
- return (matrix[double] out) {
- /*
- * Computes a channel-wise summation over a 4D input.
- *
- * Inputs:
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- *
- * Outputs:
- * - out: Outputs, of shape (C, 1).
- */
- # Here we sum each column, reshape to (C, Hin*Win), and sum each row to result in the summation
- # for each channel.
- out = rowSums(matrix(colSums(X), rows=C, cols=Hin*Win)) # shape (C, 1)
-}
-
-im2col = function(matrix[double] img, int Hin, int Win, int Hf, int Wf, int strideh, int stridew)
- return (matrix[double] img_cols) {
- /*
- * Rearrange local image regions (patches) into columns.
- *
- * Assumes image has already been padded as necessary.
- *
- * Inputs:
- * - img: Input image, of shape (C, Hin*Win), where C is the number
- * of input channels (depth).
- * - Hin: Input height, including padding.
- * - Win: Input width, including padding.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- *
- * Outputs:
- * - img_cols: Local spatial regions (patches) of the image stretched
- * out into columns, of shape (C*Hf*Wf, Hout*Wout).
- */
- C = nrow(img)
- Hout = as.integer(floor((Hin-Hf)/strideh + 1))
- Wout = as.integer(floor((Win-Wf)/stridew + 1))
-
- # Note: We start with `img_cols` transposed to allow for row-major
- # left-indexing inside the loop, which is more performant.
- img_cols = matrix(0, rows=Hout*Wout, cols=C*Hf*Wf) # zeros
- parfor (hout in 1:Hout, check=0) { # all output rows
- hin = (hout-1)*strideh + 1
- parfor (wout in 1:Wout, check=0) { # all output columns
- win = (wout-1)*stridew + 1
- # Extract a local patch of the input image corresponding spatially to the filter sizes.
- img_patch = matrix(0, rows=C, cols=Hf*Wf) # zeros
- parfor (c in 1:C) { # all channels
- img_slice = matrix(img[c,], rows=Hin, cols=Win) # reshape
- img_patch[c,] = matrix(img_slice[hin:hin+Hf-1, win:win+Wf-1], rows=1, cols=Hf*Wf)
- }
- img_cols[(hout-1)*Wout + wout,] = t(matrix(img_patch, rows=C*Hf*Wf, cols=1)) # reshape
- }
- }
- img_cols = t(img_cols)
-}
-
-col2im = function(matrix[double] img_cols, int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, string reduction)
- return (matrix[double] img) {
- /*
- * Create an image from columns of local image regions (patches).
- *
- * The reduction strategy determines how to deal with overlapping
- * patches. If it is set to "add", any overlapping patches will be
- * added together when creating the image. This is useful when
- * computing gradients on the original image given gradients on the
- * patches. Otherwise, if "none" is provided, any overlapping
- * patches will just override previous ones when creating the image.
- * This is useful when recreating an image from the output of
- * `im2col`.
- *
- * Assumes original image was already padded as necessary.
- *
- * Inputs:
- * - img_cols: Local spatial regions (patches) of the image stretched
- * out into columns, of shape (C*Hf*Wf, Hout*Wout).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height, including padding.
- * - Win: Input width, including padding.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - reduction: The reduction strategy to use for overlapping
- * patches. Valid options are "add" and "none".
- *
- * Outputs:
- * - img: Input image, of shape (C, Hin*Win).
- */
- Hout = as.integer(floor((Hin-Hf)/strideh + 1))
- Wout = as.integer(floor((Win-Wf)/stridew + 1))
-
- img = matrix(0, rows=C, cols=Hin*Win) # zeros
- for (hout in 1:Hout) { # all output rows
- hin = (hout-1)*strideh + 1
- for (wout in 1:Wout) { # all output columns
- win = (wout-1)*stridew + 1
- # Extract the column of `img_cols` holding the patch for this output position.
- img_patch = matrix(img_cols[,(hout-1)*Wout + wout], rows=C, cols=Hf*Wf) # reshape
- parfor (c in 1:C) { # all channels
- img_patch_slice = matrix(img_patch[c,], rows=Hf, cols=Wf) # reshape
- if (reduction == "add") {
- img_slice = matrix(0, rows=Hin, cols=Win)
- img_slice[hin:hin+Hf-1, win:win+Wf-1] = img_patch_slice
- img[c,] = img[c,] + matrix(img_slice, rows=1, cols=Hin*Win)
- } else {
- img_slice = matrix(img[c,], rows=Hin, cols=Win)
- img_slice[hin:hin+Hf-1, win:win+Wf-1] = img_patch_slice
- img[c,] = matrix(img_slice, rows=1, cols=Hin*Win)
- }
- }
- }
- }
-}
-
-pad_image = function(matrix[double] img, int Hin, int Win, int padh, int padw, double pad_value)
- return (matrix[double] img_padded) {
- /*
- * Pads an image along the height and width dimensions with zeros.
- *
- * Inputs:
- * - img: Input image, of shape (C, Hin*Win), where C is the number
- * of input channels (depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - padh: Padding for top and bottom sides.
- * - padw: Padding for left and right sides.
- * - pad_value: Value to use for the padding.
- * A typical value is 0.
- *
- * Outputs:
- * - img_padded: The input image padded along the height and width
- * dimensions, of shape (C, (Hin+2*padh)*(Win+2*padw)).
- */
- C = nrow(img)
- img_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # zeros
- parfor (c in 1:C) {
- img_slice = matrix(img[c,], rows=Hin, cols=Win) # depth slice C reshaped
- img_padded_slice = matrix(pad_value, rows=Hin+2*padh, cols=Win+2*padw)
- img_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = img_slice
- img_padded[c,] = matrix(img_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape
- }
-}
-
-unpad_image = function(matrix[double] img_padded, int Hin, int Win, int padh, int padw)
- return (matrix[double] img) {
- /*
- * Unpads an image along the height and width dimensions.
- *
- * Inputs:
- * - img_padded: The input image padded along the height and width
- * dimensions, of shape (C, (Hin+2*padh)*(Win+2*padw)).
- * - Hin: Input height of unpadded image.
- * - Win: Input width of unpadded image.
- * - padh: Padding for top and bottom sides.
- * - padw: Padding for left and right sides.
- *
- * Outputs:
- * - img: Input image, of shape (C, Hin*Win), where C is the number
- * of input channels (depth).
- */
- C = nrow(img_padded)
- img = matrix(0, rows=C, cols=Hin*Win)
- parfor (c in 1:C) {
- img_padded_slice = matrix(img_padded[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
- img_slice = img_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
- img[c,] = matrix(img_slice, rows=1, cols=Hin*Win)
- }
-}
-
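[Editor's note] `pad_image` and `unpad_image` are exact inverses on the interior region; a minimal single-channel NumPy sketch of the same round trip (illustrative only):

```python
import numpy as np

def pad_image_1ch(img, padh, padw, pad_value=0.0):
    Hin, Win = img.shape
    padded = np.full((Hin + 2*padh, Win + 2*padw), pad_value)
    padded[padh:padh+Hin, padw:padw+Win] = img  # embed the original image
    return padded

def unpad_image_1ch(padded, padh, padw):
    H, W = padded.shape
    return padded[padh:H-padh, padw:W-padw]     # extract the interior

img = np.arange(12.0).reshape(3, 4)
assert np.array_equal(unpad_image_1ch(pad_image_1ch(img, 2, 1), 2, 1), img)
```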
[02/11] incubator-systemml git commit: [SYSTEMML-1524] Move
`examples` into `nn`
Posted by du...@apache.org.
[SYSTEMML-1524] Move `examples` into `nn`
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/1f5cf697
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/1f5cf697
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/1f5cf697
Branch: refs/heads/master
Commit: 1f5cf697c49313861a3bdbcc634f7a56daabdc16
Parents: aa2211a
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Wed Apr 26 14:40:40 2017 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Wed Apr 26 14:40:40 2017 -0700
----------------------------------------------------------------------
scripts/staging/SystemML-NN/README.md | 2 +-
.../examples/Example - MNIST LeNet.ipynb | 198 -----------
.../Example - MNIST Softmax Classifier.ipynb | 185 -----------
scripts/staging/SystemML-NN/examples/README.md | 75 -----
.../SystemML-NN/examples/get_mnist_data.sh | 28 --
.../examples/mnist_lenet-predict.dml | 87 -----
.../SystemML-NN/examples/mnist_lenet-train.dml | 123 -------
.../SystemML-NN/examples/mnist_lenet.dml | 331 -------------------
.../examples/mnist_softmax-predict.dml | 74 -----
.../examples/mnist_softmax-train.dml | 108 ------
.../SystemML-NN/examples/mnist_softmax.dml | 177 ----------
scripts/staging/SystemML-NN/examples/nn | 1 -
.../nn/examples/Example - MNIST LeNet.ipynb | 189 +++++++++++
.../Example - MNIST Softmax Classifier.ipynb | 179 ++++++++++
.../staging/SystemML-NN/nn/examples/README.md | 74 +++++
.../SystemML-NN/nn/examples/get_mnist_data.sh | 28 ++
.../nn/examples/mnist_lenet-predict.dml | 91 +++++
.../nn/examples/mnist_lenet-train.dml | 123 +++++++
.../SystemML-NN/nn/examples/mnist_lenet.dml | 331 +++++++++++++++++++
.../nn/examples/mnist_softmax-predict.dml | 77 +++++
.../nn/examples/mnist_softmax-train.dml | 110 ++++++
.../SystemML-NN/nn/examples/mnist_softmax.dml | 178 ++++++++++
22 files changed, 1381 insertions(+), 1388 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/README.md
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/README.md b/scripts/staging/SystemML-NN/README.md
index 3943765..b80f2c6 100644
--- a/scripts/staging/SystemML-NN/README.md
+++ b/scripts/staging/SystemML-NN/README.md
@@ -22,7 +22,7 @@ limitations under the License.
### A deep learning library for [Apache SystemML](https://github.com/apache/incubator-systemml).
## Examples:
-#### Please see the [`examples`](examples) folder for more detailed examples, or view the following two quick examples.
+#### Please see the [`examples`](nn/examples) folder for more detailed examples, or view the following two quick examples.
### Neural net for regression with vanilla SGD:
```python
# Imports
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/Example - MNIST LeNet.ipynb
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/Example - MNIST LeNet.ipynb b/scripts/staging/SystemML-NN/examples/Example - MNIST LeNet.ipynb
deleted file mode 100644
index 3ad210e..0000000
--- a/scripts/staging/SystemML-NN/examples/Example - MNIST LeNet.ipynb
+++ /dev/null
@@ -1,198 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Quick Setup"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Create a SystemML MLContext object\n",
- "from systemml import MLContext, dml\n",
- "ml = MLContext(sc)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Download Data - MNIST"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9]. Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "%%sh\n",
- "mkdir -p data/mnist/\n",
- "cd data/mnist/\n",
- "curl -O http://pjreddie.com/media/files/mnist_train.csv\n",
- "curl -O http://pjreddie.com/media/files/mnist_test.csv"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## SystemML \"LeNet\" Neural Network"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 1. Train"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "script_string = \"\"\"\n",
- "source(\"mnist_lenet.dml\") as mnist_lenet\n",
- "\n",
- "# Read training data\n",
- "data = read($data, format=\"csv\")\n",
- "n = nrow(data)\n",
- "\n",
- "# Extract images and labels\n",
- "images = data[,2:ncol(data)]\n",
- "labels = data[,1]\n",
- "\n",
- "# Scale images to [-1,1], and one-hot encode the labels\n",
- "images = (images / 255.0) * 2 - 1\n",
- "labels = table(seq(1, n), labels+1, n, 10)\n",
- "\n",
- "# Split into training (55,000 examples) and validation (5,000 examples)\n",
- "X = images[5001:nrow(images),]\n",
- "X_val = images[1:5000,]\n",
- "y = labels[5001:nrow(images),]\n",
- "y_val = labels[1:5000,]\n",
- "\n",
- "# Train\n",
- "[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win)\n",
- "\"\"\"\n",
- "script = (dml(script_string).input(\"$data\", \"data/mnist/mnist_train.csv\")\n",
- " .input(C=1, Hin=28, Win=28)\n",
- " .output(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))\n",
- "W1, b1, W2, b2, W3, b3, W4, b4 = (ml.execute(script)\n",
- " .get(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 2. Compute Test Accuracy"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "script_string = \"\"\"\n",
- "source(\"mnist_lenet.dml\") as mnist_lenet\n",
- "\n",
- "# Read test data\n",
- "data = read($data, format=\"csv\")\n",
- "n = nrow(data)\n",
- "\n",
- "# Extract images and labels\n",
- "X_test = data[,2:ncol(data)]\n",
- "y_test = data[,1]\n",
- "\n",
- "# Scale images to [-1,1], and one-hot encode the labels\n",
- "X_test = (X_test / 255.0) * 2 - 1\n",
- "y_test = table(seq(1, n), y_test+1, n, 10)\n",
- "\n",
- "# Eval on test set\n",
- "probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)\n",
- "[loss, accuracy] = mnist_lenet::eval(probs, y_test)\n",
- "\n",
- "print(\"Test Accuracy: \" + accuracy)\n",
- "\"\"\"\n",
- "script = dml(script_string).input(**{\"$data\": \"data/mnist/mnist_train.csv\",\n",
- " \"C\": 1, \"Hin\": 28, \"Win\": 28,\n",
- " \"W1\": W1, \"b1\": b1,\n",
- " \"W2\": W2, \"b2\": b2,\n",
- " \"W3\": W3, \"b3\": b3,\n",
- " \"W4\": W4, \"b4\": b4})\n",
- "ml.execute(script)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 3. Extract Model Into Spark DataFrames For Future Use"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "W1_df = W1.toDF()\n",
- "b1_df = b1.toDF()\n",
- "W2_df = W2.toDF()\n",
- "b2_df = b2.toDF()\n",
- "W3_df = W3.toDF()\n",
- "b3_df = b3.toDF()\n",
- "W4_df = W4.toDF()\n",
- "b4_df = b4.toDF()\n",
- "W1_df, b1_df, W2_df, b2_df, W3_df, b3_df, W4_df, b4_df"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.5.2"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/Example - MNIST Softmax Classifier.ipynb
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/Example - MNIST Softmax Classifier.ipynb b/scripts/staging/SystemML-NN/examples/Example - MNIST Softmax Classifier.ipynb
deleted file mode 100644
index 7f2c2f0..0000000
--- a/scripts/staging/SystemML-NN/examples/Example - MNIST Softmax Classifier.ipynb
+++ /dev/null
@@ -1,185 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Quick Setup"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false,
- "scrolled": false
- },
- "outputs": [],
- "source": [
- "# Create a SystemML MLContext object\n",
- "from systemml import MLContext, dml\n",
- "ml = MLContext(sc)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Download Data - MNIST"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9]. Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "%%sh\n",
- "mkdir -p data/mnist/\n",
- "cd data/mnist/\n",
- "curl -O http://pjreddie.com/media/files/mnist_train.csv\n",
- "curl -O http://pjreddie.com/media/files/mnist_test.csv"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## SystemML Softmax Model"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 1. Train"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "training = \"\"\"\n",
- "source(\"mnist_softmax.dml\") as mnist_softmax\n",
- "\n",
- "# Read training data\n",
- "data = read($data, format=\"csv\")\n",
- "n = nrow(data)\n",
- "\n",
- "# Extract images and labels\n",
- "images = data[,2:ncol(data)]\n",
- "labels = data[,1]\n",
- "\n",
- "# Scale images to [0,1], and one-hot encode the labels\n",
- "images = images / 255.0\n",
- "labels = table(seq(1, n), labels+1, n, 10)\n",
- "\n",
- "# Split into training (55,000 examples) and validation (5,000 examples)\n",
- "X = images[5001:nrow(images),]\n",
- "X_val = images[1:5000,]\n",
- "y = labels[5001:nrow(images),]\n",
- "y_val = labels[1:5000,]\n",
- "\n",
- "# Train\n",
- "[W, b] = mnist_softmax::train(X, y, X_val, y_val)\n",
- "\"\"\"\n",
- "script = dml(training).input(\"$data\", \"data/mnist/mnist_train.csv\").output(\"W\", \"b\")\n",
- "W, b = ml.execute(script).get(\"W\", \"b\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 2. Compute Test Accuracy"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "testing = \"\"\"\n",
- "source(\"mnist_softmax.dml\") as mnist_softmax\n",
- "\n",
- "# Read test data\n",
- "data = read($data, format=\"csv\")\n",
- "n = nrow(data)\n",
- "\n",
- "# Extract images and labels\n",
- "X_test = data[,2:ncol(data)]\n",
- "y_test = data[,1]\n",
- "\n",
- "# Scale images to [0,1], and one-hot encode the labels\n",
- "X_test = X_test / 255.0\n",
- "y_test = table(seq(1, n), y_test+1, n, 10)\n",
- "\n",
- "# Eval on test set\n",
- "probs = mnist_softmax::predict(X_test, W, b)\n",
- "[loss, accuracy] = mnist_softmax::eval(probs, y_test)\n",
- "\n",
- "print(\"Test Accuracy: \" + accuracy)\n",
- "\"\"\"\n",
- "script = dml(testing).input(\"$data\", \"data/mnist/mnist_test.csv\", W=W, b=b)\n",
- "ml.execute(script)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 3. Extract Model Into Spark DataFrames For Future Use"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "W_df = W.toDF()\n",
- "b_df = b.toDF()\n",
- "W_df, b_df"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.5.2"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/README.md
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/README.md b/scripts/staging/SystemML-NN/examples/README.md
deleted file mode 100644
index ffacea2..0000000
--- a/scripts/staging/SystemML-NN/examples/README.md
+++ /dev/null
@@ -1,75 +0,0 @@
-<!--
-{% comment %}
-Licensed to the Apache Software Foundation (ASF) under one or more
-contributor license agreements. See the NOTICE file distributed with
-this work for additional information regarding copyright ownership.
-The ASF licenses this file to you under the Apache License, Version 2.0
-(the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-{% endcomment %}
--->
-
-# SystemML-NN Examples
-
-#### This folder contains scripts and PySpark Jupyter notebooks serving as examples of using the *SystemML-NN* (`nn`) deep learning library.
-
----
-
-# Examples
-### MNIST Softmax Classifier
-
-* This example trains a softmax classifier, which is essentially a multi-class logistic regression model, on the MNIST data. The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images.
-* Notebook: `Example - MNIST Softmax Classifier.ipynb`.
-* DML Functions: `mnist_softmax.dml`
-* Training script: `mnist_softmax-train.dml`
-* Prediction script: `mnist_softmax-predict.dml`
-
-### MNIST "LeNet" Neural Net
-
-* This example trains a neural network on the MNIST data using a ["LeNet" architecture](http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf). The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images.
-* Notebook: `Example - MNIST LeNet.ipynb`.
-* DML Functions: `mnist_lenet.dml`
-* Training script: `mnist_lenet-train.dml`
-* Prediction script: `mnist_lenet-predict.dml`
-
----
-
-# Setup
-## Code
-* To run the examples, please first download and unzip the project via GitHub using the "Clone or download" button on the [homepage of the project](https://github.com/dusenberrymw/systemml-nn), *or* via the following commands:
-
- ```
- curl -LO https://github.com/dusenberrymw/systemml-nn/archive/master.zip
- unzip master.zip
- ```
-
-* Then, move into the `examples` folder via:
- ```
- cd systemml-nn-master/examples/
- ```
-
-## Data
-* These examples use the classic [MNIST](http://yann.lecun.com/exdb/mnist/) dataset, which contains labeled 28x28 pixel images of handwritten digits in the range of 0-9. There are 60,000 training images, and 10,000 testing images. Of the 60,000 training images, 5,000 will be used as validation images.
-* **Download**:
- * **Notebooks**: The data will be automatically downloaded as a step in either of the example notebooks.
- * **Training scripts**: Please run `get_mnist_data.sh` to download the data separately.
-
-## Execution
-* These examples contain scripts written in SystemML's R-like language (`*.dml`), as well as PySpark Jupyter notebooks (`*.ipynb`). The scripts contain the math for the algorithms, enclosed in functions, and the notebooks serve as full, end-to-end examples of reading in data, training models using the functions within the scripts, and evaluating final performance.
-* **Notebooks**: To run the notebook examples, please install the SystemML Python package with `pip install systemml`, and then start up Jupyter in the following manner from this directory (or for more information, please see [this great blog post](http://spark.tc/0-to-life-changing-application-with-apache-systemml/)):
-
- ```
- PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook" pyspark --master local[*] --driver-memory 3G --driver-class-path SystemML.jar --jars SystemML.jar
- ```
-
- Note that all printed output, such as training statistics, from the SystemML scripts will be sent to the terminal in which Jupyter was started (for now...).
-
-* **Scripts**: To run the scripts from the command line using `spark-submit`, please see the comments located at the top of the `-train` and `-predict` scripts.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/get_mnist_data.sh
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/get_mnist_data.sh b/scripts/staging/SystemML-NN/examples/get_mnist_data.sh
deleted file mode 100755
index deb0c40..0000000
--- a/scripts/staging/SystemML-NN/examples/get_mnist_data.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-DIR="$(cd "$(dirname "$0")" && pwd)"
-mkdir -p $DIR/data/mnist/
-cd $DIR/data/mnist/
-curl -O https://pjreddie.com/media/files/mnist_train.csv
-curl -O https://pjreddie.com/media/files/mnist_test.csv
-
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/mnist_lenet-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_lenet-predict.dml b/scripts/staging/SystemML-NN/examples/mnist_lenet-predict.dml
deleted file mode 100644
index 759418d..0000000
--- a/scripts/staging/SystemML-NN/examples/mnist_lenet-predict.dml
+++ /dev/null
@@ -1,87 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# MNIST LeNet - Predict
-#
-# This script computes the class probability predictions of a
-# trained convolutional net using the "LeNet" architecture on
-# images of handwritten digits.
-#
-# Inputs:
-# - X: File containing input images.
-# The format is "pixel_1, pixel_2, ..., pixel_n".
-# - C: Number of color channels in the images.
-# - Hin: Input image height.
-# - Win: Input image width.
-# - model_dir: Directory containing the trained weights and biases
-# of the model.
-# - out_dir: Directory to store class probability predictions for
-# each image.
-# - fmt: [DEFAULT: "csv"] File format of `X` and output predictions.
-# Options include: "csv", "mm", "text", and "binary".
-#
-# Outputs:
-# - probs: File containing class probability predictions for each
-# image.
-#
-# Data:
-# The X file should contain images of handwritten digits,
-# where each example is a 28x28 pixel image of grayscale values in
-# the range [0,255] stretched out as 784 pixels.
-#
-# Sample Invocation:
-# Execute using Spark
-# ```
-# spark-submit --master local[*] --driver-memory 5G
-# --conf spark.driver.maxResultSize=0 --conf spark.akka.frameSize=128
-# $SYSTEMML_HOME/target/SystemML.jar -f mnist_lenet-predict.dml
-# -nvargs X=data/mnist/images.csv C=1 Hin=28 Win=28
-# model_dir=model/mnist_lenet out_dir=data/mnist
-# ```
-#
-source("mnist_lenet.dml") as mnist_lenet
-
-# Read input data
-fmt = ifdef($fmt, "csv")
-X = read($X, format=fmt)
-C = $C
-Hin = $Hin
-Win = $Win
-
-# Scale images to [-1,1]
-X = (X / 255.0) * 2 - 1
-
-# Read model coefficients
-W1 = read($model_dir+"/W1")
-b1 = read($model_dir+"/b1")
-W2 = read($model_dir+"/W2")
-b2 = read($model_dir+"/b2")
-W3 = read($model_dir+"/W3")
-b3 = read($model_dir+"/b3")
-W4 = read($model_dir+"/W4")
-b4 = read($model_dir+"/b4")
-
-# Predict classes
-probs = mnist_lenet::predict(X, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
-
-# Output results
-write(probs, $out_dir+"/probs."+fmt, format=fmt)
-
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/mnist_lenet-train.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_lenet-train.dml b/scripts/staging/SystemML-NN/examples/mnist_lenet-train.dml
deleted file mode 100644
index eafb34c..0000000
--- a/scripts/staging/SystemML-NN/examples/mnist_lenet-train.dml
+++ /dev/null
@@ -1,123 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# MNIST LeNet - Train
-#
-# This script trains a convolutional net using the "LeNet" architecture
-# on images of handwritten digits.
-#
-# Inputs:
-# - train: File containing labeled MNIST training images.
-# The format is "label, pixel_1, pixel_2, ..., pixel_n".
-# - test: File containing labeled MNIST test images.
-# The format is "label, pixel_1, pixel_2, ..., pixel_n".
-# - C: Number of color channels in the images.
-# - Hin: Input image height.
-# - Win: Input image width.
-# - epochs: [DEFAULT: 10] Total number of full training loops over
-# the full data set.
-# - out_dir: [DEFAULT: "."] Directory to store weights and bias
-# matrices of trained model, as well as final test accuracy.
-# - fmt: [DEFAULT: "csv"] File format of `train` and `test` data.
-# Options include: "csv", "mm", "text", and "binary".
-#
-# Outputs:
-# - W1, W2, W3, W4: Files containing the trained weights of the model.
-# - b1, b2, b3, b4: Files containing the trained biases of the model.
-# - accuracy: File containing the final accuracy on the test data.
-#
-# Data:
-# The MNIST dataset contains labeled images of handwritten digits,
-# where each example is a 28x28 pixel image of grayscale values in
-# the range [0,255] stretched out as 784 pixels, and each label is
-# one of 10 possible digits in [0,9].
-#
-# Sample Invocation (running from within the `examples` folder):
-# 1. Download data (60,000 training examples, and 10,000 test examples)
-# ```
-# get_mnist_data.sh
-# ```
-#
-# 2. Execute using Spark
-# ```
-# spark-submit --master local[*] --driver-memory 10G
-# --conf spark.driver.maxResultSize=0 --conf spark.akka.frameSize=128
-# $SYSTEMML_HOME/target/SystemML.jar -f mnist_lenet-train.dml
-# -nvargs train=data/mnist/mnist_train.csv test=data/mnist/mnist_test.csv
-# C=1 Hin=28 Win=28 epochs=10 out_dir=model/mnist_lenet
-# ```
-#
-source("mnist_lenet.dml") as mnist_lenet
-
-# Read training data & settings
-fmt = ifdef($fmt, "csv")
-train = read($train, format=fmt)
-test = read($test, format=fmt)
-C = $C
-Hin = $Hin
-Win = $Win
-epochs = ifdef($epochs, 10)
-out_dir = ifdef($out_dir, ".")
-
-# Extract images and labels
-images = train[,2:ncol(train)]
-labels = train[,1]
-X_test = test[,2:ncol(test)]
-y_test = test[,1]
-
-# Scale images to [-1,1], and one-hot encode the labels
-n = nrow(train)
-n_test = nrow(test)
-images = (images / 255.0) * 2 - 1
-labels = table(seq(1, n), labels+1, n, 10)
-X_test = (X_test / 255.0) * 2 - 1
-y_test = table(seq(1, n_test), y_test+1, n_test, 10)
-
-# Split into training (55,000 examples) and validation (5,000 examples)
-X = images[5001:nrow(images),]
-X_val = images[1:5000,]
-y = labels[5001:nrow(images),]
-y_val = labels[1:5000,]
-
-# Train
-[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, epochs)
-
-# Write model out
-write(W1, out_dir+"/W1")
-write(b1, out_dir+"/b1")
-write(W2, out_dir+"/W2")
-write(b2, out_dir+"/b2")
-write(W3, out_dir+"/W3")
-write(b3, out_dir+"/b3")
-write(W4, out_dir+"/W4")
-write(b4, out_dir+"/b4")
-
-# Eval on test set
-probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
-[loss, accuracy] = mnist_lenet::eval(probs, y_test)
-
-# Output results
-print("Test Accuracy: " + accuracy)
-write(accuracy, out_dir+"/accuracy")
-
-print("")
-print("")
-
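[Editor's note] The preprocessing above relies on DML's `table()` to one-hot encode labels: `table(seq(1, n), labels+1, n, 10)` builds an n x 10 contingency table with a single 1 per row, where the `labels+1` accounts for DML's 1-based indexing. A rough NumPy equivalent (editor's sketch) is:

```python
import numpy as np

labels = np.array([3, 0, 9, 3])   # raw digit labels in [0, 9]
n, K = len(labels), 10
Y = np.zeros((n, K))
Y[np.arange(n), labels] = 1       # DML: table(seq(1, n), labels+1, n, 10)

X = np.random.rand(n, 784) * 255  # stand-in for raw pixel data
X = (X / 255.0) * 2 - 1           # scale pixels from [0, 255] to [-1, 1]
```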
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_lenet.dml b/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
deleted file mode 100644
index e5755c4..0000000
--- a/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
+++ /dev/null
@@ -1,331 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * MNIST LeNet Example
- */
-# Imports
-source("nn/layers/affine.dml") as affine
-source("nn/layers/conv2d_builtin.dml") as conv2d
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/dropout.dml") as dropout
-source("nn/layers/l2_reg.dml") as l2_reg
-source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
-source("nn/layers/relu.dml") as relu
-source("nn/layers/softmax.dml") as softmax
-source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
-
-train = function(matrix[double] X, matrix[double] y,
- matrix[double] X_val, matrix[double] y_val,
- int C, int Hin, int Win, int epochs)
- return (matrix[double] W1, matrix[double] b1,
- matrix[double] W2, matrix[double] b2,
- matrix[double] W3, matrix[double] b3,
- matrix[double] W4, matrix[double] b4) {
- /*
- * Trains a convolutional net using the "LeNet" architecture.
- *
- * The input matrix, X, has N examples, each represented as a 3D
- * volume unrolled into a single vector. The targets, y, have K
- * classes, and are one-hot encoded.
- *
- * Inputs:
- * - X: Input data matrix, of shape (N, C*Hin*Win).
- * - y: Target matrix, of shape (N, K).
- * - X_val: Input validation data matrix, of shape (N, C*Hin*Win).
- * - y_val: Target validation matrix, of shape (N, K).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - epochs: Total number of full training loops over the full data set.
- *
- * Outputs:
- * - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
- * - b1: 1st layer biases vector, of shape (F1, 1).
- * - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
- * - b2: 2nd layer biases vector, of shape (F2, 1).
- * - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
- * - b3: 3rd layer biases vector, of shape (1, N3).
- * - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
- * - b4: 4th layer biases vector, of shape (1, K).
- */
- N = nrow(X)
- K = ncol(y)
-
- # Create network:
- # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
- Hf = 5 # filter height
- Wf = 5 # filter width
- stride = 1
- pad = 2 # For same dimensions, (Hf - stride) / 2
-
- F1 = 32 # num conv filters in conv1
- F2 = 64 # num conv filters in conv2
- N3 = 512 # num nodes in affine3
- # Note: affine4 has K nodes, which is equal to the number of target dimensions (num classes)
-
- [W1, b1] = conv2d::init(F1, C, Hf, Wf) # inputs: (N, C*Hin*Win)
- [W2, b2] = conv2d::init(F2, F1, Hf, Wf) # inputs: (N, F1*(Hin/2)*(Win/2))
- [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3) # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
- [W4, b4] = affine::init(N3, K) # inputs: (N, N3)
- W4 = W4 / sqrt(2) # different initialization, since being fed into softmax, instead of relu
-
- # Initialize SGD w/ Nesterov momentum optimizer
- lr = 0.01 # learning rate
- mu = 0.9 #0.5 # momentum
- decay = 0.95 # learning rate decay constant
- vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
- vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
- vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
- vW4 = sgd_nesterov::init(W4); vb4 = sgd_nesterov::init(b4)
-
- # Regularization
- lambda = 5e-04
-
- # Optimize
- print("Starting optimization")
- batch_size = 64
- iters = ceil(N / batch_size)
- for (e in 1:epochs) {
- for(i in 1:iters) {
- # Get next batch
- beg = ((i-1) * batch_size) %% N + 1
- end = min(N, beg + batch_size - 1)
- X_batch = X[beg:end,]
- y_batch = y[beg:end,]
-
- # Compute forward pass
- ## layer 1: conv1 -> relu1 -> pool1
- [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- outr1 = relu::forward(outc1)
- [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
- strideh=2, stridew=2, pad=0, pad=0)
- ## layer 2: conv2 -> relu2 -> pool2
- [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
- stride, stride, pad, pad)
- outr2 = relu::forward(outc2)
- [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
- strideh=2, stridew=2, pad=0, pad=0)
- ## layer 3: affine3 -> relu3 -> dropout
- outa3 = affine::forward(outp2, W3, b3)
- outr3 = relu::forward(outa3)
- [outd3, maskd3] = dropout::forward(outr3, 0.5, -1)
- ## layer 4: affine4 -> softmax
- outa4 = affine::forward(outd3, W4, b4)
- probs = softmax::forward(outa4)
-
- # Compute loss & accuracy for training & validation data every 100 iterations.
- if (i %% 100 == 0) {
- # Compute training loss & accuracy
- loss_data = cross_entropy_loss::forward(probs, y_batch)
- loss_reg_W1 = l2_reg::forward(W1, lambda)
- loss_reg_W2 = l2_reg::forward(W2, lambda)
- loss_reg_W3 = l2_reg::forward(W3, lambda)
- loss_reg_W4 = l2_reg::forward(W4, lambda)
- loss = loss_data + loss_reg_W1 + loss_reg_W2 + loss_reg_W3 + loss_reg_W4
- accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
-
- # Compute validation loss & accuracy
- probs_val = predict(X_val, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
- loss_val = cross_entropy_loss::forward(probs_val, y_val)
- accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
-
- # Output results
- print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
- + accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
- }
-
- # Compute data backward pass
- ## loss:
- dprobs = cross_entropy_loss::backward(probs, y_batch)
- ## layer 4: affine4 -> softmax
- douta4 = softmax::backward(dprobs, outa4)
- [doutd3, dW4, db4] = affine::backward(douta4, outr3, W4, b4)
- ## layer 3: affine3 -> relu3 -> dropout
- doutr3 = dropout::backward(doutd3, outr3, 0.5, maskd3)
- douta3 = relu::backward(doutr3, outa3)
- [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
- ## layer 2: conv2 -> relu2 -> pool2
- doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
- strideh=2, stridew=2, pad=0, pad=0)
- doutc2 = relu::backward(doutr2, outc2)
- [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
- Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
- ## layer 1: conv1 -> relu1 -> pool1
- doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
- strideh=2, stridew=2, pad=0, pad=0)
- doutc1 = relu::backward(doutr1, outc1)
- [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win,
- Hf, Wf, stride, stride, pad, pad)
-
- # Compute regularization backward pass
- dW1_reg = l2_reg::backward(W1, lambda)
- dW2_reg = l2_reg::backward(W2, lambda)
- dW3_reg = l2_reg::backward(W3, lambda)
- dW4_reg = l2_reg::backward(W4, lambda)
- dW1 = dW1 + dW1_reg
- dW2 = dW2 + dW2_reg
- dW3 = dW3 + dW3_reg
- dW4 = dW4 + dW4_reg
-
- # Optimize with SGD w/ Nesterov momentum
- [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1)
- [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1)
- [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2)
- [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2)
- [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3)
- [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3)
- [W4, vW4] = sgd_nesterov::update(W4, dW4, lr, mu, vW4)
- [b4, vb4] = sgd_nesterov::update(b4, db4, lr, mu, vb4)
- }
- # Anneal momentum towards 0.999 (currently disabled)
- #mu = mu + (0.999 - mu)/(1+epochs-e)
- # Decay learning rate
- lr = lr * decay
- }
-}
-
-predict = function(matrix[double] X, int C, int Hin, int Win,
- matrix[double] W1, matrix[double] b1,
- matrix[double] W2, matrix[double] b2,
- matrix[double] W3, matrix[double] b3,
- matrix[double] W4, matrix[double] b4)
- return (matrix[double] probs) {
- /*
- * Computes the class probability predictions of a convolutional
- * net using the "LeNet" architecture.
- *
- * The input matrix, X, has N examples, each represented as a 3D
- * volume unrolled into a single vector.
- *
- * Inputs:
- * - X: Input data matrix, of shape (N, C*Hin*Win).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
- * - b1: 1st layer biases vector, of shape (F1, 1).
- * - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
- * - b2: 2nd layer biases vector, of shape (F2, 1).
- * - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
- * - b3: 3rd layer biases vector, of shape (1, N3).
- * - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
- * - b4: 4th layer biases vector, of shape (1, K).
- *
- * Outputs:
- * - probs: Class probabilities, of shape (N, K).
- */
- N = nrow(X)
-
- # Network:
- # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
- Hf = 5 # filter height
- Wf = 5 # filter width
- stride = 1
- pad = 2 # For same dimensions, (Hf - stride) / 2
-
- F1 = nrow(W1) # num conv filters in conv1
- F2 = nrow(W2) # num conv filters in conv2
- N3 = ncol(W3) # num nodes in affine3
- K = ncol(W4) # num nodes in affine4, equal to number of target dimensions (num classes)
-
- # Compute predictions over mini-batches
- probs = matrix(0, rows=N, cols=K)
- batch_size = 64
- iters = ceil(N / batch_size)
- for(i in 1:iters) {
- # Get next batch
- beg = ((i-1) * batch_size) %% N + 1
- end = min(N, beg + batch_size - 1)
- X_batch = X[beg:end,]
-
- # Compute forward pass
- ## layer 1: conv1 -> relu1 -> pool1
- [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- outr1 = relu::forward(outc1)
- [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
- strideh=2, stridew=2, pad=0, pad=0)
- ## layer 2: conv2 -> relu2 -> pool2
- [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
- stride, stride, pad, pad)
- outr2 = relu::forward(outc2)
- [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
- strideh=2, stridew=2, pad=0, pad=0)
- ## layer 3: affine3 -> relu3
- outa3 = affine::forward(outp2, W3, b3)
- outr3 = relu::forward(outa3)
- ## layer 4: affine4 -> softmax
- outa4 = affine::forward(outr3, W4, b4)
- probs_batch = softmax::forward(outa4)
-
- # Store predictions
- probs[beg:end,] = probs_batch
- }
-}
-
-eval = function(matrix[double] probs, matrix[double] y)
- return (double loss, double accuracy) {
- /*
- * Evaluates a convolutional net using the "LeNet" architecture.
- *
- * The probs matrix contains the class probability predictions
- * of K classes over N examples. The targets, y, have K classes,
- * and are one-hot encoded.
- *
- * Inputs:
- * - probs: Class probabilities, of shape (N, K).
- * - y: Target matrix, of shape (N, K).
- *
- * Outputs:
- * - loss: Scalar loss, of shape (1).
- * - accuracy: Scalar accuracy, of shape (1).
- */
- # Compute loss & accuracy
- loss = cross_entropy_loss::forward(probs, y)
- correct_pred = rowIndexMax(probs) == rowIndexMax(y)
- accuracy = mean(correct_pred)
-}
-
-generate_dummy_data = function()
- return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
- /*
- * Generate a dummy dataset similar to the MNIST dataset.
- *
- * Outputs:
- * - X: Input data matrix, of shape (N, D).
- * - y: Target matrix, of shape (N, K).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- */
- # Generate dummy input data
- N = 1024 # num examples
- C = 1 # num input channels
- Hin = 28 # input height
- Win = 28 # input width
- K = 10 # num target classes
- X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
- classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
- y = table(seq(1, N), classes) # one-hot encoding
-}
-
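[Editor's note] Two details of the script above are worth unpacking. First, the shapes: with Hf = 5 and stride = 1, pad = (Hf - stride)/2 = 2 keeps each convolution output at Hin x Win, so after the two 2x2 max-poolings the flattened input to affine3 has F2*(Hin/4)*(Win/4) columns, matching the docstrings. Second, the `sgd_nesterov::update` calls follow the common "lookahead" formulation of Nesterov momentum; the NumPy sketch below is an editor's illustration that may differ in detail from `nn/optim/sgd_nesterov.dml`, alongside the 0-indexed equivalent of the mini-batch arithmetic:

```python
import numpy as np

def sgd_nesterov_update(W, dW, lr, mu, v):
    """One Nesterov momentum step in the 'lookahead' form."""
    v_prev = v
    v = mu * v - lr * dW                 # update velocity
    W = W - mu * v_prev + (1 + mu) * v   # move against the lookahead gradient
    return W, v

# Mini-batch slicing mirroring the (1-indexed) DML loop:
N, batch_size = 1024, 64
X = np.random.randn(N, 784)
for i in range(1, int(np.ceil(N / batch_size)) + 1):
    beg = ((i - 1) * batch_size) % N     # DML: ((i-1)*batch_size) %% N + 1
    end = min(N, beg + batch_size)       # DML: min(N, beg+batch_size-1)
    X_batch = X[beg:end]                 # DML: X[beg:end,]
```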
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/mnist_softmax-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_softmax-predict.dml b/scripts/staging/SystemML-NN/examples/mnist_softmax-predict.dml
deleted file mode 100644
index 4101e27..0000000
--- a/scripts/staging/SystemML-NN/examples/mnist_softmax-predict.dml
+++ /dev/null
@@ -1,74 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# MNIST Softmax - Predict
-#
-# This script computes the class probability predictions of a
-# trained softmax classifier on images of handwritten digits.
-#
-# Inputs:
-# - X: File containing input images.
-# The format is "pixel_1, pixel_2, ..., pixel_n".
-# - model_dir: Directory containing the trained weights and biases
-# of the model.
-# - out_dir: Directory to store class probability predictions for
-# each image.
-# - fmt: [DEFAULT: "csv"] File format of `X` and output predictions.
-# Options include: "csv", "mm", "text", and "binary".
-#
-# Outputs:
-# - probs: File containing class probability predictions for each
-# image.
-#
-# Data:
-# The X file should contain images of handwritten digits,
-# where each example is a 28x28 pixel image of grayscale values in
-# the range [0,255] stretched out as 784 pixels.
-#
-# Sample Invocation:
-# Execute using Spark
-# ```
-# spark-submit --master local[*] --driver-memory 5G
-# --conf spark.driver.maxResultSize=0 --conf spark.akka.frameSize=128
-# $SYSTEMML_HOME/target/SystemML.jar -f mnist_softmax-predict.dml
-# -nvargs X=data/mnist/images.csv model_dir=model/mnist_softmax
-# out_dir=data/mnist
-# ```
-#
-source("mnist_softmax.dml") as mnist_softmax
-
-# Read input data
-fmt = ifdef($fmt, "csv")
-X = read($X, format=fmt)
-
-# Scale images to [0,1]
-X = X / 255.0
-
-# Read model coefficients
-W = read($model_dir+"/W")
-b = read($model_dir+"/b")
-
-# Predict classes
-probs = mnist_softmax::predict(X, W, b)
-
-# Output results
-write(probs, $out_dir+"/probs."+fmt, format=fmt)
-
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/mnist_softmax-train.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_softmax-train.dml b/scripts/staging/SystemML-NN/examples/mnist_softmax-train.dml
deleted file mode 100644
index 2941dfa..0000000
--- a/scripts/staging/SystemML-NN/examples/mnist_softmax-train.dml
+++ /dev/null
@@ -1,108 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# MNIST Softmax - Train
-#
-# This script trains a softmax classifier on images of handwritten
-# digits.
-#
-# Inputs:
-# - train: File containing labeled MNIST training images.
-# The format is "label, pixel_1, pixel_2, ..., pixel_n".
-# - test: File containing labeled MNIST test images.
-# The format is "label, pixel_1, pixel_2, ..., pixel_n".
-# - out_dir: Directory to store weights and bias matrices of
-# trained model, as well as final test accuracy.
-# - fmt: [DEFAULT: "csv"] File format of `train` and `test` data.
-# Options include: "csv", "mm", "text", and "binary".
-#
-# Outputs:
-# - W: File containing the trained weights of the model.
-# - b: File containing the trained biases of the model.
-# - accuracy: File containing the final accuracy on the test data.
-#
-# Data:
-# The MNIST dataset contains labeled images of handwritten digits,
-# where each example is a 28x28 pixel image of grayscale values in
-# the range [0,255] stretched out as 784 pixels, and each label is
-# one of 10 possible digits in [0,9].
-#
-# Sample Invocation (running from within the `examples` folder):
-# 1. Download data (60,000 training examples, and 10,000 test examples)
-# ```
-# get_mnist_data.sh
-# ```
-#
-# 2. Execute using Spark
-# ```
-# spark-submit --master local[*] --driver-memory 5G
-# --conf spark.driver.maxResultSize=0 --conf spark.akka.frameSize=128
-# $SYSTEMML_HOME/target/SystemML.jar -f mnist_softmax-train.dml
-# -nvargs train=data/mnist/mnist_train.csv test=data/mnist/mnist_test.csv
-# out_dir=model/mnist_softmax
-# ```
-#
-source("mnist_softmax.dml") as mnist_softmax
-
-# Read training data
-fmt = ifdef($fmt, "csv")
-train = read($train, format=fmt)
-test = read($test, format=fmt)
-
-# Extract images and labels
-images = train[,2:ncol(train)]
-labels = train[,1]
-X_test = test[,2:ncol(test)]
-y_test = test[,1]
-
-# Scale images to [0,1], and one-hot encode the labels
-n = nrow(train)
-n_test = nrow(test)
-classes = 10
-images = images / 255.0
-labels = table(seq(1, n), labels+1, n, classes)
-X_test = X_test / 255.0
-y_test = table(seq(1, n_test), y_test+1, n_test, classes)
-
-# Split into training (55,000 examples) and validation (5,000 examples)
-X = images[5001:nrow(images),]
-X_val = images[1:5000,]
-y = labels[5001:nrow(images),]
-y_val = labels[1:5000,]
-
-# Train
-[W, b] = mnist_softmax::train(X, y, X_val, y_val)
-
-# Write model out
-write(W, $out_dir+"/W")
-write(b, $out_dir+"/b")
-
-# Eval on test set
-probs = mnist_softmax::predict(X_test, W, b)
-[loss, accuracy] = mnist_softmax::eval(probs, y_test)
-
-# Output results
-print("Test Accuracy: " + accuracy)
-write(accuracy, $out_dir+"/accuracy")
-
-print("")
-print("")
-
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/mnist_softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_softmax.dml b/scripts/staging/SystemML-NN/examples/mnist_softmax.dml
deleted file mode 100644
index dc712f6..0000000
--- a/scripts/staging/SystemML-NN/examples/mnist_softmax.dml
+++ /dev/null
@@ -1,177 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * MNIST Softmax Example
- */
-# Imports
-source("nn/layers/affine.dml") as affine
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/softmax.dml") as softmax
-source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
-
-train = function(matrix[double] X, matrix[double] y,
- matrix[double] X_val, matrix[double] y_val)
- return (matrix[double] W, matrix[double] b) {
- /*
- * Trains a softmax classifier.
- *
- * The input matrix, X, has N examples, each with D features.
- * The targets, y, have K classes, and are one-hot encoded.
- *
- * Inputs:
- * - X: Input data matrix, of shape (N, D).
- * - y: Target matrix, of shape (N, K).
- * - X_val: Input validation data matrix, of shape (N, D).
- * - y_val: Target validation matrix, of shape (N, K).
- *
- * Outputs:
- * - W: Weights (parameters) matrix, of shape (D, K).
- * - b: Biases vector, of shape (1, K).
- */
- N = nrow(X) # num examples
- D = ncol(X) # num features
- K = ncol(y) # num classes
-
- # Create softmax classifier:
- # affine -> softmax
- [W, b] = affine::init(D, K)
- W = W / sqrt(2.0/(D)) * sqrt(1/(D)) # rescale init for the softmax output (simplifies to W / sqrt(2))
-
- # Initialize SGD w/ Nesterov momentum optimizer
- lr = 0.2 # learning rate
- mu = 0 # momentum
- decay = 0.99 # learning rate decay constant
- vW = sgd_nesterov::init(W) # optimizer momentum state for W
- vb = sgd_nesterov::init(b) # optimizer momentum state for b
-
- # Optimize
- print("Starting optimization")
- batch_size = 50
- epochs = 1
- iters = 1000 #ceil(N / batch_size)
- for (e in 1:epochs) {
- for(i in 1:iters) {
- # Get next batch
- beg = ((i-1) * batch_size) %% N + 1
- end = min(N, beg + batch_size - 1)
- X_batch = X[beg:end,]
- y_batch = y[beg:end,]
-
- # Compute forward pass
- ## affine & softmax:
- out = affine::forward(X_batch, W, b)
- probs = softmax::forward(out)
-
- # Compute loss & accuracy for training & validation data
- loss = cross_entropy_loss::forward(probs, y_batch)
- accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
- probs_val = predict(X_val, W, b)
- loss_val = cross_entropy_loss::forward(probs_val, y_val)
- accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
- print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: " +
- accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
-
- # Compute backward pass
- ## loss:
- dprobs = cross_entropy_loss::backward(probs, y_batch)
- ## affine & softmax:
- dout = softmax::backward(dprobs, out)
- [dX_batch, dW, db] = affine::backward(dout, X_batch, W, b)
-
- # Optimize with SGD w/ Nesterov momentum
- [W, vW] = sgd_nesterov::update(W, dW, lr, mu, vW)
- [b, vb] = sgd_nesterov::update(b, db, lr, mu, vb)
- }
- # Anneal momentum towards 0.999
- mu = mu + (0.999 - mu)/(1+epochs-e)
- # Decay learning rate
- lr = lr * decay
- }
-}
-
-predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
- return (matrix[double] probs) {
- /*
- * Computes the class probability predictions of a softmax classifier.
- *
- * The input matrix, X, has N examples, each with D features.
- *
- * Inputs:
- * - X: Input data matrix, of shape (N, D).
- * - W: Weights (parameters) matrix, of shape (D, K).
- * - b: Biases vector, of shape (1, K).
- *
- * Outputs:
- * - probs: Class probabilities, of shape (N, K).
- */
- # Compute forward pass
- ## affine & softmax:
- out = affine::forward(X, W, b)
- probs = softmax::forward(out)
-}
-
-eval = function(matrix[double] probs, matrix[double] y)
- return (double loss, double accuracy) {
- /*
- * Evaluates a softmax classifier.
- *
- * The probs matrix contains the class probability predictions
- * of K classes over N examples. The targets, y, have K classes,
- * and are one-hot encoded.
- *
- * Inputs:
- * - probs: Class probabilities, of shape (N, K).
- * - y: Target matrix, of shape (N, K).
- *
- * Outputs:
- * - loss: Scalar loss, of shape (1).
- * - accuracy: Scalar accuracy, of shape (1).
- */
- # Compute loss & accuracy
- loss = cross_entropy_loss::forward(probs, y)
- correct_pred = rowIndexMax(probs) == rowIndexMax(y)
- accuracy = mean(correct_pred)
-}
-
-generate_dummy_data = function()
- return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
- /*
- * Generate a dummy dataset similar to the MNIST dataset.
- *
- * Outputs:
- * - X: Input data matrix, of shape (N, D).
- * - y: Target matrix, of shape (N, K).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- */
- # Generate dummy input data
- N = 1024 # num examples
- C = 1 # num input channels
- Hin = 28 # input height
- Win = 28 # input width
- T = 10 # num targets
- X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
- classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))
- y = table(seq(1, N), classes) # one-hot encoding
-}
-
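[Editor's note] To make the affine -> softmax -> cross-entropy pipeline concrete, here is a minimal NumPy sketch of `predict` and `eval` (an editor's illustration; the DML layers may include further numerical safeguards, and `rowIndexMax` corresponds to a row-wise argmax):

```python
import numpy as np

def predict(X, W, b):
    scores = X @ W + b                           # affine::forward
    scores -= scores.max(axis=1, keepdims=True)  # stabilize exp
    e = np.exp(scores)
    return e / e.sum(axis=1, keepdims=True)      # softmax::forward

def evaluate(probs, Y, eps=1e-12):
    loss = -np.mean(np.sum(Y * np.log(probs + eps), axis=1))  # cross-entropy
    accuracy = np.mean(probs.argmax(1) == Y.argmax(1))        # rowIndexMax match
    return loss, accuracy
```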
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/examples/nn
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/nn b/scripts/staging/SystemML-NN/examples/nn
deleted file mode 120000
index cfe2905..0000000
--- a/scripts/staging/SystemML-NN/examples/nn
+++ /dev/null
@@ -1 +0,0 @@
-../nn
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/Example - MNIST LeNet.ipynb
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/Example - MNIST LeNet.ipynb b/scripts/staging/SystemML-NN/nn/examples/Example - MNIST LeNet.ipynb
new file mode 100644
index 0000000..0423269
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/Example - MNIST LeNet.ipynb
@@ -0,0 +1,189 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Quick Setup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create a SystemML MLContext object\n",
+ "from systemml import MLContext, dml\n",
+ "ml = MLContext(sc)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Download Data - MNIST"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9]. Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%sh\n",
+ "mkdir -p data/mnist/\n",
+ "cd data/mnist/\n",
+ "curl -O https://pjreddie.com/media/files/mnist_train.csv\n",
+ "curl -O https://pjreddie.com/media/files/mnist_test.csv"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## SystemML \"LeNet\" Neural Network"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1. Train"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "script_string = \"\"\"\n",
+ "source(\"nn/examples/mnist_lenet.dml\") as mnist_lenet\n",
+ "\n",
+ "# Read training data\n",
+ "data = read($data, format=\"csv\")\n",
+ "n = nrow(data)\n",
+ "\n",
+ "# Extract images and labels\n",
+ "images = data[,2:ncol(data)]\n",
+ "labels = data[,1]\n",
+ "\n",
+ "# Scale images to [-1,1], and one-hot encode the labels\n",
+ "images = (images / 255.0) * 2 - 1\n",
+ "labels = table(seq(1, n), labels+1, n, 10)\n",
+ "\n",
+ "# Split into training (55,000 examples) and validation (5,000 examples)\n",
+ "X = images[5001:nrow(images),]\n",
+ "X_val = images[1:5000,]\n",
+ "y = labels[5001:nrow(images),]\n",
+ "y_val = labels[1:5000,]\n",
+ "\n",
+ "# Train\n",
+ "epochs = 10\n",
+ "[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, epochs)\n",
+ "\"\"\"\n",
+ "script = (dml(script_string).input(\"$data\", \"data/mnist/mnist_train.csv\")\n",
+ " .input(C=1, Hin=28, Win=28)\n",
+ " .output(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))\n",
+ "W1, b1, W2, b2, W3, b3, W4, b4 = (ml.execute(script)\n",
+ " .get(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2. Compute Test Accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "script_string = \"\"\"\n",
+ "source(\"nn/examples/mnist_lenet.dml\") as mnist_lenet\n",
+ "\n",
+ "# Read test data\n",
+ "data = read($data, format=\"csv\")\n",
+ "n = nrow(data)\n",
+ "\n",
+ "# Extract images and labels\n",
+ "X_test = data[,2:ncol(data)]\n",
+ "y_test = data[,1]\n",
+ "\n",
+ "# Scale images to [-1,1], and one-hot encode the labels\n",
+ "X_test = (X_test / 255.0) * 2 - 1\n",
+ "y_test = table(seq(1, n), y_test+1, n, 10)\n",
+ "\n",
+ "# Eval on test set\n",
+ "probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)\n",
+ "[loss, accuracy] = mnist_lenet::eval(probs, y_test)\n",
+ "\n",
+ "print(\"Test Accuracy: \" + accuracy)\n",
+ "\"\"\"\n",
+ "script = dml(script_string).input(**{\"$data\": \"data/mnist/mnist_train.csv\",\n",
+ " \"C\": 1, \"Hin\": 28, \"Win\": 28,\n",
+ " \"W1\": W1, \"b1\": b1,\n",
+ " \"W2\": W2, \"b2\": b2,\n",
+ " \"W3\": W3, \"b3\": b3,\n",
+ " \"W4\": W4, \"b4\": b4})\n",
+ "ml.execute(script)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3. Extract Model Into Spark DataFrames For Future Use"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "W1_df = W1.toDF()\n",
+ "b1_df = b1.toDF()\n",
+ "W2_df = W2.toDF()\n",
+ "b2_df = b2.toDF()\n",
+ "W3_df = W3.toDF()\n",
+ "b3_df = b3.toDF()\n",
+ "W4_df = W4.toDF()\n",
+ "b4_df = b4.toDF()\n",
+ "W1_df, b1_df, W2_df, b2_df, W3_df, b3_df, W4_df, b4_df"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 + Spark 2.x + SystemML",
+ "language": "python",
+ "name": "pyspark3_2.x"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/Example - MNIST Softmax Classifier.ipynb
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/Example - MNIST Softmax Classifier.ipynb b/scripts/staging/SystemML-NN/nn/examples/Example - MNIST Softmax Classifier.ipynb
new file mode 100644
index 0000000..5e7182a
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/Example - MNIST Softmax Classifier.ipynb
@@ -0,0 +1,179 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Quick Setup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "# Create a SystemML MLContext object\n",
+ "from systemml import MLContext, dml\n",
+ "ml = MLContext(sc)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Download Data - MNIST"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9]. Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "%%sh\n",
+ "mkdir -p data/mnist/\n",
+ "cd data/mnist/\n",
+ "curl -O https://pjreddie.com/media/files/mnist_train.csv\n",
+ "curl -O https://pjreddie.com/media/files/mnist_test.csv"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## SystemML Softmax Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1. Train"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "training = \"\"\"\n",
+ "source(\"nn/examples/mnist_softmax.dml\") as mnist_softmax\n",
+ "\n",
+ "# Read training data\n",
+ "data = read($data, format=\"csv\")\n",
+ "n = nrow(data)\n",
+ "\n",
+ "# Extract images and labels\n",
+ "images = data[,2:ncol(data)]\n",
+ "labels = data[,1]\n",
+ "\n",
+ "# Scale images to [0,1], and one-hot encode the labels\n",
+ "images = images / 255.0\n",
+ "labels = table(seq(1, n), labels+1, n, 10)\n",
+ "\n",
+ "# Split into training (55,000 examples) and validation (5,000 examples)\n",
+ "X = images[5001:nrow(images),]\n",
+ "X_val = images[1:5000,]\n",
+ "y = labels[5001:nrow(images),]\n",
+ "y_val = labels[1:5000,]\n",
+ "\n",
+ "# Train\n",
+ "epochs = 1\n",
+ "[W, b] = mnist_softmax::train(X, y, X_val, y_val, epochs)\n",
+ "\"\"\"\n",
+ "script = dml(training).input(\"$data\", \"data/mnist/mnist_train.csv\").output(\"W\", \"b\")\n",
+ "W, b = ml.execute(script).get(\"W\", \"b\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2. Compute Test Accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "testing = \"\"\"\n",
+ "source(\"nn/examples/mnist_softmax.dml\") as mnist_softmax\n",
+ "\n",
+ "# Read test data\n",
+ "data = read($data, format=\"csv\")\n",
+ "n = nrow(data)\n",
+ "\n",
+ "# Extract images and labels\n",
+ "X_test = data[,2:ncol(data)]\n",
+ "y_test = data[,1]\n",
+ "\n",
+ "# Scale images to [0,1], and one-hot encode the labels\n",
+ "X_test = X_test / 255.0\n",
+ "y_test = table(seq(1, n), y_test+1, n, 10)\n",
+ "\n",
+ "# Eval on test set\n",
+ "probs = mnist_softmax::predict(X_test, W, b)\n",
+ "[loss, accuracy] = mnist_softmax::eval(probs, y_test)\n",
+ "\n",
+ "print(\"Test Accuracy: \" + accuracy)\n",
+ "\"\"\"\n",
+ "script = dml(testing).input(\"$data\", \"data/mnist/mnist_test.csv\", W=W, b=b)\n",
+ "ml.execute(script)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3. Extract Model Into Spark DataFrames For Future Use"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "W_df = W.toDF()\n",
+ "b_df = b.toDF()\n",
+ "W_df, b_df"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
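Step 3 above extracts the model into Spark DataFrames. As a hedged sketch of how those DataFrames might be fed back into a later scoring job; this assumes MLContext accepts DataFrames as matrix inputs, mirroring how the `Matrix` objects `W` and `b` are passed above, and `X_new_df` is a hypothetical DataFrame of new images scaled to [0,1]:

```python
from systemml import MLContext, dml

ml = MLContext(sc)  # sc: SparkContext from the pyspark driver
scoring = """
scores = X %*% W + b                        # affine layer
scores = scores - rowMaxs(scores)           # shift for numerical stability
probs = exp(scores) / rowSums(exp(scores))  # softmax over classes
"""
script = (dml(scoring).input(X=X_new_df, W=W_df, b=b_df)  # X_new_df: hypothetical new data
                      .output("probs"))
probs = ml.execute(script).get("probs")
```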
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/README.md
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/README.md b/scripts/staging/SystemML-NN/nn/examples/README.md
new file mode 100644
index 0000000..d5e9d04
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/README.md
@@ -0,0 +1,74 @@
+<!--
+{% comment %}
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements. See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+
+# SystemML-NN Examples
+
+#### This folder contains scripts and PySpark Jupyter notebooks serving as examples of using the *SystemML-NN* (`nn`) deep learning library.
+
+---
+
+# Examples
+### MNIST Softmax Classifier
+
+* This example trains a softmax classifier, which is essentially a multi-class logistic regression model, on the MNIST data. The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images.
+* Notebook: `Example - MNIST Softmax Classifier.ipynb`.
+* DML Functions: `mnist_softmax.dml`
+* Training script: `mnist_softmax-train.dml`
+* Prediction script: `mnist_softmax-predict.dml`
+
+### MNIST "LeNet" Neural Net
+
+* This example trains a neural network on the MNIST data using a ["LeNet" architecture](http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf). The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images.
+* Notebook: `Example - MNIST LeNet.ipynb`.
+* DML Functions: `mnist_lenet.dml`
+* Training script: `mnist_lenet-train.dml`
+* Prediction script: `mnist_lenet-predict.dml`
+
+---
+
+# Setup
+## Code
+* To run the examples, please first obtain the project from GitHub, either by downloading and unzipping it via the "Clone or download" button on the [homepage of the project](https://github.com/dusenberrymw/systemml-nn), *or* by cloning it with the following command:
+
+ ```
+ git clone https://github.com/dusenberrymw/systemml-nn.git
+ ```
+
+* Then, move into the `systemml-nn` folder via:
+ ```
+ cd systemml-nn
+ ```
+
+## Data
+* These examples use the classic [MNIST](http://yann.lecun.com/exdb/mnist/) dataset, which contains labeled 28x28 pixel images of handwritten digits in the range of 0-9. There are 60,000 training images, and 10,000 testing images. Of the 60,000 training images, 5,000 will be used as validation images.
+* **Download**:
+ * **Notebooks**: The data will be automatically downloaded as a step in either of the example notebooks.
+ * **Training scripts**: Please run `get_mnist_data.sh` to download the data separately.
+
+## Execution
+* These examples contain scripts written in SystemML's R-like language (`*.dml`), as well as PySpark Jupyter notebooks (`*.ipynb`). The scripts contain the math for the algorithms, enclosed in functions, and the notebooks serve as full, end-to-end examples of reading in data, training models using the functions within the scripts, and evaluating final performance.
+* **Notebooks**: To run the notebook examples, please install the SystemML Python package with `pip install systemml`, and then start up Jupyter in the following manner from this directory (for more information, please see [this great blog post](http://spark.tc/0-to-life-changing-application-with-apache-systemml/); a short smoke-test sketch also follows these setup notes):
+
+ ```
+ PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook" pyspark --master local[*] --driver-memory 3G --driver-class-path SystemML.jar --jars SystemML.jar
+ ```
+
+ Note that all printed output, such as training statistics, from the SystemML scripts will be sent to the terminal in which Jupyter was started (for now...).
+
+* **Scripts**: To run the scripts from the command line using `spark-submit`, please see the comments located at the top of the `-train` and `-predict` scripts.
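Once Jupyter is up, a one-cell smoke test like the following (using only the `MLContext` calls already shown in the notebooks) confirms the SystemML jar and Python package are wired together; per the note above, the printed line lands in the terminal that launched Jupyter:

```python
from systemml import MLContext, dml

ml = MLContext(sc)  # sc: SparkContext provided by the pyspark driver
ml.execute(dml('print("SystemML is wired up correctly")'))
```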
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/get_mnist_data.sh
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/get_mnist_data.sh b/scripts/staging/SystemML-NN/nn/examples/get_mnist_data.sh
new file mode 100755
index 0000000..deb0c40
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/get_mnist_data.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+DIR="$(cd "$(dirname "$0")" && pwd)"
+mkdir -p $DIR/data/mnist/
+cd $DIR/data/mnist/
+curl -O https://pjreddie.com/media/files/mnist_train.csv
+curl -O https://pjreddie.com/media/files/mnist_test.csv
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-predict.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-predict.dml
new file mode 100644
index 0000000..85a5307
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-predict.dml
@@ -0,0 +1,91 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# MNIST LeNet - Predict
+#
+# This script computes the class probability predictions of a
+# trained convolutional net using the "LeNet" architecture on
+# images of handwritten digits.
+#
+# Inputs:
+# - X: File containing training images.
+# The format is "pixel_1, pixel_2, ..., pixel_n".
+# - C: Number of color channels in the images.
+# - Hin: Input image height.
+# - Win: Input image width.
+# - model_dir: Directory containing the trained weights and biases
+# of the model.
+# - out_dir: Directory to store class probability predictions for
+# each image.
+# - fmt: [DEFAULT: "csv"] File format of `X` and output predictions.
+# Options include: "csv", "mm", "text", and "binary".
+#
+# Outputs:
+# - probs: File containing class probability predictions for each
+# image.
+#
+# Data:
+# The X file should contain images of handwritten digits,
+# where each example is a 28x28 pixel image of grayscale values in
+# the range [0,255] stretched out as 784 pixels.
+#
+# Sample Invocation (running from outside the `nn` folder):
+# 1. Download images.
+#
+# For example, save images to `nn/examples/data/mnist/images.csv`.
+#
+# 2. Execute using Spark
+# ```
+# spark-submit --master local[*] --driver-memory 5G
+# --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
+# $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_lenet-predict.dml
+# -nvargs X=nn/examples/data/mnist/images.csv C=1 Hin=28 Win=28
+# model_dir=nn/examples/model/mnist_lenet out_dir=nn/examples/data/mnist
+# ```
+#
+source("nn/examples/mnist_lenet.dml") as mnist_lenet
+
+# Read training data
+fmt = ifdef($fmt, "csv")
+X = read($X, format=fmt)
+C = $C
+Hin = $Hin
+Win = $Win
+
+# Scale images to [-1,1]
+X = (X / 255.0) * 2 - 1
+
+# Read model coefficients
+W1 = read($model_dir+"/W1")
+b1 = read($model_dir+"/b1")
+W2 = read($model_dir+"/W2")
+b2 = read($model_dir+"/b2")
+W3 = read($model_dir+"/W3")
+b3 = read($model_dir+"/b3")
+W4 = read($model_dir+"/W4")
+b4 = read($model_dir+"/b4")
+
+# Predict classes
+probs = mnist_lenet::predict(X, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
+
+# Output results
+write(probs, $out_dir+"/probs."+fmt, format=fmt)
+
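As a hedged post-processing sketch, the probability matrix written by this script can be turned into digit predictions with NumPy; the path below matches the sample invocation above (adjust it to your own `out_dir`):

```python
import numpy as np

# Each row of probs holds 10 class probabilities; column k corresponds
# to digit k, since the labels were one-hot encoded as label+1.
probs = np.loadtxt("nn/examples/data/mnist/probs.csv", delimiter=",")
digits = np.argmax(probs, axis=1)
print(digits[:10])
```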
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/1f5cf697/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-train.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-train.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-train.dml
new file mode 100644
index 0000000..0fc733e
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-train.dml
@@ -0,0 +1,123 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# MNIST LeNet - Train
+#
+# This script trains a convolutional net using the "LeNet" architecture
+# on images of handwritten digits.
+#
+# Inputs:
+# - train: File containing labeled MNIST training images.
+# The format is "label, pixel_1, pixel_2, ..., pixel_n".
+# - test: File containing labeled MNIST test images.
+# The format is "label, pixel_1, pixel_2, ..., pixel_n".
+# - C: Number of color channels in the images.
+# - Hin: Input image height.
+# - Win: Input image width.
+# - epochs: [DEFAULT: 10] Total number of full training loops over
+# the full data set.
+# - out_dir: [DEFAULT: "."] Directory to store weights and bias
+# matrices of trained model, as well as final test accuracy.
+# - fmt: [DEFAULT: "csv"] File format of `train` and `test` data.
+# Options include: "csv", "mm", "text", and "binary".
+#
+# Outputs:
+# - W1, W2, W3, W4: Files containing the trained weights of the model.
+# - b1, b2, b3, b4: Files containing the trained biases of the model.
+# - accuracy: File containing the final accuracy on the test data.
+#
+# Data:
+# The MNIST dataset contains labeled images of handwritten digits,
+# where each example is a 28x28 pixel image of grayscale values in
+# the range [0,255] stretched out as 784 pixels, and each label is
+# one of 10 possible digits in [0,9].
+#
+# Sample Invocation (running from outside the `nn` folder):
+# 1. Download data (60,000 training examples, and 10,000 test examples)
+# ```
+# nn/examples/get_mnist_data.sh
+# ```
+#
+# 2. Execute using Spark
+# ```
+# spark-submit --master local[*] --driver-memory 10G
+# --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
+# $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_lenet-train.dml
+# -nvargs train=nn/examples/data/mnist/mnist_train.csv test=nn/examples/data/mnist/mnist_test.csv
+# C=1 Hin=28 Win=28 epochs=10 out_dir=nn/examples/model/mnist_lenet
+# ```
+#
+source("nn/examples/mnist_lenet.dml") as mnist_lenet
+
+# Read training data & settings
+fmt = ifdef($fmt, "csv")
+train = read($train, format=fmt)
+test = read($test, format=fmt)
+C = $C
+Hin = $Hin
+Win = $Win
+epochs = ifdef($epochs, 10)
+out_dir = ifdef($out_dir, ".")
+
+# Extract images and labels
+images = train[,2:ncol(train)]
+labels = train[,1]
+X_test = test[,2:ncol(test)]
+y_test = test[,1]
+
+# Scale images to [-1,1], and one-hot encode the labels
+n = nrow(train)
+n_test = nrow(test)
+images = (images / 255.0) * 2 - 1
+labels = table(seq(1, n), labels+1, n, 10)
+X_test = (X_test / 255.0) * 2 - 1
+y_test = table(seq(1, n_test), y_test+1, n_test, 10)
+
+# Split into training (55,000 examples) and validation (5,000 examples)
+X = images[5001:nrow(images),]
+X_val = images[1:5000,]
+y = labels[5001:nrow(images),]
+y_val = labels[1:5000,]
+
+# Train
+[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, epochs)
+
+# Write model out
+write(W1, out_dir+"/W1")
+write(b1, out_dir+"/b1")
+write(W2, out_dir+"/W2")
+write(b2, out_dir+"/b2")
+write(W3, out_dir+"/W3")
+write(b3, out_dir+"/b3")
+write(W4, out_dir+"/W4")
+write(b4, out_dir+"/b4")
+
+# Eval on test set
+probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
+[loss, accuracy] = mnist_lenet::eval(probs, y_test)
+
+# Output results
+print("Test Accuracy: " + accuracy)
+write(accuracy, out_dir+"/accuracy")
+
+print("")
+print("")
+
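For readers less familiar with DML's `table()` one-hot trick, here is a NumPy sketch of the same preprocessing the script performs, under the assumption (stated above) that the CSV has the label in column 0 followed by 784 pixels:

```python
import numpy as np

data = np.loadtxt("nn/examples/data/mnist/mnist_train.csv", delimiter=",")
images, labels = data[:, 1:], data[:, 0].astype(int)

images = (images / 255.0) * 2 - 1            # scale pixels to [-1, 1]
one_hot = np.zeros((labels.size, 10))
one_hot[np.arange(labels.size), labels] = 1  # same effect as table(seq(1,n), labels+1, n, 10)

# Hold out the first 5,000 rows for validation, as the DML script does
X_val, y_val = images[:5000], one_hot[:5000]
X, y = images[5000:], one_hot[5000:]
```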
[06/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn` library to `scripts/nn`
Posted by du...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
deleted file mode 100644
index a529a12..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
+++ /dev/null
@@ -1,178 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * MNIST Softmax Example
- */
-# Imports
-source("nn/layers/affine.dml") as affine
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/softmax.dml") as softmax
-source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
-
-train = function(matrix[double] X, matrix[double] y,
- matrix[double] X_val, matrix[double] y_val,
- int epochs)
- return (matrix[double] W, matrix[double] b) {
- /*
- * Trains a softmax classifier.
- *
- * The input matrix, X, has N examples, each with D features.
- * The targets, y, have K classes, and are one-hot encoded.
- *
- * Inputs:
- * - X: Input data matrix, of shape (N, D).
- * - y: Target matrix, of shape (N, K).
- * - X_val: Input validation data matrix, of shape (N, D).
- * - y_val: Target validation matrix, of shape (N, K).
- * - epochs: Total number of full training loops over the full data set.
- *
- * Outputs:
- * - W: Weights (parameters) matrix, of shape (D, M).
- * - b: Biases vector, of shape (1, M).
- */
- N = nrow(X) # num examples
- D = ncol(X) # num features
- K = ncol(y) # num classes
-
- # Create softmax classifier:
- # affine -> softmax
- [W, b] = affine::init(D, K)
- W = W / sqrt(2.0/(D)) * sqrt(1/(D)) # rescale He init (sqrt(2/D)) to sqrt(1/D), since this layer feeds a softmax rather than a relu
-
- # Initialize SGD w/ Nesterov momentum optimizer
- lr = 0.2 # learning rate
- mu = 0 # momentum
- decay = 0.99 # learning rate decay constant
- vW = sgd_nesterov::init(W) # optimizer momentum state for W
- vb = sgd_nesterov::init(b) # optimizer momentum state for b
-
- # Optimize
- print("Starting optimization")
- batch_size = 50
- iters = 1000 # fixed iteration count per epoch; ceil(N / batch_size) would give exactly one pass over the data
- for (e in 1:epochs) {
- for(i in 1:iters) {
- # Get next batch
- beg = ((i-1) * batch_size) %% N + 1
- end = min(N, beg + batch_size - 1)
- X_batch = X[beg:end,]
- y_batch = y[beg:end,]
-
- # Compute forward pass
- ## affine & softmax:
- out = affine::forward(X_batch, W, b)
- probs = softmax::forward(out)
-
- # Compute loss & accuracy for training & validation data
- loss = cross_entropy_loss::forward(probs, y_batch)
- accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
- probs_val = predict(X_val, W, b)
- loss_val = cross_entropy_loss::forward(probs_val, y_val)
- accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
- print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: " +
- accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
-
- # Compute backward pass
- ## loss:
- dprobs = cross_entropy_loss::backward(probs, y_batch)
- ## affine & softmax:
- dout = softmax::backward(dprobs, out)
- [dX_batch, dW, db] = affine::backward(dout, X_batch, W, b)
-
- # Optimize with SGD w/ Nesterov momentum
- [W, vW] = sgd_nesterov::update(W, dW, lr, mu, vW)
- [b, vb] = sgd_nesterov::update(b, db, lr, mu, vb)
- }
- # Anneal momentum towards 0.999
- mu = mu + (0.999 - mu)/(1+epochs-e)
- # Decay learning rate
- lr = lr * decay
- }
-}
-
-predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
- return (matrix[double] probs) {
- /*
- * Computes the class probability predictions of a softmax classifier.
- *
- * The input matrix, X, has N examples, each with D features.
- *
- * Inputs:
- * - X: Input data matrix, of shape (N, D).
- * - W: Weights (parameters) matrix, of shape (D, M).
- * - b: Biases vector, of shape (1, M).
- *
- * Outputs:
- * - probs: Class probabilities, of shape (N, K).
- */
- # Compute forward pass
- ## affine & softmax:
- out = affine::forward(X, W, b)
- probs = softmax::forward(out)
-}
-
-eval = function(matrix[double] probs, matrix[double] y)
- return (double loss, double accuracy) {
- /*
- * Evaluates a softmax classifier.
- *
- * The probs matrix contains the class probability predictions
- * of K classes over N examples. The targets, y, have K classes,
- * and are one-hot encoded.
- *
- * Inputs:
- * - probs: Class probabilities, of shape (N, K).
- * - y: Target matrix, of shape (N, K).
- *
- * Outputs:
- * - loss: Scalar loss, of shape (1).
- * - accuracy: Scalar accuracy, of shape (1).
- */
- # Compute loss & accuracy
- loss = cross_entropy_loss::forward(probs, y)
- correct_pred = rowIndexMax(probs) == rowIndexMax(y)
- accuracy = mean(correct_pred)
-}
-
-generate_dummy_data = function()
- return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
- /*
- * Generate a dummy dataset similar to the MNIST dataset.
- *
- * Outputs:
- * - X: Input data matrix, of shape (N, D).
- * - y: Target matrix, of shape (N, K).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- */
- # Generate dummy input data
- N = 1024 # num examples
- C = 1 # num input channels
- Hin = 28 # input height
- Win = 28 # input width
- T = 10 # num targets
- X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
- classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))
- y = table(seq(1, N), classes) # one-hot encoding
-}
-
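The training loop above pairs a geometric learning-rate decay with a momentum anneal whose step shrinks as the run nears its end, so `mu` reaches 0.999 exactly at the final epoch. A small sketch of just that schedule:

```python
lr, mu, decay, epochs = 0.2, 0.0, 0.99, 10
for e in range(1, epochs + 1):
    # ... run the mini-batch updates at the current (lr, mu) ...
    mu = mu + (0.999 - mu) / (1 + epochs - e)  # anneal momentum toward 0.999
    lr = lr * decay                            # decay learning rate
    print(f"epoch {e}: lr={lr:.4f}, mu={mu:.3f}")
```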
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/affine.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/affine.dml b/scripts/staging/SystemML-NN/nn/layers/affine.dml
deleted file mode 100644
index c9a740b..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/affine.dml
+++ /dev/null
@@ -1,92 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Affine (fully-connected) layer.
- */
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b)
- return (matrix[double] out) {
- /*
- * Computes the forward pass for an affine (fully-connected) layer
- * with M neurons. The input data has N examples, each with D
- * features.
- *
- * Inputs:
- * - X: Inputs, of shape (N, D).
- * - W: Weights, of shape (D, M).
- * - b: Biases, of shape (1, M).
- *
- * Outputs:
- * - out: Outputs, of shape (N, M).
- */
- out = X %*% W + b
-}
-
-backward = function(matrix[double] dout, matrix[double] X,
- matrix[double] W, matrix[double] b)
- return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
- /*
- * Computes the backward pass for a fully-connected (affine) layer
- * with M neurons.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of shape (N, M).
- * - X: Inputs, of shape (N, D).
- * - W: Weights, of shape (D, M).
- * - b: Biases, of shape (1, M).
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, D).
- * - dW: Gradient wrt `W`, of shape (D, M).
- * - db: Gradient wrt `b`, of shape (1, M).
- */
- dX = dout %*% t(W)
- dW = t(X) %*% dout
- db = colSums(dout)
-}
-
-init = function(int D, int M)
- return (matrix[double] W, matrix[double] b) {
- /*
- * Initialize the parameters of this layer.
- *
- * Note: This is just a convenience function, and parameters
- * may be initialized manually if needed.
- *
- * We use the heuristic by He et al., which limits the magnification
- * of inputs/gradients during forward/backward passes by scaling
- * unit-Gaussian weights by a factor of sqrt(2/n), under the
- * assumption of relu neurons.
- * - http://arxiv.org/abs/1502.01852
- *
- * Inputs:
- * - D: Dimensionality of the input features (number of features).
- * - M: Number of neurons in this layer.
- *
- * Outputs:
- * - W: Weights, of shape (D, M).
- * - b: Biases, of shape (1, M).
- */
- W = rand(rows=D, cols=M, pdf="normal") * sqrt(2.0/D)
- b = matrix(0, rows=1, cols=M)
-}
-
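The affine layer's backward formulas follow directly from `out = X %*% W + b`; a self-contained NumPy rendering for comparison:

```python
import numpy as np

def affine_forward(X, W, b):
    return X @ W + b                      # (N, M)

def affine_backward(dout, X, W, b):
    dX = dout @ W.T                       # (N, D), gradient wrt inputs
    dW = X.T @ dout                       # (D, M), gradient wrt weights
    db = dout.sum(axis=0, keepdims=True)  # (1, M), gradient wrt biases
    return dX, dW, db

rng = np.random.default_rng(0)
X = rng.standard_normal((4, 3))
W = rng.standard_normal((3, 2)) * np.sqrt(2.0 / 3)  # He init, as above
b = np.zeros((1, 2))
dX, dW, db = affine_backward(np.ones((4, 2)), X, W, b)
```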
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml
deleted file mode 100644
index 2ccffdb..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml
+++ /dev/null
@@ -1,210 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 1D Batch Normalization layer.
- */
-
-forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
- string mode, matrix[double] ema_mean, matrix[double] ema_var,
- double mu, double epsilon)
- return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
- matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
- /*
- * Computes the forward pass for a 1D batch normalization layer.
- * The input data has N examples, each with D features.
- *
- * A batch normalization layer uses the per-feature sample mean and
- * per-feature uncorrected sample variance during training to
- * normalize each feature of the input data. Additionally, it
- * introduces learnable parameters (gamma, beta) to control the
- * amount of normalization.
- *
- * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
- *
- * This implementation maintains exponential moving averages of the
- * mean and variance during training for use during testing.
- *
- * Reference:
- * - Batch Normalization: Accelerating Deep Network Training by
- * Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
- * - https://arxiv.org/abs/1502.03167
- *
- * Inputs:
- * - X: Inputs, of shape (N, D).
- * - gamma: Scale parameters, of shape (1, D).
- * - beta: Shift parameters, of shape (1, D).
- * - mode: 'train' or 'test' to indicate if the model is currently
- * being trained or tested. During training, the current batch
- * mean and variance will be used to normalize the inputs, while
- * during testing, the exponential average of the mean and
- * variance over all previous batches will be used.
- * - ema_mean: Exponential moving average of the mean, of
- * shape (1, D).
- * - ema_var: Exponential moving average of the variance, of
- * shape (1, D).
- * - mu: Momentum value for moving averages.
- * Typical values are in the range of [0.9, 0.999].
- * - epsilon: Smoothing term to avoid divide by zero errors.
- * Typical values are in the range of [1e-5, 1e-3].
- *
- * Outputs:
- * - out: Outputs, of shape (N, D).
- * - ema_mean_upd: Updated exponential moving average of the mean,
- * of shape (1, D).
- * - ema_var_upd: Updated exponential moving average of the variance,
- * of shape (1, D).
- * - cache_mean: Cache of the batch mean, of shape (1, D).
- * Note: This is used for performance during training.
- * - cache_var: Cache of the batch variance, of shape (1, D).
- * Note: This is used for performance during training.
- * - cache_norm: Cache of the normalized inputs, of shape (N, D).
- * Note: This is used for performance during training.
- */
- N = nrow(X)
-
- if (mode == 'train') {
- # Compute feature-wise mean and variance
- mean = colMeans(X) # shape (1, D)
- # var = (1/N) * colSums((X-mean)^2)
- var = colVars(X) * ((N-1)/N) # compute uncorrected variance, of shape (1, D)
- # Update moving averages
- ema_mean_upd = mu*ema_mean + (1-mu)*mean
- ema_var_upd = mu*ema_var + (1-mu)*var
- }
- else {
- # Use moving averages of mean and variance during testing
- mean = ema_mean
- var = ema_var
- ema_mean_upd = ema_mean
- ema_var_upd = ema_var
- }
-
- # Normalize, shift, and scale
- # norm = (X-mean)*(var+epsilon)^(-1/2)
- norm = (X-mean) / sqrt(var+epsilon) # shape (N, D)
- out = norm*gamma + beta # shape (N, D)
-
- # Save variable for backward pass
- cache_mean = mean
- cache_var = var
- cache_norm = norm
-}
-
-backward = function(matrix[double] dout, matrix[double] out,
- matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
- matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
- matrix[double] X, matrix[double] gamma, matrix[double] beta,
- string mode, matrix[double] ema_mean, matrix[double] ema_var,
- double mu, double epsilon)
- return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
- /*
- * Computes the backward pass for a 1D batch normalization layer.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of shape (N, D).
- * - out: Outputs from the forward pass, of shape (N, D).
- * - ema_mean_upd: Updated exponential moving average of the mean
- * from the forward pass, of shape (1, D).
- * - ema_var_upd: Updated exponential moving average of the variance
- * from the forward pass, of shape (1, D).
- * - cache_mean: Cache of the batch mean from the forward pass, of
- * shape (1, D). Note: This is used for performance during
- * training.
- * - cache_var: Cache of the batch variance from the forward pass,
- * of shape (1, D). Note: This is used for performance during
- * training.
- * - cache_norm: Cache of the normalized inputs from the forward
- * pass, of shape (N, D). Note: This is used for performance
- * during training.
- * - X: Inputs, of shape (N, D).
- * - gamma: Scale parameters, of shape (1, D).
- * - beta: Shift parameters, of shape (1, D).
- * - mode: 'train' or 'test' to indicate if the model is currently
- * being trained or tested. During training, the current batch
- * mean and variance will be used to normalize the inputs, while
- * during testing, the exponential average of the mean and
- * variance over all previous batches will be used.
- * - ema_mean: Exponential moving average of the mean, of
- * shape (1, D).
- * - ema_var: Exponential moving average of the variance, of
- * shape (1, D).
- * - mu: Momentum value for moving averages.
- * Typical values are in the range of [0.9, 0.999].
- * - epsilon: Smoothing term to avoid divide by zero errors.
- * Typical values are in the range of [1e-5, 1e-3].
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, D).
- * - dgamma: Gradient wrt `gamma`, of shape (1, D).
- * - dbeta: Gradient wrt `beta`, of shape (1, D).
- *
- */
- N = nrow(X)
- mean = cache_mean
- var = cache_var
- norm = cache_norm
- centered = X-mean
-
- if (mode == 'train') {
- # Compute gradients during training
- dgamma = colSums(dout*norm) # shape (1, D)
- dbeta = colSums(dout) # shape (1, D)
- dnorm = dout * gamma # shape (N, D)
- dvar = (-1/2) * colSums(centered * (var+epsilon)^(-3/2) * dnorm) # shape (1, D)
- dmean = colSums((-dnorm/sqrt(var+epsilon)) + ((-2/N)*centered*dvar)) # shape (1, D)
- dX = (dnorm/sqrt(var+epsilon)) + ((2/N)*centered*dvar) + ((1/N)*dmean) # shape (N, D)
- }
- else {
- # Compute gradients during testing
- dgamma = colSums(dout*norm) # shape (1, D)
- dbeta = colSums(dout) # shape (1, D)
- dnorm = dout * gamma # shape (N, D)
- dX = dnorm / sqrt(var+epsilon) # shape (N, D)
- }
-}
-
-init = function(int D)
- return (matrix[double] gamma, matrix[double] beta,
- matrix[double] ema_mean, matrix[double] ema_var) {
- /*
- * Initialize the parameters of this layer.
- *
- * Note: This is just a convenience function, and parameters
- * may be initialized manually if needed.
- *
- * Inputs:
- * - D: Dimensionality of the input features (number of features).
- *
- * Outputs:
- * - gamma: Scale parameters, of shape (1, D).
- * - beta: Shift parameters, of shape (1, D).
- * - ema_mean: Exponential moving average of the mean, of
- * shape (1, D).
- * - ema_var: Exponential moving average of the variance, of
- * shape (1, D).
- */
- gamma = matrix(1, rows=1, cols=D)
- beta = matrix(0, rows=1, cols=D)
- ema_mean = matrix(0, rows=1, cols=D)
- ema_var = matrix(1, rows=1, cols=D)
-}
-
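A NumPy sketch of the 1D batch-norm forward pass in 'train' mode, following the formulas above (NumPy's `var` is already the uncorrected variance, so no `(N-1)/N` correction is needed here):

```python
import numpy as np

def batch_norm1d_forward_train(X, gamma, beta, ema_mean, ema_var,
                               mu=0.9, epsilon=1e-5):
    mean = X.mean(axis=0, keepdims=True)         # (1, D)
    var = X.var(axis=0, keepdims=True)           # uncorrected (ddof=0)
    ema_mean_upd = mu * ema_mean + (1 - mu) * mean
    ema_var_upd = mu * ema_var + (1 - mu) * var
    norm = (X - mean) / np.sqrt(var + epsilon)   # (N, D)
    out = norm * gamma + beta
    return out, ema_mean_upd, ema_var_upd, (mean, var, norm)
```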
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml
deleted file mode 100644
index 49c6746..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml
+++ /dev/null
@@ -1,238 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D (Spatial) Batch Normalization layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
- int C, int Hin, int Win, string mode,
- matrix[double] ema_mean, matrix[double] ema_var,
- double mu, double epsilon)
- return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
- matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
- /*
- * Computes the forward pass for a 2D (spatial) batch normalization
- * layer. The input data has N examples, each represented as a 3D
- * volume unrolled into a single vector.
- *
- * A spatial batch normalization layer uses the per-channel sample
- * mean and per-channel uncorrected sample variance during training
- * to normalize each channel of the input data. Additionally, it
- * introduces learnable parameters (gamma, beta) to control the
- * amount of normalization.
- *
- * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
- *
- * This implementation maintains exponential moving averages of the
- * mean and variance during training for use during testing.
- *
- * Reference:
- * - Batch Normalization: Accelerating Deep Network Training by
- * Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
- * - https://arxiv.org/abs/1502.03167
- *
- * Inputs:
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - gamma: Scale parameters, of shape (C, 1).
- * - beta: Shift parameters, of shape (C, 1).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - mode: 'train' or 'test' to indicate if the model is currently
- * being trained or tested. During training, the current batch
- * mean and variance will be used to normalize the inputs, while
- * during testing, the exponential average of the mean and
- * variance over all previous batches will be used.
- * - ema_mean: Exponential moving average of the mean, of
- * shape (C, 1).
- * - ema_var: Exponential moving average of the variance, of
- * shape (C, 1).
- * - mu: Momentum value for moving averages.
- * Typical values are in the range of [0.9, 0.999].
- * - epsilon: Smoothing term to avoid divide by zero errors.
- * Typical values are in the range of [1e-5, 1e-3].
- *
- * Outputs:
- * - out: Outputs, of shape (N, C*Hin*Win).
- * - ema_mean_upd: Updated exponential moving average of the mean,
- * of shape (C, 1).
- * - ema_var_upd: Updated exponential moving average of the variance,
- * of shape (C, 1).
- * - cache_mean: Cache of the batch mean, of shape (C, 1).
- * Note: This is used for performance during training.
- * - cache_var: Cache of the batch variance, of shape (C, 1).
- * Note: This is used for performance during training.
- * - cache_norm: Cache of the normalized inputs, of
- * shape (C, N*Hin*Win). Note: This is used for performance
- * during training.
- */
- N = nrow(X)
-
- if (mode == 'train') {
- # Compute channel-wise mean and variance
- # Since we don't have tensors, we will compute the means and variances in a piece-wise fashion.
- # - mean of total group is mean of subgroup means
- # - variance is the mean of the subgroup variances + the variance of the subgroup means
- subgrp_means = matrix(colMeans(X), rows=C, cols=Hin*Win)
- subgrp_vars = matrix(colVars(X) * ((N-1)/N), rows=C, cols=Hin*Win) # uncorrected variances
- mean = rowMeans(subgrp_means) # shape (C, 1)
- var = rowMeans(subgrp_vars) + rowVars(subgrp_means)*(((Hin*Win)-1)/(Hin*Win)) # shape (C, 1)
- # Update moving averages
- ema_mean_upd = mu*ema_mean + (1-mu)*mean
- ema_var_upd = mu*ema_var + (1-mu)*var
- }
- else {
- # Use moving averages of mean and variance during testing
- mean = ema_mean
- var = ema_var
- ema_mean_upd = ema_mean
- ema_var_upd = ema_var
- }
-
- # Normalize, shift, and scale
- # norm = (X-mean)*(var+epsilon)^(-1/2)
- # = (X-mean) / sqrt(var+epsilon)
- centered = bias_add(X, -mean) # shape (N, C*Hin*Win)
- norm = bias_multiply(centered, 1/sqrt(var+epsilon)) # shape (N, C*Hin*Win)
- # out = norm*gamma + beta
- scaled = bias_multiply(norm, gamma) # shape (N, C*Hin*Win)
- out = bias_add(scaled, beta) # shape (N, C*Hin*Win)
-
- # Save variable for backward pass
- cache_mean = mean
- cache_var = var
- cache_norm = norm
-}
-
-backward = function(matrix[double] dout, matrix[double] out,
- matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
- matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
- matrix[double] X, matrix[double] gamma, matrix[double] beta,
- int C, int Hin, int Win, string mode,
- matrix[double] ema_mean, matrix[double] ema_var,
- double mu, double epsilon)
- return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
- /*
- * Computes the backward pass for a 2D (spatial) batch normalization
- * layer.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
- * - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
- * - ema_mean_upd: Updated exponential moving average of the mean
- * from the forward pass, of shape (C, 1).
- * - ema_var_upd: Updated exponential moving average of the variance
- * from the forward pass, of shape (C, 1).
- * - cache_mean: Cache of the batch mean from the forward pass, of
- * shape (C, 1). Note: This is used for performance during
- * training.
- * - cache_var: Cache of the batch variance from the forward pass,
- * of shape (C, 1). Note: This is used for performance during
- * training.
- * - cache_norm: Cache of the normalized inputs from the forward
- * pass, of shape (C, N*Hin*Win). Note: This is used for
- * performance during training.
- * - X: Input data matrix to the forward pass, of
- * shape (N, C*Hin*Win).
- * - gamma: Scale parameters, of shape (C, 1).
- * - beta: Shift parameters, of shape (C, 1).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - mode: 'train' or 'test' to indicate if the model is currently
- * being trained or tested. During training, the current batch
- * mean and variance will be used to normalize the inputs, while
- * during testing, the exponential average of the mean and
- * variance over all previous batches will be used.
- * - ema_mean: Exponential moving average of the mean, of
- * shape (C, 1).
- * - ema_var: Exponential moving average of the variance, of
- * shape (C, 1).
- * - mu: Momentum value for moving averages.
- * Typical values are in the range of [0.9, 0.999].
- * - epsilon: Smoothing term to avoid divide by zero errors.
- * Typical values are in the range of [1e-5, 1e-3].
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
- * - dgamma: Gradient wrt `gamma`, of shape (C, 1).
- * - dbeta: Gradient wrt `beta`, of shape (C, 1).
- *
- */
- N = nrow(X)
- mean = cache_mean
- var = cache_var
- norm = cache_norm
- centered = bias_add(X, -mean) # shape (N, C*Hin*Win)
-
- if (mode == 'train') {
- # Compute gradients during training
- dgamma = util::channel_sums(dout*norm, C, Hin, Win) # shape (C, 1)
- dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1)
- dnorm = bias_multiply(dout, gamma) # shape (N, C*Hin*Win)
- dvar = util::channel_sums((-1/2) * bias_multiply(centered, (var+epsilon)^(-3/2)) * dnorm,
- C, Hin, Win) # shape (C, 1)
- dmean_norm_branch = util::channel_sums(bias_multiply(dnorm, -1/sqrt(var+epsilon)), C, Hin, Win)
- dmean_var_branch = util::channel_sums((-2/(N*Hin*Win)) * centered, C, Hin, Win)
- dmean_var_branch = dmean_var_branch * dvar # we can't use a function within an expression yet
- dmean = dmean_norm_branch + dmean_var_branch # shape (C, 1)
- dX_norm_branch = bias_multiply(dnorm, 1/sqrt(var+epsilon))
- dX_mean_branch = (1/(N*Hin*Win)) * bias_add(matrix(0, rows=1, cols=C*Hin*Win), dmean)
- dX_var_branch = (2/(N*Hin*Win)) * bias_multiply(centered, dvar)
- dX = dX_norm_branch + dX_mean_branch + dX_var_branch # shape (N, C*Hin*Win)
- }
- else {
- # Compute gradients during testing
- dgamma = util::channel_sums(dout*norm, C, Hin, Win) # shape (C, 1)
- dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1)
- dnorm = bias_multiply(dout, gamma) # shape (N, C*Hin*Win)
- dX = bias_multiply(dnorm, 1/sqrt(var+epsilon)) # shape (N, C*Hin*Win)
- }
-}
-
-init = function(int C)
- return (matrix[double] gamma, matrix[double] beta,
- matrix[double] ema_mean, matrix[double] ema_var) {
- /*
- * Initialize the parameters of this layer.
- *
- * Note: This is just a convenience function, and parameters
- * may be initialized manually if needed.
- *
- * Inputs:
- * - C: Number of input channels (dimensionality of input depth).
- *
- * Outputs:
- * - gamma: Scale parameters, of shape (C, 1).
- * - beta: Shift parameters, of shape (C, 1).
- * - ema_mean: Exponential moving average of the mean, of
- * shape (C, 1).
- * - ema_var: Exponential moving average of the variance, of
- * shape (C, 1).
- */
- gamma = matrix(1, rows=C, cols=1)
- beta = matrix(0, rows=C, cols=1)
- ema_mean = matrix(0, rows=C, cols=1)
- ema_var = matrix(1, rows=C, cols=1)
-}
-
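Because DML lacks tensors, the layer above assembles per-channel statistics piece-wise: the channel mean is the mean of per-pixel means, and the channel variance is the mean of per-pixel variances plus the variance of per-pixel means. A NumPy check of that decomposition (NumPy's `var` is already uncorrected, so the `(N-1)/N` and `(HW-1)/HW` factors from the DML are not needed):

```python
import numpy as np

N, C, Hin, Win = 8, 3, 5, 5
X = np.random.default_rng(0).standard_normal((N, C * Hin * Win))

subgrp_means = X.mean(axis=0).reshape(C, Hin * Win)
subgrp_vars = X.var(axis=0).reshape(C, Hin * Win)
mean = subgrp_means.mean(axis=1)
var = subgrp_vars.mean(axis=1) + subgrp_means.var(axis=1)

# Direct computation over each channel's N*Hin*Win values agrees:
Xc = X.reshape(N, C, Hin * Win).transpose(1, 0, 2).reshape(C, -1)
assert np.allclose(mean, Xc.mean(axis=1))
assert np.allclose(var, Xc.var(axis=1))
```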
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv2d.dml b/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
deleted file mode 100644
index 9d03568..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
+++ /dev/null
@@ -1,194 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Convolutional layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
- int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] out, int Hout, int Wout) {
- /*
- * Computes the forward pass for a 2D spatial convolutional layer with
- * F filters. The input data has N examples, each represented as a 3D
- * volume unrolled into a single vector.
- *
- * This implementation uses `im2col` internally for each image to
- * extract local image regions (patches) into columns, and then
- * performs a matrix multiplication with the filters to compute the
- * output maps.
- *
- * Inputs:
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - W: Weights, of shape (F, C*Hf*Wf).
- * - b: Biases, of shape (F, 1).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * For same output height as input, set `padh = (Hf - 1) / 2`,
- * assuming `strideh = 1`.
- * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
- * preserves the spatial dimensions of the input.
- * - padw: Padding for left and right sides.
- * For same output width as input, set `padw = (Wf - 1) / 2`,
- * assuming `stridew = 1`.
- * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
- * preserves the spatial dimensions of the input.
- *
- * Outputs:
- * - out: Outputs, of shape (N, F*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- */
- N = nrow(X)
- F = nrow(W)
- Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
- Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
- # Create output volume
- out = matrix(0, rows=N, cols=F*Hout*Wout)
-
- # Convolution - im2col implementation
- parfor (n in 1:N) { # all examples
- Xn = matrix(X[n,], rows=C, cols=Hin*Win) # reshape
-
- # Pad image
- Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0) # shape (C, (Hin+2*padh)*(Win+2*padw))
-
- # Extract local image patches into columns with im2col, of shape (C*Hf*Wf, Hout*Wout)
- Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
-
- # Convolve patches with filters
- outn = W %*% Xn_padded_cols + b # shape (F, Hout*Wout)
- out[n,] = matrix(outn, rows=1, cols=F*Hout*Wout) # reshape
- }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout,
- matrix[double] X, matrix[double] W, matrix[double] b,
- int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
- /*
- * Computes the backward pass for a 2D spatial convolutional layer
- * with F filters.
- *
- * This implementation uses `im2col` and `col2im` internally.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of
- * shape (N, F*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - W: Weights, of shape (F, C*Hf*Wf).
- * - b: Biases, of shape (F, 1).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * - padw: Padding for left and right sides.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
- * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
- * - db: Gradient wrt `b`, of shape (F, 1).
- */
- N = nrow(X)
- F = nrow(W)
-
- # Create gradient volumes
- # Note: Create convenience gradient volumes for dW and db that
- # store one gradient per example, enabling parallel computation
- # at the expense of memory. We reduce them to single gradients
- # at the end.
- dX = matrix(0, rows=N, cols=C*Hin*Win)
- dWN = matrix(0, rows=N, cols=F*C*Hf*Wf) # dW = matrix(0, rows=F, cols=C*Hf*Wf)
- dbN = matrix(0, rows=N, cols=F) # db = matrix(0, rows=F, cols=1)
-
- # Partial derivatives for convolution - im2col implementation
- parfor (n in 1:N) { # all examples
- doutn = matrix(dout[n,], rows=F, cols=Hout*Wout)
-
- # Compute dW
- Xn = matrix(X[n,], rows=C, cols=Hin*Win) # reshape
- Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0) # shape (C, (Hin+2*padh)*(Win+2*padw))
- Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
- # dW = dW + doutn %*% t(Xn_padded_cols)
- dWN[n,] = matrix(doutn %*% t(Xn_padded_cols), rows=1, cols=F*C*Hf*Wf)
-
- # Compute db
- # db = db + rowSums(doutn)
- dbN[n,] = matrix(rowSums(doutn), rows=1, cols=F)
-
- # Compute dX
- dXn_padded_cols = t(W) %*% doutn # shape (C*Hf*Wf, Hout*Wout)
- dXn_padded = util::col2im(dXn_padded_cols, C, Hin+2*padh, Win+2*padw, Hf, Wf,
- strideh, stridew, "add")
- dXn = util::unpad_image(dXn_padded, Hin, Win, padh, padw)
- dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win) # reshape
- }
-
- # Reduce convenience gradient volumes with one gradient per example
- # into single gradients for W and b.
- dW = matrix(colSums(dWN), rows=F, cols=C*Hf*Wf)
- db = matrix(colSums(dbN), rows=F, cols=1)
-}
-
-init = function(int F, int C, int Hf, int Wf)
- return (matrix[double] W, matrix[double] b) {
- /*
- * Initialize the parameters of this layer.
- *
- * Note: This is just a convenience function, and parameters
- * may be initialized manually if needed.
- *
- * We use the heuristic by He et al., which limits the magnification
- * of inputs/gradients during forward/backward passes by scaling
- * unit-Gaussian weights by a factor of sqrt(2/n), under the
- * assumption of relu neurons.
- * - http://arxiv.org/abs/1502.01852
- *
- * Inputs:
- * - F: Number of filters.
- * - C: Number of input channels (dimensionality of depth).
- * - Hf: Filter height.
- * - Wf: Filter width.
- *
- * Outputs:
- * - W: Weights, of shape (F, C*Hf*Wf).
- * - b: Biases, of shape (F, 1).
- */
- W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
- b = matrix(0, rows=F, cols=1)
-}
-
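For concreteness, the output-size formula above is easy to check with a minimal usage sketch (the sizes below are illustrative and not part of this commit): a 3x3 filter at stride 1 with padding 1 preserves a 28x28 input, since Hout = floor((28 + 2*1 - 3)/1 + 1) = 28.

```
source("nn/layers/conv2d.dml") as conv2d

N = 4    # examples
C = 1    # input channels
Hin = 28
Win = 28
F = 8    # filters
Hf = 3
Wf = 3
X = rand(rows=N, cols=C*Hin*Win)
[W, b] = conv2d::init(F, C, Hf, Wf)
[out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf,
                                    1, 1, 1, 1)  # stride 1, pad 1
print("Hout = " + Hout + ", Wout = " + Wout)  # 28, 28
```
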
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
deleted file mode 100644
index bda7a9c..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
+++ /dev/null
@@ -1,160 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Convolutional layer.
- *
- * This implementation uses a built-in operator for higher performance.
- */
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
- int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] out, int Hout, int Wout) {
- /*
- * Computes the forward pass for a 2D spatial convolutional layer with
- * F filters. The input data has N examples, each represented as a 3D
- * volume unrolled into a single vector.
- *
- * This implementation uses a built-in operator for higher
- * performance.
- *
- * Inputs:
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - W: Weights, of shape (F, C*Hf*Wf).
- * - b: Biases, of shape (F, 1).
- * - C: Number of input channels (dimensionality of depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * For same output height as input, set `padh = (Hf - 1) / 2`,
- * assuming `strideh = 1`.
- * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
- * preserves the spatial dimensions of the input.
- * - padw: Padding for left and right sides.
- * For same output width as input, set `padw = (Wf - 1) / 2`,
- * assuming `stridew = 1`.
- * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
- * preserves the spatial dimensions of the input.
- *
- * Outputs:
- * - out: Outputs, of shape (N, F*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- */
- N = nrow(X)
- F = nrow(W)
- Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
- Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
- # Convolution - built-in implementation
- out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
- stride=[strideh,stridew], padding=[padh,padw])
-
- # Add bias term to each output filter
- out = bias_add(out, b)
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout,
- matrix[double] X, matrix[double] W, matrix[double] b,
- int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
- /*
- * Computes the backward pass for a 2D spatial convolutional layer
- * with F filters.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of
- * shape (N, F*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - W: Weights, of shape (F, C*Hf*Wf).
- * - b: Biases, of shape (F, 1).
- * - C: Number of input channels (dimensionality of depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * For same output height as input, set `padh = (Hf - 1) / 2`,
- * assuming `strideh = 1`.
- * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
- * preserves the spatial dimensions of the input.
- * - padw: Padding for left and right sides.
- * For same output width as input, set `padw = (Wf - 1) / 2`,
- * assuming `stridew = 1`.
- * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
- * preserves the spatial dimensions of the input.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
- * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
- * - db: Gradient wrt `b`, of shape (F, 1).
- */
- N = nrow(X)
- F = nrow(W)
-
- # Partial derivatives for convolution - built-in implementation
- dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw],
- input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
- dX = conv2d_backward_data(W, dout, stride=[strideh, stridew], padding=[padh,padw],
- input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
-
- # Partial derivatives for bias vector
- db = rowSums(matrix(colSums(dout), rows=F, cols=Hout*Wout))
-}
-
-init = function(int F, int C, int Hf, int Wf)
- return (matrix[double] W, matrix[double] b) {
- /*
- * Initialize the parameters of this layer.
- *
- * Note: This is just a convenience function, and parameters
- * may be initialized manually if needed.
- *
- * We use the heuristic by He et al., which limits the magnification
- * of inputs/gradients during forward/backward passes by scaling
- * unit-Gaussian weights by a factor of sqrt(2/n), under the
- * assumption of relu neurons.
- * - http://arxiv.org/abs/1502.01852
- *
- * Inputs:
- * - F: Number of filters.
- * - C: Number of input channels (dimensionality of depth).
- * - Hf: Filter height.
- * - Wf: Filter width.
- *
- * Outputs:
- * - W: Weights, of shape (F, C*Hf*Wf).
- * - b: Biases, of shape (F, 1).
- */
- W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
- b = matrix(0, rows=F, cols=1)
-}
-
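A quick worked check of the "same" padding rule documented above (illustrative values, not from this commit): for a 5x5 filter at stride 1, padh = (5 - 1)/2 = 2, so Hout = floor((Hin + 2*2 - 5)/1 + 1) = Hin for any input height.

```
# "same" padding for an odd filter size at stride 1 (illustrative sketch)
Hf = 5
Wf = 5
padh = as.integer((Hf - 1) / 2)  # = 2
padw = as.integer((Wf - 1) / 2)  # = 2
```
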
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml b/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
deleted file mode 100644
index 63db502..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
+++ /dev/null
@@ -1,78 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Cross-Entropy loss function.
- */
-
-forward = function(matrix[double] pred, matrix[double] y)
- return (double loss) {
- /*
- * Computes the forward pass for a cross-entropy loss function. The
- * inputs consist of N examples, each with K dimensions corresponding
- * to normalized probabilities of K classes.
- *
- * ```
- * L_i = -y_i^T * log(pred_i)
- * L = (1/N) sum(L_i) for i=1 to N
- * ```
- *
- * In these equations, `L` is the total loss, `L_i` is the loss for
- * example `i`, `y_i` is the K-dimensional vector of target class
- * probabilities, `pred_i` is the K-dimensional vector of predicted
- * class probabilities, and `N` is the number of examples.
- *
- * This can be interpreted as the negative log-likelihood assuming
- * a Bernoulli distribution generalized to K dimensions, or a
- * Multinomial distribution with one observation.
- *
- * Inputs:
- * - pred: Predictions, of shape (N, K).
- * - y: Targets, of shape (N, K).
- *
- * Outputs:
- * - loss: Average loss.
- */
- N = nrow(y)
- eps = 1e-10 # numerical stability to avoid log(0)
- losses = rowSums(-y * log(pred+eps))
- loss = sum(losses) / N
-}
-
-backward = function(matrix[double] pred, matrix[double] y)
- return (matrix[double] dpred) {
- /*
- * Computes the backward pass of a cross-entropy loss function. The
- * inputs consist of N examples, each with K dimensions corresponding
- * to normalized probabilities of K classes.
- *
- * Inputs:
- * - pred: Predictions, of shape (N, K).
- * - y: Targets, of shape (N, K).
- *
- * Outputs:
- * - dpred: Gradient wrt `pred`, of shape (N, K).
- */
- N = nrow(y)
- eps = 1e-10 # numerical stability to avoid divide-by-zero
- dpred = (1/N) * -y * (1/(pred+eps))
-}
-
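To make the loss formula concrete, a small worked example (values are illustrative): with one-hot targets, the loss reduces to the mean negative log-probability assigned to the true classes.

```
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss

pred = matrix("0.7 0.2 0.1 0.1 0.8 0.1", rows=2, cols=3)  # probabilities
y    = matrix("1 0 0 0 1 0", rows=2, cols=3)              # one-hot targets
loss = cross_entropy_loss::forward(pred, y)
# loss = -(log(0.7) + log(0.8)) / 2 ~= 0.290
```
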
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/dropout.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/dropout.dml b/scripts/staging/SystemML-NN/nn/layers/dropout.dml
deleted file mode 100644
index a36878b..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/dropout.dml
+++ /dev/null
@@ -1,76 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Dropout layer.
- */
-
-forward = function(matrix[double] X, double p, int seed)
- return (matrix[double] out, matrix[double] mask) {
- /*
- * Computes the forward pass for an inverted dropout layer.
- *
- * Randomly drops the inputs element-wise with probability 1-p
- * (i.e. keeps each input with probability p), and divides the kept
- * values by p so that the expected values of those inputs (which
- * are the outputs of neurons) remain unchanged at test time.
- *
- * Inputs:
- * - X: Inputs, of shape (any, any).
- * - p: Probability of keeping a neuron output.
- * - seed: [Optional: -1] Random number generator seed to allow for
- * deterministic evaluation. Set to -1 for a random seed.
- *
- * Outputs:
- * - out: Outputs, of same shape as `X`.
- * - mask: Dropout mask used to compute the output.
- */
- # Normally, we might use something like
- # `mask = rand(rows=nrow(X), cols=ncol(X), min=0, max=1, seed=seed) <= p`
- # to create a dropout mask. Fortunately, SystemML has a `sparsity` parameter
- # on the `rand` function that allows us to create a mask directly.
- if (seed == -1) {
- mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p)
- } else {
- mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p, seed=seed)
- }
- out = X * mask / p
-}
-
-backward = function(matrix[double] dout, matrix[double] X, double p, matrix[double] mask)
- return (matrix[double] dX) {
- /*
- * Computes the backward pass for an inverted dropout layer.
- *
- * Applies the mask to the upstream gradient and divides by p,
- * mirroring the inverted scaling applied in the forward pass.
- *
- * Inputs:
- * - dout: Gradient wrt `out`, of same shape as `X`.
- * - X: Inputs, of shape (any, any).
- * - p: Probability of keeping a neuron output.
- * - mask: Dropout mask used to compute the output.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of same shape as `X`.
- */
- dX = mask / p * dout
-}
-
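A hypothetical usage sketch (shapes are illustrative): the mask returned by `forward` must be passed unchanged to `backward`, and at test time no dropout call is made at all, since the inverted scaling already preserves expected values.

```
source("nn/layers/dropout.dml") as dropout

X = rand(rows=4, cols=8)
p = 0.5                                   # keep probability
[out, mask] = dropout::forward(X, p, -1)  # training: random mask
dout = rand(rows=4, cols=8)               # upstream gradient (illustrative)
dX = dropout::backward(dout, X, p, mask)  # reuse the same mask
# test time: use X directly, with no dropout call
```
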
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
deleted file mode 100644
index b74566d..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
+++ /dev/null
@@ -1,72 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * L1 loss function.
- */
-
-forward = function(matrix[double] pred, matrix[double] y)
- return (double loss) {
- /*
- * Computes the forward pass for an L1 loss function. The inputs
- * consist of N examples, each with M dimensions to predict.
- *
- * ```
- * L_i = sum_j(abs((pred_i)_j - (y_i)_j)) for all j.
- * L = (1/N) sum(L_i) for i=1 to N
- * ```
- *
- * In these equations, `L` is the total loss, `L_i` is the loss for
- * example `i`, `y_i` is the vector of M targets for example `i`,
- * `pred_i` is the corresponding vector of M predictions, and `N`
- * is the number of examples.
- *
- * This can be interpreted as the negative log-likelihood assuming
- * a Laplace distribution.
- *
- * Inputs:
- * - pred: Predictions, of shape (N, M).
- * - y: Targets, of shape (N, M).
- *
- * Outputs:
- * - loss: Average loss.
- */
- N = nrow(y)
- losses = rowSums(abs(pred-y))
- loss = sum(losses) / N
-}
-
-backward = function(matrix[double] pred, matrix[double] y)
- return (matrix[double] dpred) {
- /*
- * Computes the backward pass for an L1 loss function. The inputs
- * consist of N examples, each with M dimensions to predict.
- *
- * Inputs:
- * - pred: Predictions, of shape (N, M).
- * - y: Targets, of shape (N, M).
- *
- * Outputs:
- * - dpred: Gradient wrt `pred`, of shape (N, M).
- */
- N = nrow(y)
- dpred = sign(pred-y) / N
-}
-
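A tiny worked example of the gradient (illustrative numbers): for one example with predictions (2, -1) and targets (0, 0), the loss is (|2| + |-1|)/1 = 3 and the gradient is sign(pred - y)/N = (1, -1).

```
source("nn/layers/l1_loss.dml") as l1_loss

pred = matrix("2 -1", rows=1, cols=2)
y    = matrix("0 0", rows=1, cols=2)
loss  = l1_loss::forward(pred, y)    # (2 + 1) / 1 = 3
dpred = l1_loss::backward(pred, y)   # [1, -1]
```
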
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
deleted file mode 100644
index 2b81c0b..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
+++ /dev/null
@@ -1,56 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * L1 regularization.
- */
-
-forward = function(matrix[double] X, double lambda)
- return (double reg_loss) {
- /*
- * Computes the forward pass for an L1 regularization function.
- *
- * Inputs:
- * - X: Inputs, of shape (any, any).
- * - lambda: Regularization strength.
- * A typical value is 0.01.
- *
- * Outputs:
- * - reg_loss: Total regularization loss.
- */
- reg_loss = lambda * sum(abs(X))
-}
-
-backward = function(matrix[double] X, double lambda)
- return (matrix[double] dX) {
- /*
- * Computes the backward pass for an L1 regularization function.
- *
- * Inputs:
- * - X: Inputs, of shape (any, any).
- * - lambda: Regularization strength.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of same shape as `X`.
- */
- dX = lambda * sign(X)
-}
-
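Regularization terms are meant to be combined with a data loss rather than used alone; a hypothetical sketch of that wiring (the `data_loss` and `dW_data` names are placeholders, not from this commit):

```
source("nn/layers/l1_reg.dml") as l1_reg

lambda = 0.01
W = rand(rows=10, cols=5)
data_loss = 1.0                  # placeholder data loss value
dW_data = rand(rows=10, cols=5)  # placeholder data gradient wrt W
total_loss = data_loss + l1_reg::forward(W, lambda)
dW = dW_data + l1_reg::backward(W, lambda)
```
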
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
deleted file mode 100644
index 0482f25..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
+++ /dev/null
@@ -1,72 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * L2 loss function.
- */
-
-forward = function(matrix[double] pred, matrix[double] y)
- return (double loss) {
- /*
- * Computes the forward pass for an L2 loss function. The inputs
- * consist of N examples, each with M dimensions to predict.
- *
- * ```
- * L_i = (1/2) norm(pred_i - y_i)^2
- * L = (1/N) sum(L_i) for i=1 to N
- * ```
- *
- * In these equations, `L` is the total loss, `L_i` is the loss for
- * example `i`, `y_i` is the vector of M targets for example `i`,
- * `pred_i` is the corresponding vector of M predictions, and `N`
- * is the number of examples.
- *
- * This can be interpreted as the negative log-likelihood assuming
- * a Gaussian distribution.
- *
- * Inputs:
- * - pred: Predictions, of shape (N, M).
- * - y: Targets, of shape (N, M).
- *
- * Outputs:
- * - loss: Average loss.
- */
- N = nrow(y)
- losses = 0.5 * rowSums((pred-y)^2)
- loss = sum(losses) / N
-}
-
-backward = function(matrix[double] pred, matrix[double] y)
- return (matrix[double] dpred) {
- /*
- * Computes the backward pass for an L2 loss function. The inputs
- * consist of N examples, each with M dimensions to predict.
- *
- * Inputs:
- * - pred: Predictions, of shape (N, M).
- * - y: Targets, of shape (N, M).
- *
- * Outputs:
- * - dpred: Gradient wrt `pred`, of shape (N, M).
- */
- N = nrow(y)
- dpred = (pred-y) / N
-}
-
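A small worked example (illustrative numbers): for one example with predictions (1, 3) and targets (0, 1), L = 0.5*(1^2 + 2^2)/1 = 2.5 and the gradient is (pred - y)/N = (1, 2); the 1/2 factor exists precisely so that the gradient carries no leading constant.

```
source("nn/layers/l2_loss.dml") as l2_loss

pred = matrix("1 3", rows=1, cols=2)
y    = matrix("0 1", rows=1, cols=2)
loss  = l2_loss::forward(pred, y)    # 0.5*(1 + 4) / 1 = 2.5
dpred = l2_loss::backward(pred, y)   # [1, 2]
```
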
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
deleted file mode 100644
index 7255efe..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
+++ /dev/null
@@ -1,56 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * L2 regularization.
- */
-
-forward = function(matrix[double] X, double lambda)
- return (double reg_loss) {
- /*
- * Computes the forward pass for an L2 regularization function.
- *
- * Inputs:
- * - X: Inputs, of shape (any, any).
- * - lambda: Regularization strength.
- * A typical value is 0.01.
- *
- * Outputs:
- * - reg_loss: Total regularization loss.
- */
- reg_loss = 0.5 * lambda * sum(X^2)
-}
-
-backward = function(matrix[double] X, double lambda)
- return (matrix[double] dX) {
- /*
- * Computes the backward pass for an L2 regularization function.
- *
- * Inputs:
- * - X: Inputs, of shape (any, any).
- * - lambda: Regularization strength.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of same shape as `X`.
- */
- dX = lambda * X
-}
-
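Because of the 0.5 factor, the gradient is exactly lambda*X, so under plain SGD this term acts as weight decay; a hypothetical sketch (the `dW_data` name and learning rate are illustrative):

```
source("nn/layers/l2_reg.dml") as l2_reg

lambda = 0.01
lr = 0.1
W = rand(rows=10, cols=5)
dW_data = rand(rows=10, cols=5)  # placeholder data gradient wrt W
dW = dW_data + l2_reg::backward(W, lambda)
W = W - lr*dW  # the reg term alone shrinks W by a factor (1 - lr*lambda)
```
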
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml b/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
deleted file mode 100644
index 15914f7..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
+++ /dev/null
@@ -1,76 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Log loss function.
- */
-
-forward = function(matrix[double] pred, matrix[double] y)
- return (double loss) {
- /*
- * Computes the forward pass for a log loss function.
- *
- * ```
- * L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i)
- * L = (1/N) sum(L_i) for i=1 to N
- * ```
- *
- * In these equations, `L` is the total loss, `L_i` is the loss for
- * example `i`, `y_i` is the binary target, `pred_i` is the predicted
- * probability of the true class (i.e. `y=1`), and `N` is the number
- * of examples.
- *
- * This can be interpreted as the negative log-likelihood assuming
- * a Bernoulli distribution.
- *
- * Inputs:
- * - pred: Predictions, of shape (N, 1).
- * Predictions should be probabilities of the true
- * class (i.e. probability of `y=1`).
- * - y: Targets, of shape (N, 1).
- * Targets should be binary in the set {0, 1}.
- *
- * Outputs:
- * - loss: Average loss.
- */
- N = nrow(y)
- losses = -y*log(pred) - (1-y)*log(1-pred)
- loss = sum(losses) / N
-}
-
-backward = function(matrix[double] pred, matrix[double] y)
- return (matrix[double] dpred) {
- /*
- * Computes the backward pass for a log loss function.
- *
- * Inputs:
- * - pred: Predictions, of shape (N, 1).
- * Predictions should be probabilities of the true
- * class (i.e. probability of `y=1`).
- * - y: Targets, of shape (N, 1).
- * Targets should be binary in the set {0, 1}.
- *
- * Outputs:
- * - dpred: Gradient wrt `pred`, of shape (N, 1).
- */
- N = nrow(y)
- dpred = (1/N) * (pred-y) / (pred*(1-pred))
-}
-
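Note that, unlike `cross_entropy_loss` above, this forward pass has no epsilon term, so a prediction of exactly 0 or 1 yields an infinite loss. One defensive option (a caller-side suggestion, not part of this commit) is to clip the probabilities first:

```
source("nn/layers/log_loss.dml") as log_loss

pred = matrix("0.9 0.1 1.0", rows=3, cols=1)
y    = matrix("1 0 1", rows=3, cols=1)
eps = 1e-10
pred_safe = min(max(pred, eps), 1-eps)  # keep probabilities in (0, 1)
loss = log_loss::forward(pred_safe, y)
```
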
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/lstm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/lstm.dml b/scripts/staging/SystemML-NN/nn/layers/lstm.dml
deleted file mode 100644
index a75add4..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/lstm.dml
+++ /dev/null
@@ -1,260 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * LSTM layer.
- */
-source("nn/layers/sigmoid.dml") as sigmoid
-source("nn/layers/tanh.dml") as tanh
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
- boolean return_sequences, matrix[double] out0, matrix[double] c0)
- return (matrix[double] out, matrix[double] c,
- matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog) {
- /*
- * Computes the forward pass for an LSTM layer with M neurons.
- * The input data has N sequences of T examples, each with D features.
- *
- * In an LSTM, an internal cell state is maintained, additive
- * interactions operate over the cell state at each timestep, and
- * some amount of this cell state is exposed as output at each
- * timestep. Additionally, the output of the previous timestep is fed
- * back in as an additional input at the current timestep.
- *
- * Reference:
- * - Hochreiter & Schmidhuber, "Long Short-Term Memory", 1997
- * - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
- *
- * Inputs:
- * - X: Inputs, of shape (N, T*D).
- * - W: Weights, of shape (D+M, 4M).
- * - b: Biases, of shape (1, 4M).
- * - T: Length of example sequences (number of timesteps).
- * - D: Dimensionality of the input features (number of features).
- * - return_sequences: Whether to return `out` at all timesteps,
- * or just for the final timestep.
- * - out0: Outputs from previous timestep, of shape (N, M).
- * Note: This is *optional* and could just be an empty matrix.
- * - c0: Initial cell state, of shape (N, M).
- * Note: This is *optional* and could just be an empty matrix.
- *
- * Outputs:
- * - out: If `return_sequences` is True, outputs for all timesteps,
- * of shape (N, T*M). Else, outputs for the final timestep, of
- * shape (N, M).
- * - c: Cell state for final timestep, of shape (N, M).
- * - cache_out: Cache of outputs, of shape (T, N*M).
- * Note: This is used for performance during training.
- * - cache_c: Cache of cell state, of shape (T, N*M).
- * Note: This is used for performance during training.
- * - cache_ifog: Cache of intermediate values, of shape (T, N*4M).
- * Note: This is used for performance during training.
- */
- N = nrow(X)
- M = as.integer(ncol(W)/4)
- out_prev = out0
- c_prev = c0
- c = c_prev
- if (return_sequences) {
- out = matrix(0, rows=N, cols=T*M)
- }
- else {
- out = matrix(0, rows=N, cols=M)
- }
- # caches to be used during the backward pass for performance
- cache_out = matrix(0, rows=T, cols=N*M)
- cache_c = matrix(0, rows=T, cols=N*M)
- cache_ifog = matrix(0, rows=T, cols=N*4*M)
-
- for (t in 1:T) { # each timestep
- X_t = X[,(t-1)*D+1:t*D] # shape (N, D)
- input = cbind(X_t, out_prev) # shape (N, D+M)
- ifog = input %*% W + b # input, forget, output, and g gates; shape (N, 4M)
- tmp = sigmoid::forward(ifog[,1:3*M]) # i,f,o gates squashed with sigmoid
- ifog[,1:3*M] = tmp
- tmp = tanh::forward(ifog[,3*M+1:4*M]) # g gate squashed with tanh
- ifog[,3*M+1:4*M] = tmp
- # c_t = f*prev_c + i*g
- c = ifog[,M+1:2*M]*c_prev + ifog[,1:M]*ifog[,3*M+1:4*M] # shape (N, M)
- # out_t = o*tanh(c)
- tmp = tanh::forward(c)
- out_t = ifog[,2*M+1:3*M] * tmp # shape (N, M)
-
- # store
- if (return_sequences) {
- out[,(t-1)*M+1:t*M] = out_t
- }
- else {
- out = out_t
- }
- out_prev = out_t
- c_prev = c
- cache_out[t,] = matrix(out_t, rows=1, cols=N*M) # reshape
- cache_c[t,] = matrix(c, rows=1, cols=N*M) # reshape
- cache_ifog[t,] = matrix(ifog, rows=1, cols=N*4*M) # reshape
- }
-}
-
-backward = function(matrix[double] dout, matrix[double] dc,
- matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
- boolean given_sequences, matrix[double] out0, matrix[double] c0,
- matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog)
- return (matrix[double] dX, matrix[double] dW, matrix[double] db,
- matrix[double] dout0, matrix[double] dc0) {
- /*
- * Computes the backward pass for an LSTM layer with M neurons.
- *
- * Inputs:
- * - dout: Gradient wrt `out`. If `given_sequences` is `True`,
- * contains gradients on outputs for all timesteps, of
- * shape (N, T*M). Else, contains the gradient on the output
- * for the final timestep, of shape (N, M).
- * - dc: Gradient wrt `c` (from later in time), of shape (N, M).
- * This would come from later in time if the cell state was used
- * downstream as the initial cell state for another LSTM layer.
- * Typically, this would be used when a sequence was cut at
- * timestep `T` and then continued in the next batch. If `c`
- * was not used downstream, then `dc` would be an empty matrix.
- * - X: Inputs, of shape (N, T*D).
- * - W: Weights, of shape (D+M, 4M).
- * - b: Biases, of shape (1, 4M).
- * - T: Length of example sequences (number of timesteps).
- * - D: Dimensionality of the input features.
- * - given_sequences: Whether `dout` is for all timesteps,
- * or just for the final timestep. This is based on whether
- * `return_sequences` was true in the forward pass.
- * - out0: Outputs from previous timestep, of shape (N, M).
- * Note: This is *optional* and could just be an empty matrix.
- * - c0: Initial cell state, of shape (N, M).
- * Note: This is *optional* and could just be an empty matrix.
- * - cache_out: Cache of outputs, of shape (T, N*M).
- * Note: This is used for performance during training.
- * - cache_c: Cache of cell state, of shape (T, N*M).
- * Note: This is used for performance during training.
- * - cache_ifog: Cache of intermediate values, of shape (T, N*4*M).
- * Note: This is used for performance during training.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, T*D).
- * - dW: Gradient wrt `W`, of shape (D+M, 4M).
- * - db: Gradient wrt `b`, of shape (1, 4M).
- * - dout0: Gradient wrt `out0`, of shape (N, M).
- * - dc0: Gradient wrt `c0`, of shape (N, M).
- */
- N = nrow(X)
- M = as.integer(ncol(W)/4)
- dX = matrix(0, rows=N, cols=T*D)
- dW = matrix(0, rows=D+M, cols=4*M)
- db = matrix(0, rows=1, cols=4*M)
- dout0 = matrix(0, rows=N, cols=M)
- dc0 = matrix(0, rows=N, cols=M)
- dct = dc
- if (!given_sequences) {
- # only given dout for output at final timestep, so prepend empty douts for all other timesteps
- dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout) # shape (N, T*M)
- }
-
- t = T
- for (iter in 1:T) { # each timestep in reverse order
- X_t = X[,(t-1)*D+1:t*D] # shape (N, D)
- dout_t = dout[,(t-1)*M+1:t*M] # shape (N, M)
- out_t = matrix(cache_out[t,], rows=N, cols=M) # shape (N, M)
- ct = matrix(cache_c[t,], rows=N, cols=M) # shape (N, M)
- if (t == 1) {
- out_prev = out0 # shape (N, M)
- c_prev = c0 # shape (N, M)
- }
- else {
- out_prev = matrix(cache_out[t-1,], rows=N, cols=M) # shape (N, M)
- c_prev = matrix(cache_c[t-1,], rows=N, cols=M) # shape (N, M)
- }
- input = cbind(X_t, out_prev) # shape (N, D+M)
- ifog = matrix(cache_ifog[t,], rows=N, cols=4*M)
- i = ifog[,1:M] # input gate, shape (N, M)
- f = ifog[,M+1:2*M] # forget gate, shape (N, M)
- o = ifog[,2*M+1:3*M] # output gate, shape (N, M)
- g = ifog[,3*M+1:4*M] # g gate, shape (N, M)
-
- tmp = tanh::backward(dout_t, ct)
- dct = dct + o*tmp # shape (N, M)
- tmp = tanh::forward(ct)
- do = tmp * dout_t # output gate, shape (N, M)
- df = c_prev * dct # forget gate, shape (N, M)
- dc_prev = f * dct # shape (N, M)
- di = g * dct # input gate, shape (N, M)
- dg = i * dct # g gate, shape (N, M)
-
- di_raw = i * (1-i) * di
- df_raw = f * (1-f) * df
- do_raw = o * (1-o) * do
- dg_raw = (1-g^2) * dg
- difog_raw = cbind(di_raw, cbind(df_raw, cbind(do_raw, dg_raw))) # shape (N, 4M)
-
- dW = dW + t(input) %*% difog_raw # shape (D+M, 4M)
- db = db + colSums(difog_raw) # shape (1, 4M)
- dinput = difog_raw %*% t(W) # shape (N, D+M)
- dX[,(t-1)*D+1:t*D] = dinput[,1:D]
- dout_prev = dinput[,D+1:D+M] # shape (N, M)
- if (t == 1) {
- dout0 = dout_prev # shape (N, M)
- dc0 = dc_prev # shape (N, M)
- }
- else {
- dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev # shape (N, M)
- dct = dc_prev # shape (N, M)
- }
- t = t - 1
- }
-}
-
-init = function(int N, int D, int M)
- return (matrix[double] W, matrix[double] b, matrix[double] out0, matrix[double] c0) {
- /*
- * Initialize the parameters of this layer.
- *
- * Note: This is just a convenience function, and parameters
- * may be initialized manually if needed.
- *
- * We use the Glorot uniform heuristic, which limits the magnification
- * of inputs/gradients during forward/backward passes by scaling
- * uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
- * - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
- *
- * Inputs:
- * - N: Number of examples in batch.
- * - D: Dimensionality of the input features (number of features).
- * - M: Number of neurons in this layer.
- *
- * Outputs:
- * - W: Weights, of shape (D+M, 4M).
- * - b: Biases, of shape (1, 4M).
- * - out0: Empty previous timestep output matrix, of shape (N, M).
- * - c0: Empty initial cell state matrix, of shape (N, M).
- */
- fan_in = D+M
- fan_out = 4*M
- scale = sqrt(6/(fan_in+fan_out))
- W = rand(rows=D+M, cols=4*M, min=-scale, max=scale, pdf="uniform")
- b = matrix(0, rows=1, cols=4*M)
- out0 = matrix(0, rows=N, cols=M)
- c0 = matrix(0, rows=N, cols=M)
-}
-
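A hypothetical end-to-end sketch of the layer API (sizes are illustrative): `init` also returns empty `out0` and `c0` matrices, which is the simplest way to satisfy the optional initial-state arguments of `forward`.

```
source("nn/layers/lstm.dml") as lstm

N = 4   # examples
T = 5   # timesteps
D = 16  # features per timestep
M = 32  # neurons
X = rand(rows=N, cols=T*D)
[W, b, out0, c0] = lstm::init(N, D, M)
[out, c, cache_out, cache_c, cache_ifog] =
    lstm::forward(X, W, b, T, D, TRUE, out0, c0)  # out: (N, T*M)
```
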
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
deleted file mode 100644
index fba1a4c..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
+++ /dev/null
@@ -1,159 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Max Pooling layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] out, int Hout, int Wout) {
- /*
- * Computes the forward pass for a 2D spatial max pooling layer.
- * The input data has N examples, each represented as a 3D volume
- * unrolled into a single vector.
- *
- * This implementation uses `im2col` internally for each image to
- * extract local image regions (patches) of each channel slice into
- * columns, and then performs max pooling over the patches to compute
- * the output maps.
- *
- * Inputs:
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * A typical value is 0.
- * - padw: Padding for left and right sides.
- * A typical value is 0.
- *
- * Outputs:
- * - out: Outputs, of shape (N, C*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- */
- N = nrow(X)
- Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
- Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
- pad_value = -1/0 # in max pooling we pad with -infinity
-
- # Create output volume
- out = matrix(0, rows=N, cols=C*Hout*Wout)
-
- # Max pooling - im2col implementation
- parfor (n in 1:N) { # all examples
- img = matrix(X[n,], rows=C, cols=Hin*Win) # reshape
-
- if (padh > 0 | padw > 0) {
- # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
- img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
- }
-
- img_maxes = matrix(0, rows=C, cols=Hout*Wout) # zeros
- parfor (c in 1:C) { # all channels
- # Extract local image slice patches into columns with im2col, of shape (Hf*Wf, Hout*Wout)
- img_slice_cols = util::im2col(img[c,], Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
-
- # Max pooling on patches
- img_maxes[c,] = colMaxs(img_slice_cols)
- }
-
- out[n,] = matrix(img_maxes, rows=1, cols=C*Hout*Wout)
- }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
- int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] dX) {
- /*
- * Computes the backward pass for a 2D spatial max pooling layer.
- * The input data has N examples, each represented as a 3D volume
- * unrolled into a single vector.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of
- * shape (N, C*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- * - X: Input data matrix, of shape (N, C*Hin*Win).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * A typical value is 0.
- * - padw: Padding for left and right sides.
- * A typical value is 0.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
- */
- N = nrow(X)
- pad_value = -1/0 # in max pooling we pad with -infinity
-
- # Create gradient volume
- dX = matrix(0, rows=N, cols=C*Hin*Win)
-
- # Gradient of max pooling
- parfor (n in 1:N, check=0) { # all examples
- img = matrix(X[n,], rows=C, cols=Hin*Win)
- if (padh > 0 | padw > 0) {
- # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
- img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
- }
-
- dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
- parfor (c in 1:C, check=0) { # all channels
- img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
- dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
- for (hout in 1:Hout, check=0) { # all output rows
- hin = (hout-1)*strideh + 1
- for (wout in 1:Wout) { # all output columns
- win = (wout-1)*stridew + 1
- img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
- max_val_ind = img_slice_patch == max(img_slice_patch) # max value indicator matrix
- # gradient passes through only for the max value(s) in this patch
- dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
- dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
- + dimg_slice_patch
- }
- }
- dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
- }
-
- if (padh > 0 | padw > 0) {
- # Unpad image gradient
- dimg = util::unpad_image(dimg, Hin, Win, padh, padw) # shape (C, Hin*Win)
- }
- dX[n,] = matrix(dimg, rows=1, cols=C*Hin*Win)
- }
-}
-
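The common 2x2 pooling at stride 2 halves each spatial dimension, since Hout = floor((8 + 0 - 2)/2 + 1) = 4 for an 8x8 input. A minimal usage sketch (sizes are illustrative):

```
source("nn/layers/max_pool2d.dml") as max_pool2d

N = 2
C = 3
Hin = 8
Win = 8
X = rand(rows=N, cols=C*Hin*Win)
[out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
print("Hout = " + Hout + ", Wout = " + Wout)  # 4, 4
```
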
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
deleted file mode 100644
index 880f818..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
+++ /dev/null
@@ -1,103 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Max Pooling layer.
- *
- * This implementation uses a built-in operator for higher performance.
- */
-
-forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] out, int Hout, int Wout) {
- /*
- * Computes the forward pass for a 2D spatial max pooling layer.
- * The input data has N examples, each represented as a 3D volume
- * unrolled into a single vector.
- *
- * This implementation uses a built-in operator for higher
- * performance.
- *
- * Inputs:
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * A typical value is 0.
- * - padw: Padding for left and right sides.
- * A typical value is 0.
- *
- * Outputs:
- * - out: Outputs, of shape (N, C*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- */
- N = nrow(X)
- Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
- Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
- # Max pooling - built-in implementation
- out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
- stride=[strideh,stridew], padding=[padh,padw])
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
- int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] dX) {
- /*
- * Computes the backward pass for a 2D spatial max pooling layer.
- * The input data has N examples, each represented as a 3D volume
- * unrolled into a single vector.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of
- * shape (N, C*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * A typical value is 0.
- * - padw: Padding for left and right sides.
- * A typical value is 0.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
- */
- N = nrow(X)
-
- # Gradient of max pooling
- dX = max_pool_backward(X, dout, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
- stride=[strideh,stridew], padding=[padh,padw])
-}
-
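This module exposes the same signatures as `max_pool2d` above, so the two are interchangeable by pointing the `source` line at the other file. A hypothetical sketch pairing `forward` and `backward` (sizes are illustrative):

```
source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin

N = 2
C = 3
Hin = 8
Win = 8
X = rand(rows=N, cols=C*Hin*Win)
[out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win,
                                                2, 2, 2, 2, 0, 0)
dout = rand(rows=N, cols=C*Hout*Wout)  # upstream gradient (illustrative)
dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win,
                                  2, 2, 2, 2, 0, 0)
```
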
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/relu.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/relu.dml b/scripts/staging/SystemML-NN/nn/layers/relu.dml
deleted file mode 100644
index 93a6e90..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/relu.dml
+++ /dev/null
@@ -1,59 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Rectified Linear Unit (ReLU) nonlinearity layer.
- */
-
-forward = function(matrix[double] X)
- return (matrix[double] out) {
- /*
- * Computes the forward pass for a ReLU nonlinearity layer.
- *
- * Performs an element-wise evaluation of `f(input) = max(0, input)`.
- *
- * Inputs:
- * - X: Inputs, of shape (any, any).
- *
- * Outputs:
- * - out: Outputs, of same shape as `X`.
- */
- out = max(X, 0)
-}
-
-backward = function(matrix[double] dout, matrix[double] X)
- return (matrix[double] dX) {
- /*
- * Computes the backward pass for a ReLU nonlinearity layer.
- *
- * Essentially performs a pass-through of the upstream gradient
- * for cells > 0.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of same shape as `X`.
- * - X: Previous input data matrix, of shape (any, any).
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of same shape as `X`.
- */
- dX = (X > 0) * dout
-}
-
[08/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn`
library to `scripts/nn`
Posted by du...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/grad_check.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/grad_check.dml b/scripts/nn/test/grad_check.dml
new file mode 100644
index 0000000..f3bc9a7
--- /dev/null
+++ b/scripts/nn/test/grad_check.dml
@@ -0,0 +1,1769 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Gradient checks for various architectures.
+ */
+source("nn/layers/affine.dml") as affine
+source("nn/layers/batch_norm1d.dml") as batch_norm1d
+source("nn/layers/batch_norm2d.dml") as batch_norm2d
+source("nn/layers/conv2d.dml") as conv2d
+source("nn/layers/conv2d_builtin.dml") as conv2d_builtin
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/dropout.dml") as dropout
+source("nn/layers/l1_loss.dml") as l1_loss
+source("nn/layers/l1_reg.dml") as l1_reg
+source("nn/layers/l2_loss.dml") as l2_loss
+source("nn/layers/l2_reg.dml") as l2_reg
+source("nn/layers/log_loss.dml") as log_loss
+source("nn/layers/lstm.dml") as lstm
+source("nn/layers/max_pool2d.dml") as max_pool2d
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
+source("nn/layers/relu.dml") as relu
+source("nn/layers/rnn.dml") as rnn
+source("nn/layers/scale_shift1d.dml") as scale_shift1d
+source("nn/layers/scale_shift2d.dml") as scale_shift2d
+source("nn/layers/sigmoid.dml") as sigmoid
+source("nn/layers/softmax.dml") as softmax
+source("nn/layers/tanh.dml") as tanh
+source("nn/test/conv2d_simple.dml") as conv2d_simple
+source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
+source("nn/test/util.dml") as test_util
+
+affine = function() {
+ /*
+ * Gradient check for the affine layer.
+ */
+ print("Grad checking the affine layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ D = 100 # num features
+ M = 10 # num neurons
+ X = rand(rows=N, cols=D)
+ y = rand(rows=N, cols=M)
+ [W, b] = affine::init(D, M)
+
+ # Compute analytical gradients of loss wrt parameters
+ out = affine::forward(X, W, b)
+ dout = l2_loss::backward(out, y)
+ [dX, dW, db] = affine::backward(dout, X, W, b)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ outmh = affine::forward(X, W, b)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ outph = affine::forward(X, W, b)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking W.")
+ for (i in 1:nrow(W)) {
+ for (j in 1:ncol(W)) {
+ # Compute numerical derivative
+ old = as.scalar(W[i,j])
+ W[i,j] = old - h
+ outmh = affine::forward(X, W, b)
+ lossmh = l2_loss::forward(outmh, y)
+ W[i,j] = old + h
+ outph = affine::forward(X, W, b)
+ lossph = l2_loss::forward(outph, y)
+ W[i,j] = old # reset
+ dW_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking b.")
+ for (i in 1:nrow(b)) {
+ for (j in 1:ncol(b)) {
+ # Compute numerical derivative
+ old = as.scalar(b[i,j])
+ b[i,j] = old - h
+ outmh = affine::forward(X, W, b)
+ lossmh = l2_loss::forward(outmh, y)
+ b[i,j] = old + h
+ outph = affine::forward(X, W, b)
+ lossph = l2_loss::forward(outph, y)
+ b[i,j] = old # reset
+ db_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+ }
+ }
+}
+
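A note on the pattern used throughout this file: each parameter entry is perturbed twice because the centered difference has O(h^2) truncation error, versus O(h) for a one-sided difference. The exact relative-error formula lives in `test_util::check_rel_grad_error`, whose body is not shown in this commit; the form sketched below is a common choice and is assumed, not confirmed.

```
# Centered difference, viewing the loss L as a function of one entry x:
#   dL/dx ~= (L(x+h) - L(x-h)) / (2h)       # truncation error O(h^2)
# Assumed form of the relative-error metric, with dA the analytical and
# dN the numerical derivative:
#   rel_error = |dA - dN| / max(|dA|, |dN|)
```
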
+batch_norm1d = function() {
+ /*
+ * Gradient check for the 1D batch normalization layer.
+ */
+ print("Grad checking the 1D batch normalization layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ D = 100 # num features
+ mu = 0.9 # momentum
+ eps = 1e-5 # epsilon
+ X = rand(rows=N, cols=D)
+ y = rand(rows=N, cols=D)
+ gamma = rand(rows=1, cols=D)
+ beta = rand(rows=1, cols=D)
+ ema_mean = rand(rows=1, cols=D)
+ ema_var = rand(rows=1, cols=D)
+ #[dummy, dummy, ema_mean, ema_var] = batch_norm1d::init(D)
+
+ # Check training & testing modes
+ for (i in 1:2) {
+ if (i == 1)
+ mode = 'train'
+ else
+ mode = 'test'
+ print(" - Grad checking the '"+mode+"' mode.")
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ dout = l2_loss::backward(out, y)
+ [dX, dgamma, dbeta] = batch_norm1d::backward(dout, out, ema_mean_upd, ema_var_upd,
+ cache_mean, cache_var, cache_norm,
+ X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking gamma.")
+ for (i in 1:nrow(gamma)) {
+ for (j in 1:ncol(gamma)) {
+ # Compute numerical derivative
+ old = as.scalar(gamma[i,j])
+ gamma[i,j] = old - h
+ [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ lossmh = l2_loss::forward(outmh, y)
+ gamma[i,j] = old + h
+ [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ lossph = l2_loss::forward(outph, y)
+ gamma[i,j] = old # reset
+ dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+ lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking beta.")
+ for (i in 1:nrow(beta)) {
+ for (j in 1:ncol(beta)) {
+ # Compute numerical derivative
+ old = as.scalar(beta[i,j])
+ beta[i,j] = old - h
+ [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ lossmh = l2_loss::forward(outmh, y)
+ beta[i,j] = old + h
+ [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ lossph = l2_loss::forward(outph, y)
+ beta[i,j] = old # reset
+ dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+ lossph, lossmh)
+ }
+ }
+ }
+}
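+
+# For reference, the 1D batch norm computation being checked is assumed
+# to follow the standard formulation:
+#   'train': mean = colMeans(X);  var = biased column variance of X
+#            norm = (X - mean) / sqrt(var + eps)
+#            ema_mean_upd = mu*ema_mean + (1-mu)*mean  (likewise for var)
+#   'test':  norm = (X - ema_mean) / sqrt(ema_var + eps)
+#   out = gamma*norm + beta  (gamma, beta broadcast per feature)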
+
+batch_norm2d = function() {
+ /*
+ * Gradient check for the 2D (spatial) batch normalization layer.
+ */
+ print("Grad checking the 2D (spatial) batch normalization layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ C = 2 # num channels
+ Hin = 5 # input height
+ Win = 5 # input width
+ mu = 0.9 # momentum
+ eps = 1e-5 # epsilon
+ X = rand(rows=N, cols=C*Hin*Win)
+ y = rand(rows=N, cols=C*Hin*Win)
+ gamma = rand(rows=C, cols=1)
+ beta = rand(rows=C, cols=1)
+ ema_mean = rand(rows=C, cols=1)
+ ema_var = rand(rows=C, cols=1)
+ #[dummy, dummy, ema_mean, ema_var] = batch_norm2d::init(C)
+
+ # Check training & testing modes
+ for (i in 1:2) {
+ if (i == 1)
+ mode = 'train'
+ else
+ mode = 'test'
+ print(" - Grad checking the '"+mode+"' mode.")
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ dout = l2_loss::backward(out, y)
+ [dX, dgamma, dbeta] = batch_norm2d::backward(dout, out, ema_mean_upd, ema_var_upd,
+ cache_mean, cache_var, cache_norm,
+ X, gamma, beta, C, Hin, Win, mode,
+ ema_mean, ema_var, mu, eps)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking gamma.")
+ for (i in 1:nrow(gamma)) {
+ for (j in 1:ncol(gamma)) {
+ # Compute numerical derivative
+ old = as.scalar(gamma[i,j])
+ gamma[i,j] = old - h
+ [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossmh = l2_loss::forward(outmh, y)
+ gamma[i,j] = old + h
+ [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossph = l2_loss::forward(outph, y)
+ gamma[i,j] = old # reset
+ dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+ lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking beta.")
+ for (i in 1:nrow(beta)) {
+ for (j in 1:ncol(beta)) {
+ # Compute numerical derivative
+ old = as.scalar(beta[i,j])
+ beta[i,j] = old - h
+ [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossmh = l2_loss::forward(outmh, y)
+ beta[i,j] = old + h
+ [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossph = l2_loss::forward(outph, y)
+ beta[i,j] = old # reset
+ dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+ lossph, lossmh)
+ }
+ }
+ }
+}
+
+conv2d = function() {
+ /*
+ * Gradient check for the 2D convolutional layer using `im2col`.
+ */
+ print("Grad checking the `im2col` 2D convolutional layer with L2 loss.")
+
+ # Generate data
+ N = 2 # num examples
+ C = 2 # num channels
+ Hin = 5 # input height
+ Win = 5 # input width
+ F = 2 # num filters
+ Hf = 3 # filter height
+ Wf = 3 # filter width
+ stride = 1
+ pad = 1
+ X = rand(rows=N, cols=C*Hin*Win)
+ y = rand(rows=N, cols=F*Hin*Win)
+
+ # Create layers
+ [W, b] = conv2d::init(F, C, Hf, Wf)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ dout = l2_loss::backward(out, y)
+ [dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking W.")
+ for (i in 1:nrow(W)) {
+ for (j in 1:ncol(W)) {
+ # Compute numerical derivative
+ old = as.scalar(W[i,j])
+ W[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ W[i,j] = old + h
+ [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ W[i,j] = old # reset
+ dW_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking b.")
+ for (i in 1:nrow(b)) {
+ for (j in 1:ncol(b)) {
+ # Compute numerical derivative
+ old = as.scalar(b[i,j])
+ b[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ b[i,j] = old + h
+ [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ b[i,j] = old # reset
+ db_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+ }
+ }
+}
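+
+# NOTE: With the settings above (Hf = Wf = 3, stride = 1, pad = 1), the
+# usual convolution arithmetic gives "same" spatial dims:
+#   Hout = (Hin + 2*pad - Hf)/stride + 1 = (5 + 2 - 3)/1 + 1 = 5 = Hin,
+# which is why `y` is shaped (N, F*Hin*Win).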
+
+conv2d_builtin = function() {
+ /*
+ * Gradient check for the 2D convolutional layer using built-in
+ * functions.
+ */
+ print("Grad checking the built-in 2D convolutional layer with L2 loss.")
+
+ # Generate data
+ N = 2 # num examples
+ C = 2 # num channels
+ Hin = 5 # input height
+ Win = 5 # input width
+ F = 2 # num filters
+ Hf = 3 # filter height
+ Wf = 3 # filter width
+ stride = 1
+ pad = 1
+ X = rand(rows=N, cols=C*Hin*Win)
+ y = rand(rows=N, cols=F*Hin*Win)
+
+ # Create layers
+ [W, b] = conv2d_builtin::init(F, C, Hf, Wf)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ dout = l2_loss::backward(out, y)
+ [dX, dW, db] = conv2d_builtin::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking W.")
+ for (i in 1:nrow(W)) {
+ for (j in 1:ncol(W)) {
+ # Compute numerical derivative
+ old = as.scalar(W[i,j])
+ W[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ W[i,j] = old + h
+ [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ W[i,j] = old # reset
+ dW_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking b.")
+ for (i in 1:nrow(b)) {
+ for (j in 1:ncol(b)) {
+ # Compute numerical derivative
+ old = as.scalar(b[i,j])
+ b[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ b[i,j] = old + h
+ [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ b[i,j] = old # reset
+ db_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+ }
+ }
+}
+
+conv2d_simple = function() {
+ /*
+ * Gradient check for the simple reference 2D convolutional layer.
+ */
+ print("Grad checking the simple reference 2D convolutional layer with L2 loss.")
+
+ # Generate data
+ N = 2 # num examples
+ C = 2 # num channels
+ Hin = 5 # input height
+ Win = 5 # input width
+ F = 2 # num filters
+ Hf = 3 # filter height
+ Wf = 3 # filter width
+ stride = 1
+ pad = 1
+ X = rand(rows=N, cols=C*Hin*Win)
+ y = rand(rows=N, cols=F*Hin*Win)
+
+ # Create layers
+ [W, b] = conv2d_simple::init(F, C, Hf, Wf)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ dout = l2_loss::backward(out, y)
+ [dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking W.")
+ for (i in 1:nrow(W)) {
+ for (j in 1:ncol(W)) {
+ # Compute numerical derivative
+ old = as.scalar(W[i,j])
+ W[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ W[i,j] = old + h
+ [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ W[i,j] = old # reset
+ dW_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking b.")
+ for (i in 1:nrow(b)) {
+ for (j in 1:ncol(b)) {
+ # Compute numerical derivative
+ old = as.scalar(b[i,j])
+ b[i,j] = old - h
+ [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ b[i,j] = old + h
+ [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ b[i,j] = old # reset
+ db_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+ }
+ }
+}
+
+cross_entropy_loss = function() {
+ /*
+ * Gradient check for the cross-entropy loss function.
+ */
+ print("Grad checking the cross-entropy loss function.")
+
+ # Generate data
+ N = 3 # num examples
+ K = 10 # num targets
+ pred = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
+ pred = pred / rowSums(pred) # normalized probs
+ y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
+ y = y / rowSums(y) # normalized probs
+
+ # Compute analytical gradient
+ dpred = cross_entropy_loss::backward(pred, y)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(pred)) {
+ for (j in 1:ncol(pred)) {
+ # Compute numerical derivative
+ old = as.scalar(pred[i,j])
+ pred[i,j] = old - h
+ lossmh = cross_entropy_loss::forward(pred, y)
+ pred[i,j] = old + h
+ lossph = cross_entropy_loss::forward(pred, y)
+ pred[i,j] = old # reset pred[i,j]
+ dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+ }
+ }
+}
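+
+# The cross-entropy loss being checked is assumed to follow the usual
+# definition over N examples,
+#   L = -(1/N) * sum(y * log(pred)),
+# with gradient dpred = -(1/N) * y/pred, which the centered difference
+# above should recover elementwise.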
+
+dropout = function() {
+ /*
+ * Gradient check for the (inverted) dropout layer.
+ */
+ print("Grad checking the (inverted) dropout layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ M = 100 # num neurons
+ p = 0.5 # probability of dropping neuron output
+ seed = as.integer(floor(as.scalar(rand(rows=1, cols=1, min=1, max=100000)))) # random seed
+ X = rand(rows=N, cols=M)
+ y = rand(rows=N, cols=M)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, mask] = dropout::forward(X, p, seed)
+ dout = l2_loss::backward(out, y)
+ dX = dropout::backward(dout, X, p, mask)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, mask] = dropout::forward(X, p, seed)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, mask] = dropout::forward(X, p, seed)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+}
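+
+# NOTE: Reusing the same `seed` in every forward call is what makes this
+# check valid: the dropout mask is then identical across the f(x-h) and
+# f(x+h) evaluations, so the two losses differ only through X[i,j].
+# Inverted dropout is assumed to rescale the kept activations by the keep
+# probability at training time (out = X * mask / keep_prob), so the
+# test-time forward pass needs no rescaling.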
+
+l1_loss = function() {
+ /*
+ * Gradient check for the L1 loss function.
+ */
+ print("Grad checking the L1 loss function.")
+
+ # Generate data
+ N = 3 # num examples
+ D = 2 # num targets
+ pred = rand(rows=N, cols=D)
+ y = rand(rows=N, cols=D)
+
+ # Compute analytical gradient
+ dpred = l1_loss::backward(pred, y)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(pred)) {
+ for (j in 1:ncol(pred)) {
+ # Compute numerical derivative
+ old = as.scalar(pred[i,j])
+ pred[i,j] = old - h
+ lossmh = l1_loss::forward(pred, y)
+ pred[i,j] = old + h
+ lossph = l1_loss::forward(pred, y)
+ pred[i,j] = old # reset pred[i,j]
+ dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+ }
+ }
+}
+
+l1_reg = function() {
+ /*
+ * Gradient check for the L1 regularization function.
+ */
+ print("Grad checking the L1 regularization function.")
+
+ # Generate data
+ D = 5 # num features
+ M = 3 # num neurons
+ lambda = 0.01
+ W = rand(rows=D, cols=M)
+
+ # Compute analytical gradient
+ dW = l1_reg::backward(W, lambda)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(W)) {
+ for (j in 1:ncol(W)) {
+ # Compute numerical derivative
+ old = as.scalar(W[i,j])
+ W[i,j] = old - h
+ reg_lossmh = l1_reg::forward(W, lambda)
+ W[i,j] = old + h
+ reg_lossph = l1_reg::forward(W, lambda)
+ W[i,j] = old # reset W[i,j]
+ dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
+ reg_lossph, reg_lossmh)
+ }
+ }
+}
+
+l2_loss = function() {
+ /*
+ * Gradient check for the L2 loss function.
+ */
+ print("Grad checking the L2 loss function.")
+
+ # Generate data
+ N = 3 # num examples
+ D = 2 # num targets
+ pred = rand(rows=N, cols=D)
+ y = rand(rows=N, cols=D)
+
+ # Compute analytical gradient
+ dpred = l2_loss::backward(pred, y)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(pred)) {
+ for (j in 1:ncol(pred)) {
+ # Compute numerical derivative
+ old = as.scalar(pred[i,j])
+ pred[i,j] = old - h
+ lossmh = l2_loss::forward(pred, y)
+ pred[i,j] = old + h
+ lossph = l2_loss::forward(pred, y)
+ pred[i,j] = old # reset pred[i,j]
+ dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+ }
+ }
+}
+
+l2_reg = function() {
+ /*
+ * Gradient check for the L2 regularization function.
+ */
+ print("Grad checking the L2 regularization function.")
+
+ # Generate data
+ D = 5 # num features
+ M = 3 # num neurons
+ lambda = 0.01
+ W = rand(rows=D, cols=M)
+
+ # Compute analytical gradient
+ dW = l2_reg::backward(W, lambda)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(W)) {
+ for (j in 1:ncol(W)) {
+ # Compute numerical derivative
+ old = as.scalar(W[i,j])
+ W[i,j] = old - h
+ reg_lossmh = l2_reg::forward(W, lambda)
+ W[i,j] = old + h
+ reg_lossph = l2_reg::forward(W, lambda)
+ W[i,j] = old # reset W[i,j]
+ dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
+ reg_lossph, reg_lossmh)
+ }
+ }
+}
+
+log_loss = function() {
+ /*
+ * Gradient check for the log loss function.
+ */
+ print("Grad checking the log loss function.")
+
+ # Generate data
+ N = 20 # num examples
+ D = 1 # num targets
+ pred = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
+ y = round(rand(rows=N, cols=D, min=0, max=1, pdf="uniform"))
+
+ # Compute analytical gradient
+ dpred = log_loss::backward(pred, y)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(pred)) {
+ for (j in 1:ncol(pred)) {
+ # Compute numerical derivative
+ old = as.scalar(pred[i,j])
+ pred[i,j] = old - h
+ lossmh = log_loss::forward(pred, y)
+ pred[i,j] = old + h
+ lossph = log_loss::forward(pred, y)
+ pred[i,j] = old # reset pred[i,j]
+ dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+ }
+ }
+}
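+
+# The log (binary cross-entropy) loss being checked is assumed to be
+#   L = -(1/N) * sum(y*log(pred) + (1-y)*log(1-pred)),
+# with gradient dpred = (1/N) * (pred-y) / (pred*(1-pred)).
+# `pred` is drawn from the interval (0, 1) above so that both log terms
+# stay finite.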
+
+lstm = function() {
+ /*
+ * Gradient check for the LSTM layer.
+ */
+ print("Grad checking the LSTM layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ D = 10 # num features
+ T = 15 # num timesteps (sequence length)
+ M = 5 # num neurons
+ return_seq = TRUE
+ X = rand(rows=N, cols=T*D)
+ y = rand(rows=N, cols=T*M)
+ yc = rand(rows=N, cols=M)
+ out0 = rand(rows=N, cols=M)
+ c0 = rand(rows=N, cols=M)
+ [W, b, dummy, dummy2] = lstm::init(N, D, M)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ dout = l2_loss::backward(out, y)
+ dc = l2_loss::backward(c, yc)
+ [dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, return_seq, out0, c0,
+ cache_out, cache_c, cache_ifog)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outmh = l2_loss::forward(outmh, y)
+ loss_cmh = l2_loss::forward(cmh, yc)
+ lossmh = loss_outmh + loss_cmh
+ X[i,j] = old + h
+ [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outph = l2_loss::forward(outph, y)
+ loss_cph = l2_loss::forward(cph, yc)
+ lossph = loss_outph + loss_cph
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking W.")
+ for (i in 1:nrow(W)) {
+ for (j in 1:ncol(W)) {
+ # Compute numerical derivative
+ old = as.scalar(W[i,j])
+ W[i,j] = old - h
+ [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outmh = l2_loss::forward(outmh, y)
+ loss_cmh = l2_loss::forward(cmh, yc)
+ lossmh = loss_outmh + loss_cmh
+ W[i,j] = old + h
+ [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outph = l2_loss::forward(outph, y)
+ loss_cph = l2_loss::forward(cph, yc)
+ lossph = loss_outph + loss_cph
+ W[i,j] = old # reset
+ dW_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking b.")
+ for (i in 1:nrow(b)) {
+ for (j in 1:ncol(b)) {
+ # Compute numerical derivative
+ old = as.scalar(b[i,j])
+ b[i,j] = old - h
+ [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outmh = l2_loss::forward(outmh, y)
+ loss_cmh = l2_loss::forward(cmh, yc)
+ lossmh = loss_outmh + loss_cmh
+ b[i,j] = old + h
+ [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outph = l2_loss::forward(outph, y)
+ loss_cph = l2_loss::forward(cph, yc)
+ lossph = loss_outph + loss_cph
+ b[i,j] = old # reset
+ db_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking out0.")
+ for (i in 1:nrow(out0)) {
+ for (j in 1:ncol(out0)) {
+ # Compute numerical derivative
+ old = as.scalar(out0[i,j])
+ out0[i,j] = old - h
+ [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outmh = l2_loss::forward(outmh, y)
+ loss_cmh = l2_loss::forward(cmh, yc)
+ lossmh = loss_outmh + loss_cmh
+ out0[i,j] = old + h
+ [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outph = l2_loss::forward(outph, y)
+ loss_cph = l2_loss::forward(cph, yc)
+ lossph = loss_outph + loss_cph
+ out0[i,j] = old # reset
+ dout0_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking c0.")
+ for (i in 1:nrow(c0)) {
+ for (j in 1:ncol(c0)) {
+ # Compute numerical derivative
+ old = as.scalar(c0[i,j])
+ c0[i,j] = old - h
+ [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outmh = l2_loss::forward(outmh, y)
+ loss_cmh = l2_loss::forward(cmh, yc)
+ lossmh = loss_outmh + loss_cmh
+ c0[i,j] = old + h
+ [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
+ loss_outph = l2_loss::forward(outph, y)
+ loss_cph = l2_loss::forward(cph, yc)
+ lossph = loss_outph + loss_cph
+ c0[i,j] = old # reset
+ dc0_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh)
+ }
+ }
+}
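+
+# NOTE: The LSTM produces two outputs (the hidden states and the final
+# cell state), so the scalar objective for this check is the sum of two
+# L2 losses, loss = L2(out, y) + L2(c, yc); each numerical derivative
+# above therefore re-evaluates and sums both terms.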
+
+max_pool2d = function() {
+ /*
+ * Gradient check for the 2D max pooling layer.
+ */
+ print("Grad checking the 2D max pooling layer with L2 loss.")
+
+ # Generate data
+ N = 2 # num examples
+ C = 2 # num channels
+ Hin = 4 # input height
+ Win = 4 # input width
+ Hf = 2 # pool filter height
+ Wf = 2 # pool filter width
+ stride = 2
+ X = rand(rows=N, cols=C*Hin*Win)
+
+ for (pad in 0:1) {
+ print(" - Grad checking w/ pad="+pad+".")
+ Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
+ Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
+ y = rand(rows=N, cols=C*Hout*Wout)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ dout = l2_loss::backward(out, y)
+ dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+ }
+}
+
+max_pool2d_builtin = function() {
+ /*
+ * Gradient check for the built-in 2D max pooling layer.
+ */
+ print("Grad checking the built-in 2D max pooling layer with L2 loss.")
+
+ # Generate data
+ N = 2 # num examples
+ C = 2 # num channels
+ Hin = 4 # input height
+ Win = 4 # input width
+ Hf = 2 # pool filter height
+ Wf = 2 # pool filter width
+ stride = 2
+ X = rand(rows=N, cols=C*Hin*Win)
+
+ for (pad in 0:1) {
+ print(" - Grad checking w/ pad="+pad+".")
+ Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
+ Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
+ y = rand(rows=N, cols=C*Hout*Wout)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ dout = l2_loss::backward(out, y)
+ dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+ }
+}
+
+max_pool2d_simple = function() {
+ /*
+ * Gradient check for the simple reference 2D max pooling layer.
+ */
+ print("Grad checking the simple reference 2D max pooling layer with L2 loss.")
+
+ # Generate data
+ N = 2 # num examples
+ C = 2 # num channels
+ Hin = 4 # input height
+ Win = 4 # input width
+ Hf = 2 # pool filter height
+ Wf = 2 # pool filter width
+ stride = 2
+ X = rand(rows=N, cols=C*Hin*Win)
+
+ for (pad in 0:1) {
+ print(" - Grad checking w/ pad="+pad+".")
+ Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
+ Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
+ y = rand(rows=N, cols=C*Hout*Wout)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ dout = l2_loss::backward(out, y)
+ dX = max_pool2d_simple::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+ }
+}
+
+relu = function() {
+ /*
+ * Gradient check for the ReLU nonlinearity layer.
+ *
+ * NOTE: This test can produce a false negative in which it fails
+ * due to a kink being crossed in the nonlinearity. This occurs
+ * when the two evaluations, f(x-h) and f(x+h), land on opposite
+ * sides of the zero threshold of max(0, x). For now, simply rerun
+ * the test. In the future, we could detect this case explicitly
+ * and rerun the test automatically.
+ */
+ print("Grad checking the ReLU nonlinearity layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ M = 10 # num neurons
+ X = rand(rows=N, cols=M, min=-5, max=5)
+ y = rand(rows=N, cols=M)
+
+ # Compute analytical gradients of loss wrt parameters
+ out = relu::forward(X)
+ dout = l2_loss::backward(out, y)
+ dX = relu::backward(dout, X)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ outmh = relu::forward(X)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ outph = relu::forward(X)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+}
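+
+# Concretely, the kink issue from the NOTE above: for f(x) = max(0, x)
+# with |x| < h, the centered difference evaluates to
+#   (f(x+h) - f(x-h)) / (2*h) = (x+h)/(2*h),
+# which matches neither subgradient (0 or 1), so the check can fail even
+# when the analytical gradient is correct. Drawing X from [-5, 5] with
+# h = 1e-5 makes such crossings rare, but not impossible.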
+
+rnn = function() {
+ /*
+ * Gradient check for the simple RNN layer.
+ */
+ print("Grad checking the simple RNN layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ D = 10 # num features
+ T = 15 # num timesteps (sequence length)
+ M = 5 # num neurons
+ return_seq = TRUE
+ X = rand(rows=N, cols=T*D)
+ y = rand(rows=N, cols=T*M)
+ out0 = rand(rows=N, cols=M)
+ [W, b, dummy] = rnn::init(N, D, M)
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ dout = l2_loss::backward(out, y)
+ [dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, return_seq, out0, cache_out)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking W.")
+ for (i in 1:nrow(W)) {
+ for (j in 1:ncol(W)) {
+ # Compute numerical derivative
+ old = as.scalar(W[i,j])
+ W[i,j] = old - h
+ [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ lossmh = l2_loss::forward(outmh, y)
+ W[i,j] = old + h
+ [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ lossph = l2_loss::forward(outph, y)
+ W[i,j] = old # reset
+ dW_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking b.")
+ for (i in 1:nrow(b)) {
+ for (j in 1:ncol(b)) {
+ # Compute numerical derivative
+ old = as.scalar(b[i,j])
+ b[i,j] = old - h
+ [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ lossmh = l2_loss::forward(outmh, y)
+ b[i,j] = old + h
+ [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ lossph = l2_loss::forward(outph, y)
+ b[i,j] = old # reset
+ db_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking out0.")
+ for (i in 1:nrow(out0)) {
+ for (j in 1:ncol(out0)) {
+ # Compute numerical derivative
+ old = as.scalar(out0[i,j])
+ out0[i,j] = old - h
+ [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ lossmh = l2_loss::forward(outmh, y)
+ out0[i,j] = old + h
+ [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
+ lossph = l2_loss::forward(outph, y)
+ out0[i,j] = old # reset
+ dout0_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
+ }
+ }
+}
+
+scale_shift1d = function() {
+ /*
+ * Gradient check for the 1D scale & shift layer.
+ */
+ print("Grad checking the 1D scale & shift layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ D = 100 # num features
+ X = rand(rows=N, cols=D)
+ y = rand(rows=N, cols=D)
+ [gamma, beta] = scale_shift1d::init(D)
+
+ # Compute analytical gradients of loss wrt parameters
+ out = scale_shift1d::forward(X, gamma, beta)
+ dout = l2_loss::backward(out, y)
+ [dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ outmh = scale_shift1d::forward(X, gamma, beta)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ outph = scale_shift1d::forward(X, gamma, beta)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking gamma.")
+ for (i in 1:nrow(gamma)) {
+ for (j in 1:ncol(gamma)) {
+ # Compute numerical derivative
+ old = as.scalar(gamma[i,j])
+ gamma[i,j] = old - h
+ outmh = scale_shift1d::forward(X, gamma, beta)
+ lossmh = l2_loss::forward(outmh, y)
+ gamma[i,j] = old + h
+ outph = scale_shift1d::forward(X, gamma, beta)
+ lossph = l2_loss::forward(outph, y)
+ gamma[i,j] = old # reset
+ dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+ lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking beta.")
+ for (i in 1:nrow(beta)) {
+ for (j in 1:ncol(beta)) {
+ # Compute numerical derivative
+ old = as.scalar(beta[i,j])
+ beta[i,j] = old - h
+ outmh = scale_shift1d::forward(X, gamma, beta)
+ lossmh = l2_loss::forward(outmh, y)
+ beta[i,j] = old + h
+ outph = scale_shift1d::forward(X, gamma, beta)
+ lossph = l2_loss::forward(outph, y)
+ beta[i,j] = old # reset
+ dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+ lossph, lossmh)
+ }
+ }
+}
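+
+# The 1D scale & shift layer is assumed to compute out = X*gamma + beta,
+# with gamma and beta of shape (1, D) broadcast across rows, so
+#   dX = dout*gamma,  dgamma = colSums(dout*X),  dbeta = colSums(dout),
+# which are exactly the quantities verified elementwise above.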
+
+scale_shift2d = function() {
+ /*
+ * Gradient check for the 2D scale & shift layer.
+ */
+ print("Grad checking the 2D scale & shift layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ C = 2 # num channels
+ Hin = 5 # input height
+ Win = 5 # input width
+ X = rand(rows=N, cols=C*Hin*Win)
+ y = rand(rows=N, cols=C*Hin*Win)
+ [gamma, beta] = scale_shift2d::init(C)
+
+ # Compute analytical gradients of loss wrt parameters
+ out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ dout = l2_loss::backward(out, y)
+ [dX, dgamma, dbeta] = scale_shift2d::backward(dout, out, X, gamma, beta, C, Hin, Win)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking gamma.")
+ for (i in 1:nrow(gamma)) {
+ for (j in 1:ncol(gamma)) {
+ # Compute numerical derivative
+ old = as.scalar(gamma[i,j])
+ gamma[i,j] = old - h
+ outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossmh = l2_loss::forward(outmh, y)
+ gamma[i,j] = old + h
+ outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossph = l2_loss::forward(outph, y)
+ gamma[i,j] = old # reset
+ dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+ lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking beta.")
+ for (i in 1:nrow(beta)) {
+ for (j in 1:ncol(beta)) {
+ # Compute numerical derivative
+ old = as.scalar(beta[i,j])
+ beta[i,j] = old - h
+ outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossmh = l2_loss::forward(outmh, y)
+ beta[i,j] = old + h
+ outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossph = l2_loss::forward(outph, y)
+ beta[i,j] = old # reset
+ dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+ lossph, lossmh)
+ }
+ }
+}
+
+sigmoid = function() {
+ /*
+ * Gradient check for the sigmoid nonlinearity layer.
+ */
+ print("Grad checking the sigmoid nonlinearity layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ M = 10 # num neurons
+ X = rand(rows=N, cols=M)
+ y = rand(rows=N, cols=M)
+
+ # Compute analytical gradients of loss wrt parameters
+ out = sigmoid::forward(X)
+ dout = l2_loss::backward(out, y)
+ dX = sigmoid::backward(dout, X)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ outmh = sigmoid::forward(X)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ outph = sigmoid::forward(X)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+}
+
+softmax = function() {
+ /*
+ * Gradient check for the softmax layer.
+ */
+ print("Grad checking the softmax layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ D = 10 # num classes
+ X = rand(rows=N, cols=D)
+ y = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
+ y = y / rowSums(y)
+
+ # Compute analytical gradients of loss wrt parameters
+ out = softmax::forward(X)
+ dout = l2_loss::backward(out, y)
+ dX = softmax::backward(dout, X)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ outmh = softmax::forward(X)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ outph = softmax::forward(X)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+}
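+
+# NOTE: Because the loss here is L2 rather than cross-entropy, the
+# backward pass cannot use the classic softmax + cross-entropy shortcut
+# (pred - y); it must apply the full softmax Jacobian,
+#   dX[i,j] = sum_k dout[i,k] * out[i,k] * ((k==j) - out[i,j]),
+# and this check exercises that path.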
+
+tanh = function() {
+ /*
+ * Gradient check for the hyperbolic tangent (tanh) nonlinearity
+ * layer.
+ */
+ print("Grad checking the tanh nonlinearity layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ M = 10 # num neurons
+ X = rand(rows=N, cols=M)
+ y = rand(rows=N, cols=M)
+
+ # Compute analytical gradients of loss wrt parameters
+ out = tanh::forward(X)
+ dout = l2_loss::backward(out, y)
+ dX = tanh::backward(dout, X)
+
+ # Grad check
+ h = 1e-5
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ outmh = tanh::forward(X)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ outph = tanh::forward(X)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+}
+
+two_layer_affine_l2_net = function() {
+ /*
+ * Gradient check for a two-layer, fully-connected, feed-forward
+ * network with ReLU nonlinearity and L2 loss.
+ *
+ * NOTE: This test can produce a false negative in which it fails
+ * due to a kink being crossed in the ReLU nonlinearity. This
+ * occurs when the two evaluations, f(x-h) and f(x+h), land on
+ * opposite sides of the zero threshold of max(0, x). For now,
+ * simply rerun the test. In the future, we could detect this case
+ * explicitly and rerun the test automatically.
+ */
+ print("Grad checking a two-layer, fully-connected, feed-forward network with a ReLU " +
+ "nonlinearity, and an L2 loss function.")
+
+ # Generate input data
+ N = 1000 # num examples
+ D = 100 # num features
+ yD = 5 # num targets
+ X = rand(rows=N, cols=D, pdf="normal")
+ y = rand(rows=N, cols=yD)
+
+ # Create 2-layer, fully-connected network
+ M = 10 # number of hidden neurons
+ [W1, b1] = affine::init(D, M)
+ [W2, b2] = affine::init(M, yD)
+
+ # Optimize for a short "burn-in" period to move the network into its
+ # characteristic mode of operation and unmask any real issues.
+ print(" - Burn-in:")
+ lr = 0.0001
+ decay = 0.99
+ for(i in 1:5) {
+ # Compute forward and backward passes of net
+ [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
+ print(" - L2 loss: " + loss)
+
+ # Optimize with basic SGD
+ W1 = W1 - lr * dW1
+ b1 = b1 - lr * db1
+ W2 = W2 - lr * dW2
+ b2 = b2 - lr * db2
+ lr = lr * decay
+ }
+
+ # Compute analytical gradients
+ [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:2) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old_x = as.scalar(X[i,j])
+ X[i,j] = old_x - h
+ [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ X[i,j] = old_x + h
+ [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ X[i,j] = old_x # reset X[i,j]
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking W1.")
+ for (i in 1:nrow(W1)) {
+ for (j in 1:ncol(W1)) {
+ # Compute numerical derivative
+ old_w = as.scalar(W1[i,j])
+ W1[i,j] = old_w - h
+ [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ W1[i,j] = old_w + h
+ [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ W1[i,j] = old_w # reset W1[i,j]
+ dWij_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW1[i,j]), dWij_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking W2.")
+ for (i in 1:nrow(W2)) {
+ for (j in 1:ncol(W2)) {
+ # Compute numerical derivative
+ old_w = as.scalar(W2[i,j])
+ W2[i,j] = old_w - h
+ [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ W2[i,j] = old_w + h
+ [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ W2[i,j] = old_w # reset W2[i,j]
+ dWij_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dW2[i,j]), dWij_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking b1.")
+ for (i in 1:nrow(b1)) {
+ for (j in 1:ncol(b1)) {
+ # Compute numerical derivative
+ old_b = as.scalar(b1[i,j])
+ b1[i,j] = old_b - h
+ [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ b1[i,j] = old_b + h
+ [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ b1[i,j] = old_b # reset b1[i,j]
+ dbij_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(db1[i,j]), dbij_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking b2.")
+ for (i in 1:nrow(b2)) {
+ for (j in 1:ncol(b2)) {
+ # Compute numerical derivative
+ old_b = as.scalar(b2[i,j])
+ b2[i,j] = old_b - h
+ [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ b2[i,j] = old_b + h
+ [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+ b2[i,j] = old_b # reset b2[i,j]
+ dbij_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(db2[i,j]), dbij_num, lossph, lossmh)
+ }
+ }
+}
+
+/*
+ * Test network with forward/backward functions.
+ */
+two_layer_affine_l2_net_run = function(matrix[double] X, matrix[double] y,
+ matrix[double] W1, matrix[double] b1,
+ matrix[double] W2, matrix[double] b2)
+ return (matrix[double] pred, double loss,
+ matrix[double] dX,
+ matrix[double] dW1, matrix[double] db1,
+ matrix[double] dW2, matrix[double] db2) {
+ # Compute forward pass
+ [loss, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+
+ # Compute backward pass
+ [dX, dpred, daout, dhout, dW1, db1, dW2, db2] =
+ two_layer_affine_l2_net_backward(X, y, pred, aout, hout, W1, b1, W2, b2)
+}
+
+two_layer_affine_l2_net_forward = function(matrix[double] X, matrix[double] y,
+ matrix[double] W1, matrix[double] b1,
+ matrix[double] W2, matrix[double] b2)
+ return (double loss, matrix[double] pred, matrix[double] aout, matrix[double] hout) {
+ # Compute forward pass
+ hout = affine::forward(X, W1, b1)
+ aout = relu::forward(hout)
+ pred = affine::forward(aout, W2, b2)
+
+ # Compute loss
+ loss = l2_loss::forward(pred, y)
+}
+
+two_layer_affine_l2_net_backward = function(matrix[double] X, matrix[double] y, matrix[double] pred,
+ matrix[double] aout, matrix[double] hout,
+ matrix[double] W1, matrix[double] b1,
+ matrix[double] W2, matrix[double] b2)
+ return (matrix[double] dX, matrix[double] dpred,
+ matrix[double] daout, matrix[double] dhout,
+ matrix[double] dW1, matrix[double] db1, matrix[double] dW2, matrix[double] db2) {
+ # Compute backward pass
+ dpred = l2_loss::backward(pred, y)
+ [daout, dW2, db2] = affine::backward(dpred, aout, W2, b2)
+ dhout = relu::backward(daout, hout)
+ [dX, dW1, db1] = affine::backward(dhout, X, W1, b1)
+}
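+
+# For reference, the network under test composes to
+#   pred = relu(X %*% W1 + b1) %*% W2 + b2
+# (assuming the standard affine forward, out = X %*% W + b), with an L2
+# loss against y; the backward helper simply chains the layer-local
+# gradients in reverse order.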
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/max_pool2d_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/max_pool2d_simple.dml b/scripts/nn/test/max_pool2d_simple.dml
new file mode 100644
index 0000000..188bd6e
--- /dev/null
+++ b/scripts/nn/test/max_pool2d_simple.dml
@@ -0,0 +1,172 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D max pooling layer.
+ *
+ * This implementation is intended to be a simple, reference version.
+ */
+
+forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] out, int Hout, int Wout) {
+ /*
+ * Computes the forward pass for a 2D spatial max pooling layer.
+ * The input data has N examples, each represented as a 3D volume
+ * unrolled into a single vector.
+ *
+ * This implementation is intended to be a simple, reference version.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * A typical value is 0.
+ * - padw: Padding for left and right sides.
+ * A typical value is 0.
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, C*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ */
+ N = nrow(X)
+ Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+ Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+
+ # Create output volume
+ out = matrix(0, rows=N, cols=C*Hout*Wout)
+
+ # Max pooling
+ parfor (n in 1:N, check=0) { # all examples
+ Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+
+ # Pad image
+ pad_value = -1/0
+ Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # filled with -inf
+ parfor (c in 1:C) {
+ Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice c reshaped
+ Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+ Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+ Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape
+ }
+ img = Xn_padded # shape (C, (Hin+2*padh)*(Win+2*padw))
+
+ parfor (c in 1:C, check=0) { # all channels
+ img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
+ parfor (hout in 1:Hout, check=0) { # all output rows
+ hin = (hout-1) * strideh + 1
+ parfor (wout in 1:Wout, check=0) { # all output columns
+ win = (wout-1) * stridew + 1
+ out[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout] = max(img_slice[hin:hin+Hf-1,
+ win:win+Wf-1])
+ }
+ }
+ }
+ }
+}
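+
+# NOTE: Padding with -infinity (rather than zero) guarantees that padded
+# cells can never win the max, even when every real input in a window is
+# negative. For the test settings (Hin = Win = 4, Hf = Wf = 2, stride = 2):
+#   pad=0: Hout = (4 - 2)/2 + 1 = 2;  pad=1: Hout = (4 + 2 - 2)/2 + 1 = 3.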
+
+backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
+ int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for a 2D spatial max pooling layer.
+ * The input data has N examples, each represented as a 3D volume
+ * unrolled into a single vector.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of
+ * shape (N, C*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * A typical value is 0.
+ * - padw: Padding for left and right sides.
+ * A typical value is 0.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ */
+ N = nrow(X)
+
+ # Create gradient volume
+ dX = matrix(0, rows=N, cols=C*Hin*Win)
+
+ # Gradient of max pooling
+ for (n in 1:N) { # all examples
+ Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+
+ # Pad image
+ pad_value = -1/0
+ Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # filled with -inf
+ parfor (c in 1:C) {
+ Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice c reshaped
+ Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+ Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+ Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape
+ }
+ img = Xn_padded
+
+ dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
+ for (c in 1:C) { # all channels
+ img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
+ dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
+ for (hout in 1:Hout) { # all output rows
+ hin = (hout-1) * strideh + 1
+ for (wout in 1:Wout) { # all output columns
+ win = (wout-1) * stridew + 1
+ img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
+ max_val_ind = img_slice_patch == max(img_slice_patch) # max value indicator matrix
+ # gradient passes through only for the max value(s) in this patch
+ dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
+ dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
+ + dimg_slice_patch
+ }
+ }
+ dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
+ }
+
+ # Unpad derivs on input
+ dXn = matrix(0, rows=C, cols=Hin*Win)
+ parfor (c in 1:C, check=0) {
+ dXn_padded_slice = matrix(dimg[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
+ dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
+ dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
+ }
+ dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
+ }
+}
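+
+# NOTE: `max_val_ind` marks every entry equal to the patch max, so when
+# values tie, each tied entry receives the full upstream gradient rather
+# than a share of it. Accumulating into `dimg_slice` (rather than
+# assigning) handles overlapping pooling windows correctly.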
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/run_tests.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/run_tests.dml b/scripts/nn/test/run_tests.dml
new file mode 100644
index 0000000..d8173a9
--- /dev/null
+++ b/scripts/nn/test/run_tests.dml
@@ -0,0 +1,90 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Script to run tests.
+ */
+source("nn/test/grad_check.dml") as grad_check
+source("nn/test/test.dml") as test
+
+print("")
+print("Starting grad checks.")
+print("---")
+
+# Loss & loss-related functions
+grad_check::cross_entropy_loss()
+grad_check::l1_loss()
+grad_check::l1_reg()
+grad_check::l2_loss()
+grad_check::l2_reg()
+grad_check::log_loss()
+print("")
+
+# Core layers
+grad_check::affine()
+grad_check::batch_norm1d()
+grad_check::batch_norm2d()
+grad_check::conv2d()
+grad_check::conv2d_builtin()
+grad_check::conv2d_simple()
+grad_check::dropout()
+grad_check::lstm()
+grad_check::max_pool2d()
+grad_check::max_pool2d_builtin()
+grad_check::max_pool2d_simple()
+grad_check::relu()
+grad_check::rnn()
+grad_check::scale_shift1d()
+grad_check::scale_shift2d()
+grad_check::sigmoid()
+grad_check::softmax()
+grad_check::tanh()
+print("")
+
+# Example model
+grad_check::two_layer_affine_l2_net()
+print("")
+
+print("---")
+print("Grad checks complete -- look for any ERRORs or WARNINGs.")
+print("If any tests involving ReLUs failed, try a few times " +
+ "to ensure that they were not false negatives due to " +
+ "kinks being crossed.")
+print("")
+
+print("")
+print("Starting other tests.")
+print("---")
+
+test::batch_norm1d()
+test::batch_norm2d()
+test::conv2d()
+test::cross_entropy_loss()
+test::im2col()
+test::max_pool2d()
+test::padding()
+test::tanh()
+
+print("---")
+print("Other tests complete -- look for any ERRORs or WARNINGs.")
+print("")
+print("")
+
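As in the library README, the suite is an ordinary DML script and can be launched directly; a hedged example invocation from the `scripts` directory (the jar name and path depend on your build):

  spark-submit SystemML.jar -f nn/test/run_tests.dml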
[05/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn`
library to `scripts/nn`
Posted by du...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/rnn.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/rnn.dml b/scripts/staging/SystemML-NN/nn/layers/rnn.dml
deleted file mode 100644
index 3c6faae..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/rnn.dml
+++ /dev/null
@@ -1,183 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Simple (Vanilla) RNN layer.
- */
-source("nn/layers/tanh.dml") as tanh
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
- boolean return_sequences, matrix[double] out0)
- return (matrix[double] out, matrix[double] cache_out) {
- /*
- * Computes the forward pass for a simple RNN layer with M neurons.
- * The input data has N sequences of T examples, each with D features.
- *
- * In a simple RNN, the output of the previous timestep is fed back
- * in as an additional input at the current timestep.
- *
- * Inputs:
- * - X: Inputs, of shape (N, T*D).
- * - W: Weights, of shape (D+M, M).
- * - b: Biases, of shape (1, M).
- * - T: Length of example sequences (number of timesteps).
- * - D: Dimensionality of the input features (number of features).
- * - return_sequences: Whether to return `out` at all timesteps,
- * or just for the final timestep.
- * - out0: Output matrix from previous timestep, of shape (N, M).
- * Note: This is *optional* and could just be an all-zeros matrix of shape (N, M).
- *
- * Outputs:
- * - out: If `return_sequences` is True, outputs for all timesteps,
- * of shape (N, T*M). Else, outputs for the final timestep, of
- * shape (N, M).
- * - cache_out: Cache of outputs, of shape (T, N*M).
- * Note: This is used for performance during training.
- */
- N = nrow(X)
- M = ncol(W)
- out_prev = out0
- if (return_sequences) {
- out = matrix(0, rows=N, cols=T*M)
- }
- else {
- out = matrix(0, rows=N, cols=M)
- }
- # caches to be used during the backward pass for performance
- cache_out = matrix(0, rows=T, cols=N*M)
-
- for (t in 1:T) { # each timestep
- X_t = X[,(t-1)*D+1:t*D] # shape (N, D)
- input = cbind(X_t, out_prev) # shape (N, D+M)
- out_t = tanh::forward(input %*% W + b) # shape (N, M)
- # store
- if (return_sequences) {
- out[,(t-1)*M+1:t*M] = out_t
- }
- else {
- out = out_t
- }
- out_prev = out_t
- cache_out[t,] = matrix(out_t, rows=1, cols=N*M) # reshape
- }
-}
-
-backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, matrix[double] b,
- int T, int D, boolean given_sequences, matrix[double] out0,
- matrix[double] cache_out)
- return (matrix[double] dX, matrix[double] dW, matrix[double] db, matrix[double] dout0) {
- /*
- * Computes the backward pass for a simple RNN layer with M neurons.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream. If `given_sequences`
- * is True, contains gradients on outputs for all timesteps,
- * of shape (N, T*M). Else, contains gradient on output for
- * the final timestep, of shape (N, M).
- * - X: Inputs, of shape (N, T*D).
- * - W: Weights, of shape (D+M, M).
- * - b: Biases, of shape (1, M).
- * - T: Length of example sequences (number of timesteps).
- * - D: Dimensionality of the input features (number of features).
- * - given_sequences: Whether `dout` is for all timesteps,
- * or just for the final timestep. This is based on whether
- * `return_sequences` was true in the forward pass.
- * - out0: Output matrix from previous timestep, of shape (N, M).
- * Note: This is *optional* and could just be an all-zeros matrix of shape (N, M).
- * - cache_out: Cache of outputs, of shape (T, N*M).
- * Note: This is used for performance during training.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, T*D).
- * - dW: Gradient wrt `W`, of shape (D+M, M).
- * - db: Gradient wrt `b`, of shape (1, M).
- * - dout0: Gradient wrt `out0`, of shape (N, M).
- */
- N = nrow(X)
- M = ncol(W)
- dX = matrix(0, rows=N, cols=T*D)
- dW = matrix(0, rows=D+M, cols=M)
- db = matrix(0, rows=1, cols=M)
- dout0 = matrix(0, rows=N, cols=M)
- if (!given_sequences) {
- # only given dout for output at final timestep, so prepend empty douts for all other timesteps
- dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout) # shape (N, T*M)
- }
-
- t = T
- for (iter in 1:T) { # each timestep in reverse order
- X_t = X[,(t-1)*D+1:t*D] # shape (N, D)
- dout_t = dout[,(t-1)*M+1:t*M] # shape (N, M)
- out_t = matrix(cache_out[t,], rows=N, cols=M) # shape (N, M)
- if (t == 1) {
- out_prev = out0 # shape (N, M)
- }
- else {
- out_prev = matrix(cache_out[t-1,], rows=N, cols=M) # shape (N, M)
- }
- input = cbind(X_t, out_prev) # shape (N, D+M)
- dout_t_raw = (1-out_t^2) * dout_t # into tanh, shape (N, M)
- dW = dW + t(input) %*% dout_t_raw # shape (D+M, M)
- db = db + colSums(dout_t_raw) # shape (1, M)
- dinput = dout_t_raw %*% t(W) # shape (N, D+M)
- dX[,(t-1)*D+1:t*D] = dinput[,1:D]
- dout_prev = dinput[,D+1:D+M] # shape (N, M)
- if (t == 1) {
- dout0 = dout_prev # shape (N, M)
- }
- else {
- dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev # shape (N, M)
- }
- t = t - 1
- }
-}
-
-init = function(int N, int D, int M)
- return (matrix[double] W, matrix[double] b, matrix[double] out0) {
- /*
- * Initialize the parameters of this layer.
- *
- * Note: This is just a convenience function, and parameters
- * may be initialized manually if needed.
- *
- * We use the Glorot uniform heuristic which limits the magnification
- * of inputs/gradients during forward/backward passes by scaling
- * uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
- * - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
- *
- * Inputs:
- * - N: Number of examples in batch.
- * - D: Dimensionality of the input features (number of features).
- * - M: Number of neurons in this layer.
- *
- * Outputs:
- * - W: Weights, of shape (D+M, M).
- * - b: Biases, of shape (1, M).
- * - out0: Empty previous timestep output matrix, of shape (N, M).
- */
- fan_in = D+M
- fan_out = M
- scale = sqrt(6/(fan_in+fan_out))
- W = rand(rows=D+M, cols=M, min=-scale, max=scale, pdf="uniform")
- b = matrix(0, rows=1, cols=M)
- out0 = matrix(0, rows=N, cols=M)
-}
-
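A minimal usage sketch for this layer (shapes and data are hypothetical), wiring `init`, `forward`, and `backward` together the way the gradient checks do:

  source("nn/layers/rnn.dml") as rnn

  N = 4  # examples
  T = 3  # timesteps
  D = 5  # input features
  M = 2  # neurons
  X = rand(rows=N, cols=T*D)
  [W, b, out0] = rnn::init(N, D, M)
  [out, cache_out] = rnn::forward(X, W, b, T, D, TRUE, out0)  # out: (N, T*M)
  dout = rand(rows=N, cols=T*M)  # stand-in upstream gradient
  [dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, TRUE, out0, cache_out)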
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml b/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml
deleted file mode 100644
index 7e162a3..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml
+++ /dev/null
@@ -1,95 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 1D Scale & Shift layer.
- */
-
-forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta)
- return (matrix[double] out) {
- /*
- * Computes the forward pass for a 1D scale & shift layer. The input
- * data has N examples, each with D features.
- *
- * A 1D scale & shift layer introduces learnable parameters
- * (gamma, beta) to scale and shift the input on a per-feature basis.
- *
- * `y = x*gamma + beta`
- *
- * Inputs:
- * - X: Inputs, of shape (N, D).
- * - gamma: Scale parameters, of shape (1, D).
- * - beta: Shift parameters, of shape (1, D).
- *
- * Outputs:
- * - out: Outputs, of shape (N, D).
- */
- # Scale and shift
- out = X*gamma + beta # shape (N, D)
-}
-
-backward = function(matrix[double] dout, matrix[double] out,
- matrix[double] X, matrix[double] gamma, matrix[double] beta)
- return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
- /*
- * Computes the backward pass for a 1D scale & shift layer.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of shape (N, D).
- * - out: Outputs from the forward pass, of shape (N, D).
- * - X: Inputs, of shape (N, D).
- * - gamma: Scale parameters, of shape (1, D).
- * - beta: Shift parameters, of shape (1, D).
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, D).
- * - dgamma: Gradient wrt `gamma`, of shape (1, D).
- * - dbeta: Gradient wrt `beta`, of shape (1, D).
- *
- */
- # Compute gradients during training
- dgamma = colSums(dout*X) # shape (1, D)
- dbeta = colSums(dout) # shape (1, D)
- dX = dout * gamma # shape (N, D)
-}
-
-init = function(int D)
- return (matrix[double] gamma, matrix[double] beta) {
- /*
- * Initialize the parameters of this layer.
- *
- * By default, we initialize to an identity function, with a scale
- * filler of `1`, and a shift filler of `0`.
- *
- * Note: This is just a convenience function, and parameters
- * may be initialized manually if needed.
- *
- * Inputs:
- * - D: Dimensionality of the input features (number of features).
- *
- * Outputs:
- * - gamma: Scale parameters, of shape (1, D).
- * - beta: Shift parameters, of shape (1, D).
- */
- gamma = matrix(1, rows=1, cols=D)
- beta = matrix(0, rows=1, cols=D)
-}
-
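Because the forward pass is a per-feature affine transform broadcast over rows, a tiny sketch (hypothetical shapes) shows the whole lifecycle; under the default init the layer is the identity:

  source("nn/layers/scale_shift1d.dml") as scale_shift1d

  N = 2
  D = 3
  X = rand(rows=N, cols=D)
  [gamma, beta] = scale_shift1d::init(D)        # gamma=1, beta=0 => identity
  out = scale_shift1d::forward(X, gamma, beta)  # equals X under the default init
  dout = rand(rows=N, cols=D)                   # stand-in upstream gradient
  [dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta)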
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml b/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml
deleted file mode 100644
index 79c884a..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml
+++ /dev/null
@@ -1,107 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Scale & Shift layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
- int C, int Hin, int Win)
- return (matrix[double] out) {
- /*
- * Computes the forward pass for a 2D scale & shift layer. The input
- * data has N examples, each represented as a 3D volume unrolled into
- * a single vector.
- *
- * A 2D scale & shift layer introduces learnable parameters
- * (gamma, beta) to scale and shift the input on a per-channel basis.
- *
- * `y = x*gamma + beta`
- *
- * Inputs:
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - gamma: Scale parameters, of shape (C, 1).
- * - beta: Shift parameters, of shape (C, 1).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- *
- * Outputs:
- * - out: Outputs, of shape (N, C*Hin*Win).
- */
- # Scale and shift
- scaled = bias_multiply(X, gamma) # shape (N, C*Hin*Win)
- out = bias_add(scaled, beta) # shape (N, C*Hin*Win)
-}
-
-backward = function(matrix[double] dout, matrix[double] out,
- matrix[double] X, matrix[double] gamma, matrix[double] beta,
- int C, int Hin, int Win)
- return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
- /*
- * Computes the backward pass for a 2D scale & shift layer.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
- * - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
- * - X: Input data matrix to the forward pass, of
- * shape (N, C*Hin*Win).
- * - gamma: Scale parameters, of shape (C, 1).
- * - beta: Shift parameters, of shape (C, 1).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
- * - dgamma: Gradient wrt `gamma`, of shape (C, 1).
- * - dbeta: Gradient wrt `beta`, of shape (C, 1).
- *
- */
- # Compute gradients during training
- dgamma = util::channel_sums(dout*X, C, Hin, Win) # shape (C, 1)
- dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1)
- dX = bias_multiply(dout, gamma) # shape (N, C*Hin*Win)
-}
-
-init = function(int C)
- return (matrix[double] gamma, matrix[double] beta) {
- /*
- * Initialize the parameters of this layer.
- *
- * By default, we initialize to an identity function, with a scale
- * filler of `1`, and a shift filler of `0`.
- *
- * Note: This is just a convenience function, and parameters
- * may be initialized manually if needed.
- *
- * Inputs:
- * - C: Number of input channels (dimensionality of input depth).
- *
- * Outputs:
- * - gamma: Scale parameters, of shape (C, 1).
- * - beta: Shift parameters, of shape (C, 1).
- */
- gamma = matrix(1, rows=C, cols=1)
- beta = matrix(0, rows=C, cols=1)
-}
-
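The 2D variant applies the same idea per channel via `bias_multiply`/`bias_add`, so `gamma` and `beta` are (C, 1) rather than (1, D); a sketch with hypothetical sizes:

  source("nn/layers/scale_shift2d.dml") as scale_shift2d

  N = 2
  C = 3
  Hin = 4
  Win = 4
  X = rand(rows=N, cols=C*Hin*Win)
  [gamma, beta] = scale_shift2d::init(C)                     # (C, 1) each
  out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)  # per-channel scale & shift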
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml b/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
deleted file mode 100644
index 2d85adc..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
+++ /dev/null
@@ -1,62 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Sigmoid nonlinearity layer.
- */
-
-forward = function(matrix[double] X)
- return (matrix[double] out) {
- /*
- * Computes the forward pass for a sigmoid nonlinearity layer.
- *
- * `sigmoid(x) = 1 / (1 + e^-x)`
- *
- * If `X` contains a single feature column, the output of a sigmoid
- * layer can be interpreted as a predicted probability of a true
- * class when paired with a log loss function in a binary
- * classification problem.
- *
- * Inputs:
- * - X: Inputs, of shape (any, any).
- *
- * Outputs:
- * - out: Outputs, of same shape as `X`.
- */
- out = 1 / (1+exp(-X))
-}
-
-backward = function(matrix[double] dout, matrix[double] X)
- return (matrix[double] dX) {
- /*
- * Computes the backward pass for a sigmoid nonlinearity layer.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of same shape as `X`.
- * - X: Inputs, of shape (any, any).
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of same shape as `X`.
- */
- out = 1 / (1+exp(-X))
- dX = out * (1-out) * dout
-}
-
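Note that `backward` recomputes the forward activation and applies sigmoid'(x) = sigmoid(x)*(1-sigmoid(x)). A one-cell sanity check (hypothetical, not part of the commit):

  source("nn/layers/sigmoid.dml") as sigmoid

  X = matrix(0, rows=1, cols=1)
  out = sigmoid::forward(X)                             # sigmoid(0) = 0.5
  dX = sigmoid::backward(matrix(1, rows=1, cols=1), X)  # 0.5*(1-0.5) = 0.25
  print(toString(dX))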
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/softmax.dml b/scripts/staging/SystemML-NN/nn/layers/softmax.dml
deleted file mode 100644
index 68a7bc7..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/softmax.dml
+++ /dev/null
@@ -1,87 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Softmax classifier layer.
- */
-
-forward = function(matrix[double] scores)
- return (matrix[double] probs) {
- /*
- * Computes the forward pass for a softmax classifier. The inputs
- * are interpreted as unnormalized, log-probabilities for each of
- * N examples, and the softmax function transforms them to normalized
- * probabilities.
- *
- * This can be interpreted as a generalization of the sigmoid
- * function to multiple classes.
- *
- * `probs_ij = e^scores_ij / sum(e^scores_i)`
- *
- * Inputs:
- * - scores: Inputs, of shape (N, D).
- *
- * Outputs:
- * - probs: Outputs, of shape (N, D).
- */
- # For numerical stability, we subtract the max score of an example from all scores for that
- # example. This is equivalent to the original formulation:
- # e^scores_i / sum(e^scores_i) == C*e^scores_i / C*sum(e^scores_i)
- # == e^(scores_i+log(C)) / sum(e^(scores_i+log(C)))
- # set log(C) = -max(scores_i):
- # == e^(scores_i-max(scores_i)) / sum(e^(scores_i-max(scores_i)))
- scores = scores - rowMaxs(scores) # numerical stability
- unnorm_probs = exp(scores) # unnormalized probabilities
- probs = unnorm_probs / rowSums(unnorm_probs) # normalized probabilities
-}
-
-backward = function(matrix[double] dprobs, matrix[double] scores)
- return (matrix[double] dscores) {
- /*
- * Computes the backward pass for a softmax classifier.
- *
- * Note that dscores_ij has multiple source branches:
- *
- * ```
- * dprobs_ij/dscores_ij = probs_ij * (1 - probs_ij)
- * dprobs_ik/dscores_ij = -probs_ik * probs_ij, for all k != j
- *
- * dloss/dscores_ij =
- * (dloss/dprobs_ij * dprobs_ij/dscores_ij)
- * + sum_{k!=j}(dloss/dprobs_ik * dprobs_ik/dscores_ij)
- * ```
- *
- * Inputs:
- * - dprobs: Gradient wrt `probs` from upstream, of shape (N, D).
- * - scores: Inputs, of shape (N, D).
- *
- * Outputs:
- * - dscores: Gradient wrt `scores`, of shape (N, D).
- */
- scores = scores - rowMaxs(scores) # numerical stability
- unnorm_probs = exp(scores) # unnormalized probabilities
- probs = unnorm_probs / rowSums(unnorm_probs) # normalized probabilities
- # After some cancellation:
- # dscores = dprobs*probs - probs*rowSums(dprobs*probs)
- dtemp = dprobs * probs
- dscores = dtemp - probs*rowSums(dtemp)
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/tanh.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/tanh.dml b/scripts/staging/SystemML-NN/nn/layers/tanh.dml
deleted file mode 100644
index d849d70..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/tanh.dml
+++ /dev/null
@@ -1,65 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Tanh nonlinearity layer.
- */
-source("nn/layers/sigmoid.dml") as sigmoid
-
-forward = function(matrix[double] X)
- return (matrix[double] out) {
- /*
- * Computes the forward pass for a tanh nonlinearity layer.
- *
- * ```
- * tanh(x) = (e^x - e^-x) / (e^x + e^-x)
- * = 2 * sigmoid(2x) - 1
- * ```
- *
- * Inputs:
- * - X: Inputs, of shape (any, any).
- *
- * Outputs:
- * - out: Outputs, of same shape as `X`.
- */
- # out = (exp(X) - exp(-X)) / (exp(X) + exp(-X))
- # Simplification of the above formulation to use the sigmoid function:
- sigma2X = sigmoid::forward(2*X)
- out = 2*sigma2X - 1
-}
-
-backward = function(matrix[double] dout, matrix[double] X)
- return (matrix[double] dX) {
- /*
- * Computes the backward pass for a tanh nonlinearity layer.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of same shape as `X`.
- * - X: Inputs, of shape (any, any).
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of same shape as `X`.
- */
- sigma2X = sigmoid::forward(2*X)
- out = 2*sigma2X - 1
- dX = (1-out^2) * dout
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml b/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
deleted file mode 100644
index 85b1c41..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
+++ /dev/null
@@ -1,77 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Adagrad optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr, double epsilon,
- matrix[double] cache)
- return (matrix[double] X, matrix[double] cache) {
- /*
- * Performs an Adagrad update.
- *
- * This is an adaptive learning rate optimizer that maintains the
- * sum of squared gradients to automatically adjust the effective
- * learning rate.
- *
- * Reference:
- * - Adaptive Subgradient Methods for Online Learning and Stochastic
- * Optimization, Duchi et al.
- * - http://jmlr.org/papers/v12/duchi11a.html
- *
- * Inputs:
- * - X: Parameters to update, of shape (any, any).
- * - dX: Gradient wrt `X` of a loss function being optimized, of
- * same shape as `X`.
- * - lr: Learning rate.
- * - epsilon: Smoothing term to avoid divide by zero errors.
- * Typical values are in the range of [1e-8, 1e-4].
- * - cache: State that maintains per-parameter sum of squared
- * gradients, of same shape as `X`.
- *
- * Outputs:
- * - X: Updated parameters `X`, of same shape as input `X`.
- * - cache: State that maintains per-parameter sum of squared
- * gradients, of same shape as `X`.
- */
- cache = cache + dX^2
- X = X - (lr * dX / (sqrt(cache)+epsilon))
-}
-
-init = function(matrix[double] X)
- return (matrix[double] cache) {
- /*
- * Initialize the state for this optimizer.
- *
- * Note: This is just a convenience function, and state
- * may be initialized manually if needed.
- *
- * Inputs:
- * - X: Parameters to update, of shape (any, any).
- *
- * Outputs:
- * - cache: State that maintains per-parameter sum of squared
- * gradients, of same shape as `X`.
- */
- cache = matrix(0, rows=nrow(X), cols=ncol(X))
-}
-
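A short usage sketch (hypothetical shapes and hyperparameters); `cache` must be threaded through successive calls so the squared gradients keep accumulating:

  source("nn/optim/adagrad.dml") as adagrad

  X = rand(rows=10, cols=5)     # parameters
  cache = adagrad::init(X)
  for (i in 1:3) {
    dX = rand(rows=10, cols=5)  # stand-in gradient from backprop
    [X, cache] = adagrad::update(X, dX, 0.01, 1e-8, cache)
  }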
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/adam.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/adam.dml b/scripts/staging/SystemML-NN/nn/optim/adam.dml
deleted file mode 100644
index 4b6fa2a..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/adam.dml
+++ /dev/null
@@ -1,97 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Adam optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr, double beta1, double beta2,
- double epsilon, int t, matrix[double] m, matrix[double] v)
- return (matrix[double] X, matrix[double] m, matrix[double] v) {
- /*
- * Performs an Adam update.
- *
- * Reference:
- * - Adam: A Method for Stochastic Optimization, Kingma, Ba.
- * - http://arxiv.org/abs/1412.6980
- *
- * Inputs:
- * - X: Parameters to update, of shape (any, any).
- * - dX: Gradient wrt `X` of a loss function being optimized, of
- * same shape as `X`.
- * - lr: Learning rate. Recommended value is 0.001.
- * - beta1: Exponential decay rate for the 1st moment estimates.
- * Recommended value is 0.9.
- * - beta2: Exponential decay rate for the 2nd moment estimates.
- * Recommended value is 0.999.
- * - epsilon: Smoothing term to avoid divide by zero errors.
- * Recommended value is 1e-8.
- * - t: Timestep, starting at 0.
- * - m: State containing the 1st moment (mean) estimate by
- * maintaining exponential moving averages of the gradients, of
- * same shape as `X`.
- * - v: State containing the 2nd raw moment (uncentered variance)
- * estimate by maintaining exponential moving averages of the
- * squared gradients, of same shape as `X`.
- *
- * Outputs:
- * - X: Updated parameters `X`, of same shape as input `X`.
- * - m: Updated state containing the 1st moment (mean) estimate by
- * maintaining exponential moving averages of the gradients, of
- * same shape as `X`.
- * - v: Updated state containing the 2nd raw moment (uncentered
- * variance) estimate by maintaining exponential moving averages
- * of the squared gradients, of same shape as `X`.
- */
- t = t + 1
- m = beta1*m + (1-beta1)*dX # update biased 1st moment estimate
- v = beta2*v + (1-beta2)*dX^2 # update biased 2nd raw moment estimate
- # m = m / (1-beta1^t) # compute bias-corrected 1st moment estimate
- # v = v / (1-beta2^t) # compute bias-corrected 2nd raw moment estimate
- # X = X - (lr * m / (sqrt(v)+epsilon)) # param update
- # Simplified for computational efficiency:
- lr = lr * sqrt(1-beta2^t) / (1-beta1^t)
- X = X - (lr * m / (sqrt(v)+epsilon))
-}
-
-init = function(matrix[double] X)
- return (matrix[double] m, matrix[double] v) {
- /*
- * Initialize the state for this optimizer.
- *
- * Note: This is just a convenience function, and state
- * may be initialized manually if needed.
- *
- * Inputs:
- * - X: Parameters to update, of shape (any, any).
- *
- * Outputs:
- * - m: Initial state containing the 1st moment (mean) estimate by
- * maintaining exponential moving averages of the gradients, of
- * same shape as `X`.
- * - v: Initial state containing the 2nd raw moment (uncentered
- * variance) estimate by maintaining exponential moving averages
- * of the squared gradients, of same shape as `X`.
- */
- m = matrix(0, rows=nrow(X), cols=ncol(X))
- v = matrix(0, rows=nrow(X), cols=ncol(X))
-}
-
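To spell out the "simplified for computational efficiency" step above: substituting the bias-corrected moments into the commented-out update gives

  X - lr * (m/(1-beta1^t)) / (sqrt(v/(1-beta2^t)) + epsilon)
    = X - (lr * sqrt(1-beta2^t)/(1-beta1^t)) * m / (sqrt(v) + epsilon*sqrt(1-beta2^t))

so folding the correction factors into `lr` is exact up to the epsilon term, which is no longer scaled by sqrt(1-beta2^t); this appears to match the efficient form suggested in section 2 of the Adam paper.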
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml b/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
deleted file mode 100644
index 1feccaf..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
+++ /dev/null
@@ -1,79 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * RMSprop optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr, double decay_rate,
- double epsilon, matrix[double] cache)
- return (matrix[double] X, matrix[double] cache) {
- /*
- * Performs an RMSprop update.
- *
- * This is an adaptive learning rate optimizer that can be viewed
- * as an adjustment of the Adagrad method to use a moving average
- * of the sum of squared gradients in order to improve convergence.
- *
- * Reference:
- * - Neural Networks for Machine Learning, Lecture 6a, Hinton,
- * slide 29.
- * - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
- *
- * Inputs:
- * - X: Parameters to update, of shape (any, any).
- * - dX: Gradient wrt `X` of a loss function being optimized, of
- * same shape as `X`.
- * - lr: Learning rate.
- * - decay_rate: Term controlling the rate of the moving average.
- * Typical values are in the range of [0.9, 0.999].
- * - epsilon: Smoothing term to avoid divide by zero errors.
- * Typical values are in the range of [1e-8, 1e-4].
- * - cache: State that maintains the moving average of the squared
- * gradients, of same shape as `X`.
- *
- * Outputs:
- * - X: Updated parameters `X`, of same shape as input `X`.
- * - cache: Updated state that maintains the moving average of the
- * squared gradients, of same shape as `X`.
- */
- cache = decay_rate*cache + (1-decay_rate)*dX^2
- X = X - (lr * dX / (sqrt(cache)+epsilon))
-}
-
-init = function(matrix[double] X)
- return (matrix[double] cache) {
- /*
- * Initialize the state for this optimizer.
- *
- * Note: This is just a convenience function, and state
- * may be initialized manually if needed.
- *
- * Inputs:
- * - X: Parameters to update, of shape (any, any).
- *
- * Outputs:
- * - cache: State that maintains the moving average of the squared
- * gradients, of same shape as `X`.
- */
- cache = matrix(0, rows=nrow(X), cols=ncol(X))
-}
-
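Usage mirrors Adagrad, with the extra `decay_rate` argument controlling the moving average (hypothetical values):

  source("nn/optim/rmsprop.dml") as rmsprop

  X = rand(rows=10, cols=5)
  cache = rmsprop::init(X)
  dX = rand(rows=10, cols=5)  # stand-in gradient
  [X, cache] = rmsprop::update(X, dX, 0.001, 0.99, 1e-8, cache)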
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/sgd.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd.dml b/scripts/staging/SystemML-NN/nn/optim/sgd.dml
deleted file mode 100644
index 3ba7eba..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/sgd.dml
+++ /dev/null
@@ -1,42 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Stochastic Gradient Descent (SGD) optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr)
- return (matrix[double] X) {
- /*
- * Performs a vanilla SGD update.
- *
- * Inputs:
- * - X: Parameters to update, of shape (any, any).
- * - dX: Gradient wrt `X` of a loss function being optimized, of
- * same shape as `X`.
- * - lr: Learning rate.
- *
- * Outputs:
- * - X: Updated parameters `X`, of same shape as input `X`.
- */
- X = X - lr*dX
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
deleted file mode 100644
index 85922da..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
+++ /dev/null
@@ -1,71 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Stochastic Gradient Descent with momentum (SGD-momentum) optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v)
- return (matrix[double] X, matrix[double] v) {
- /*
- * Performs an SGD update with momentum.
- *
- * In SGD with momentum, we assume that the parameters have a velocity
- * that continues with some momentum, and that is influenced by the
- * gradient.
- *
- * Inputs:
- * - X: Parameters to update, of shape (any, any).
- * - dX: Gradient wrt `X` of a loss function being optimized, of
- * same shape as `X`.
- * - lr: Learning rate.
- * - mu: Momentum value.
- * Typical values are in the range of [0.5, 0.99], usually
- * started at the lower end and annealed towards the higher end.
- * - v: State maintaining the velocity of the parameters `X`, of same
- * shape as `X`.
- *
- * Outputs:
- * - X: Updated parameters `X`, of same shape as input `X`.
- * - v: Updated velocity of the parameters `X`, of same shape as
- * input `X`.
- */
- v = mu*v - lr*dX # update velocity
- X = X + v # update position
-}
-
-init = function(matrix[double] X)
- return (matrix[double] v) {
- /*
- * Initialize the state for this optimizer.
- *
- * Note: This is just a convenience function, and state
- * may be initialized manually if needed.
- *
- * Inputs:
- * - X: Parameters to update, of shape (any, any).
- *
- * Outputs:
- * - v: Initial velocity of the parameters `X`.
- */
- v = matrix(0, rows=nrow(X), cols=ncol(X))
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
deleted file mode 100644
index 3b62c6e..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
+++ /dev/null
@@ -1,81 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Stochastic Gradient Descent with Nesterov momentum (SGD-Nesterov) optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v)
- return (matrix[double] X, matrix[double] v) {
- /*
- * Performs an SGD update with Nesterov momentum.
- *
- * As with regular SGD with momentum, in SGD with Nesterov momentum,
- * we assume that the parameters have a velocity that continues
- * with some momentum, and that is influenced by the gradient.
- * In this view specifically, we perform the position update from the
- * position that the momentum is about to carry the parameters to,
- * rather than from the previous position. Additionally, we always
- * store the parameters in their position after momentum.
- *
- * Reference:
- * - Advances in optimizing Recurrent Networks, Bengio et al.,
- * section 3.5.
- * - http://arxiv.org/abs/1212.0901
- *
- * Inputs:
- * - X: Parameters to update, of shape (any, any).
- * - dX: Gradient wrt `X` of a loss function being optimized, of
- * same shape as `X`.
- * - lr: Learning rate.
- * - mu: Momentum value.
- * Typical values are in the range of [0.5, 0.99], usually
- * started at the lower end and annealed towards the higher end.
- * - v: State maintaining the velocity of the parameters `X`, of same
- * shape as `X`.
- *
- * Outputs:
- * - X: Updated parameters `X`, of same shape as input `X`.
- * - v: Updated velocity of the parameters `X`, of same shape as
- * input `v`.
- */
- v_prev = v
- v = mu*v - lr*dX # update velocity
- X = X - mu*v_prev + (1+mu)*v # update position, including momentum
-}
-
-init = function(matrix[double] X)
- return (matrix[double] v) {
- /*
- * Initialize the state for this optimizer.
- *
- * Note: This is just a convenience function, and state
- * may be initialized manually if needed.
- *
- * Inputs:
- * - X: Parameters to update, of shape (any, any).
- *
- * Outputs:
- * - v: Initial velocity of the parameters `X`.
- */
- v = matrix(0, rows=nrow(X), cols=ncol(X))
-}
-
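To see where the two-term position update comes from, let x be the true parameters and X = x + mu*v the stored lookahead position (an expansion of the docstring above, not part of the commit):

  v_new = mu*v - lr*dX        # velocity, using the gradient at the lookahead point
  x_new = x + v_new           # true position update
  X_new = x_new + mu*v_new    # re-store at the new lookahead position
        = (X - mu*v) + (1+mu)*v_new

which is exactly `X - mu*v_prev + (1+mu)*v` in `update` above.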
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/README.md
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/README.md b/scripts/staging/SystemML-NN/nn/test/README.md
deleted file mode 100644
index b714d50..0000000
--- a/scripts/staging/SystemML-NN/nn/test/README.md
+++ /dev/null
@@ -1,32 +0,0 @@
-<!--
-{% comment %}
-Licensed to the Apache Software Foundation (ASF) under one or more
-contributor license agreements. See the NOTICE file distributed with
-this work for additional information regarding copyright ownership.
-The ASF licenses this file to you under the Apache License, Version 2.0
-(the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-{% endcomment %}
--->
-
-# SystemML-NN Tests
-
-#### This folder contains tests for the *SystemML-NN* (`nn`) deep learning library.
-
----
-## Tests
-#### All layers are tested for correct derivatives ("gradient-checking"), and many layers also have correctness tests against simpler reference implementations.
-* `grad_check.dml` - Contains gradient-checks for all layers as individual DML functions.
-* `test.dml` - Contains correctness tests for several of the more complicated layers by checking against simple reference implementations, such as `conv_simple.dml`. All tests are formulated as individual DML functions.
-* `tests.dml` - A DML script that runs all of the tests in `grad_check.dml` and `test.dml`.
-
-## Execution
-* `spark-submit SystemML.jar -f nn/test/tests.dml` from the base of the project.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml b/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
deleted file mode 100644
index 9f126d0..0000000
--- a/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
+++ /dev/null
@@ -1,213 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Convolutional layer.
- *
- * This implementation is intended to be a simple, reference version.
- */
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
- int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] out, int Hout, int Wout) {
- /*
- * Computes the forward pass for a 2D spatial convolutional layer with
- * F filters. The input data has N examples, each represented as a 3D
- * volume unrolled into a single vector.
- *
- * This implementation is intended to be a simple, reference version.
- *
- * Inputs:
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - W: Weights, of shape (F, C*Hf*Wf).
- * - b: Biases, of shape (F, 1).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * - padw: Padding for left and right sides.
- *
- * Outputs:
- * - out: Outputs, of shape (N, F*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- */
- N = nrow(X)
- F = nrow(W)
- Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
- Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
- # Create output volume
- out = matrix(0, rows=N, cols=F*Hout*Wout)
-
- # Convolution - Simple reference implementation
- parfor (n in 1:N) { # all examples
- Xn = matrix(X[n,], rows=C, cols=Hin*Win)
- # Pad image
- Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # zeros
- parfor (c in 1:C) {
- Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice C reshaped
- Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
- Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
- Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape
- }
- # Convolve image with filters
- parfor (f in 1:F, check=0) { # all filters
- parfor (hout in 1:Hout, check=0) { # all output rows
- h0 = (hout-1)*strideh + 1
- parfor (wout in 1:Wout, check=0) { # all output columns
- w0 = (wout-1)*stridew + 1
- # Create a patch of the input example corresponding spatially to the filter sizes
- Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf) # zeros
- parfor (c in 1:C, check=0) {
- Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw) # reshape
- Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf], rows=1,
- cols=Hf*Wf) # reshape
- }
- out[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout] =
- W[f,] %*% matrix(Xn_padded_patch, rows=C*Hf*Wf, cols=1) + b[f,]
- }
- }
- }
- }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout,
- matrix[double] X, matrix[double] W, matrix[double] b,
- int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
- /*
- * Computes the backward pass for a 2D spatial convolutional layer
- * with F filters.
- *
- * This implementation is intended to be a simple, reference version.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of
- * shape (N, F*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - W: Weights, of shape (F, C*Hf*Wf).
- * - b: Biases, of shape (F, 1).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * - padw: Padding for left and right sides.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
- * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
- * - db: Gradient wrt `b`, of shape (F, 1).
- */
- N = nrow(X)
- F = nrow(W)
-
- # Create gradient volumes
- dX = matrix(0, rows=N, cols=C*Hin*Win)
- dW = matrix(0, rows=F, cols=C*Hf*Wf)
- db = matrix(0, rows=F, cols=1)
-
- # Partial derivatives for convolution - Simple reference implementation
- for (n in 1:N) { # all examples
- Xn = matrix(X[n,], rows=C, cols=Hin*Win)
- # Pad image
- Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # zeros
- parfor (c in 1:C) {
- Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice C reshaped
- Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
- Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
- Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape
- }
- dXn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
- for (f in 1:F) { # all filters
- for (hout in 1:Hout) { # all output rows
- h0 = (hout-1) * strideh + 1
- for (wout in 1:Wout) { # all output columns
- w0 = (wout-1) * stridew + 1
- # Create a patch of the input example corresponding spatially to the filter sizes
- Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf) # zeros
- dXn_padded_patch = matrix(W[f,] * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout],
- rows=C, cols=Hf*Wf) # reshape
- for (c in 1:C) {
- Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw) # reshape
- Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf],
- rows=1, cols=Hf*Wf) # reshape
- dXn_padded_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
- dXn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf] = matrix(dXn_padded_patch[c,],
- rows=Hf, cols=Wf) # reshape
- dXn_padded[c,] = dXn_padded[c,] + matrix(dXn_padded_slice,
- rows=1, cols=(Hin+2*padh)*(Win+2*padw))
- }
- dW[f,] = dW[f,]
- + matrix(Xn_padded_patch, rows=1, cols=C*Hf*Wf)
- * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
- db[f,] = db[f,] + dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
- }
- }
- }
- # Unpad derivs on input
- dXn = matrix(0, rows=C, cols=Hin*Win)
- parfor (c in 1:C, check=0) {
- dXn_padded_slice = matrix(dXn_padded[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
- dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
- dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
- }
- dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
- }
-}
-
-init = function(int F, int C, int Hf, int Wf)
- return (matrix[double] W, matrix[double] b) {
- /*
- * Initialize the parameters of this layer.
- *
- * We use the heuristic by He et al., which limits the magnification
- * of inputs/gradients during forward/backward passes by scaling
- * unit-Gaussian weights by a factor of sqrt(2/n), under the
- * assumption of relu neurons.
- * - http://arxiv.org/abs/1502.01852
- *
- * Inputs:
- * - F: Number of filters.
- * - C: Number of input channels (dimensionality of depth).
- * - Hf: Filter height.
- * - Wf: Filter width.
- *
- * Outputs:
- * - W: Weights, of shape (F, C*Hf*Wf).
- * - b: Biases, of shape (F, 1).
- */
- W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
- b = matrix(0, rows=F, cols=1)
-}
-
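A quick numeric check of the output-size formula in the `forward` function above (sizes are hypothetical):

  Hout = floor((28 + 2*2 - 5)/1 + 1) = 28  # 5x5 filter, stride 1, pad 2: "same" height
  Wout = floor((28 + 2*0 - 5)/2 + 1) = 12  # no padding, stride 2 roughly halves the width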
[09/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn`
library to `scripts/nn`
Posted by du...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/softmax.dml b/scripts/nn/layers/softmax.dml
new file mode 100644
index 0000000..68a7bc7
--- /dev/null
+++ b/scripts/nn/layers/softmax.dml
@@ -0,0 +1,87 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Softmax classifier layer.
+ */
+
+forward = function(matrix[double] scores)
+ return (matrix[double] probs) {
+ /*
+ * Computes the forward pass for a softmax classifier. The inputs
+ * are interpreted as unnormalized, log-probabilities for each of
+ * N examples, and the softmax function transforms them to normalized
+ * probabilities.
+ *
+ * This can be interpreted as a generalization of the sigmoid
+ * function to multiple classes.
+ *
+ * `probs_ij = e^scores_ij / sum(e^scores_i)`
+ *
+ * Inputs:
+ * - scores: Inputs, of shape (N, D).
+ *
+ * Outputs:
+ * - probs: Outputs, of shape (N, D).
+ */
+ # For numerical stability, we subtract the max score of an example from all scores for that
+ # example. This is equivalent to the original formulation:
+ # e^scores_i / sum(e^scores_i) == C*e^scores_i / C*sum(e^scores_i)
+ # == e^(scores_i+log(C)) / sum(e^(scores_i+log(C)))
+ # set log(C) = -max(scores_i):
+ # == e^(scores_i-max(scores_i)) / sum(e^(scores_i-max(scores_i)))
+ scores = scores - rowMaxs(scores) # numerical stability
+ unnorm_probs = exp(scores) # unnormalized probabilities
+ probs = unnorm_probs / rowSums(unnorm_probs) # normalized probabilities
+}
+
+backward = function(matrix[double] dprobs, matrix[double] scores)
+ return (matrix[double] dscores) {
+ /*
+ * Computes the backward pass for a softmax classifier.
+ *
+ * Note that dscores_ij has multiple source branches:
+ *
+ * ```
+ * dprobs_ij/dscores_ij = probs_ij * (1 - probs_ij)
+ * dprobs_ik/dscores_ij = -probs_ik * probs_ij, for all k != j
+ *
+ * dloss/dscores_ij =
+ * (dloss/dprobs_ij * dprobs_ij/dscores_ij)
+ * + sum_{k!=j}(dloss/dprobs_ik * dprobs_ik/dscores_ij)
+ * ```
+ *
+ * Inputs:
+ * - dprobs: Gradient wrt `probs` from upstream, of shape (N, D).
+ * - scores: Inputs, of shape (N, D).
+ *
+ * Outputs:
+ * - dscores: Gradient wrt `scores`, of shape (N, D).
+ */
+ scores = scores - rowMaxs(scores) # numerical stability
+ unnorm_probs = exp(scores) # unnormalized probabilities
+ probs = unnorm_probs / rowSums(unnorm_probs) # normalized probabilities
+ # After some cancellation:
+ # dscores = dprobs*probs - probs*rowSums(dprobs*probs)
+ dtemp = dprobs * probs
+ dscores = dtemp - probs*rowSums(dtemp)
+}
+
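The "after some cancellation" comment in `backward` above expands as follows. Combining the two source branches for dscores_ij,

  dscores_ij = dprobs_ij * probs_ij*(1-probs_ij) - sum_{k!=j} dprobs_ik * probs_ik*probs_ij
             = dprobs_ij*probs_ij - probs_ij * sum_k (dprobs_ik * probs_ik)

and the second term is probs * rowSums(dprobs*probs) in matrix form, which is the update the code computes via `dtemp`.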
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/tanh.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/tanh.dml b/scripts/nn/layers/tanh.dml
new file mode 100644
index 0000000..d849d70
--- /dev/null
+++ b/scripts/nn/layers/tanh.dml
@@ -0,0 +1,65 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Tanh nonlinearity layer.
+ */
+source("nn/layers/sigmoid.dml") as sigmoid
+
+forward = function(matrix[double] X)
+ return (matrix[double] out) {
+ /*
+ * Computes the forward pass for a tanh nonlinearity layer.
+ *
+ * ```
+ * tanh(x) = (e^x - e^-x) / (e^x + e^-x)
+ * = 2 * sigmoid(2x) - 1
+ * ```
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ *
+ * Outputs:
+ * - out: Outputs, of same shape as `X`.
+ */
+ # out = (exp(X) - exp(-X)) / (exp(X) + exp(-X))
+ # Simplification of the above formulation to use the sigmoid function:
+ sigma2X = sigmoid::forward(2*X)
+ out = 2*sigma2X - 1
+}
+
+backward = function(matrix[double] dout, matrix[double] X)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for a tanh nonlinearity layer.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+ * - X: Inputs, of shape (any, any).
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of same shape as `X`.
+ */
+ sigma2X = sigmoid::forward(2*X)
+ out = 2*sigma2X - 1
+ dX = (1-out^2) * dout
+}
+
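A small sketch (the input values are arbitrary) checking that the sigmoid-based formulation used above matches the direct definition, and that the backward pass applies the usual `1 - tanh(x)^2` factor:

```
source("nn/layers/tanh.dml") as tanh

X = rand(rows=3, cols=4, min=-2, max=2)
out = tanh::forward(X)
out_direct = (exp(X) - exp(-X)) / (exp(X) + exp(-X))
print(max(abs(out - out_direct)))  # ~0, up to floating-point error

dout = matrix(1, rows=3, cols=4)
dX = tanh::backward(dout, X)       # computes (1 - tanh(X)^2) * dout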
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/adagrad.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/optim/adagrad.dml b/scripts/nn/optim/adagrad.dml
new file mode 100644
index 0000000..85b1c41
--- /dev/null
+++ b/scripts/nn/optim/adagrad.dml
@@ -0,0 +1,77 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Adagrad optimizer.
+ */
+
+update = function(matrix[double] X, matrix[double] dX, double lr, double epsilon,
+ matrix[double] cache)
+ return (matrix[double] X, matrix[double] cache) {
+ /*
+ * Performs an Adagrad update.
+ *
+ * This is an adaptive learning rate optimizer that maintains a
+ * per-parameter sum of squared gradients in order to automatically
+ * adjust the effective learning rate of each parameter.
+ *
+ * Reference:
+ * - Adaptive Subgradient Methods for Online Learning and Stochastic
+ * Optimization, Duchi et al.
+ * - http://jmlr.org/papers/v12/duchi11a.html
+ *
+ * Inputs:
+ * - X: Parameters to update, of shape (any, any).
+ * - dX: Gradient wrt `X` of a loss function being optimized, of
+ * same shape as `X`.
+ * - lr: Learning rate.
+ * - epsilon: Smoothing term to avoid divide by zero errors.
+ * Typical values are in the range of [1e-8, 1e-4].
+ * - cache: State that maintains per-parameter sum of squared
+ * gradients, of same shape as `X`.
+ *
+ * Outputs:
+ * - X: Updated parameters `X`, of same shape as input `X`.
+ * - cache: State that maintains per-parameter sum of squared
+ * gradients, of same shape as `X`.
+ */
+ cache = cache + dX^2
+ X = X - (lr * dX / (sqrt(cache)+epsilon))
+}
+
+init = function(matrix[double] X)
+ return (matrix[double] cache) {
+ /*
+ * Initialize the state for this optimizer.
+ *
+ * Note: This is just a convenience function, and state
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - X: Parameters to update, of shape (any, any).
+ *
+ * Outputs:
+ * - cache: State that maintains per-parameter sum of squared
+ * gradients, of same shape as `X`.
+ */
+ cache = matrix(0, rows=nrow(X), cols=ncol(X))
+}
+
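A hedged usage sketch for the optimizer above (the learning rate, epsilon, and random gradients are stand-ins for a real training loop): the cache accumulates squared gradients, so parameters that consistently receive large gradients take progressively smaller steps.

```
source("nn/optim/adagrad.dml") as adagrad

X = rand(rows=4, cols=3)     # parameters (illustrative)
cache = adagrad::init(X)
lr = 0.01                    # assumed value
epsilon = 1e-8               # assumed value, within the suggested range
for (t in 1:5) {
  dX = rand(rows=4, cols=3)  # stand-in for the gradient of a real loss
  [X, cache] = adagrad::update(X, dX, lr, epsilon, cache)
}
# cache now holds the running sum of squared gradients per parameter.
```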
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/adam.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/optim/adam.dml b/scripts/nn/optim/adam.dml
new file mode 100644
index 0000000..4b6fa2a
--- /dev/null
+++ b/scripts/nn/optim/adam.dml
@@ -0,0 +1,97 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Adam optimizer.
+ */
+
+update = function(matrix[double] X, matrix[double] dX, double lr, double beta1, double beta2,
+ double epsilon, int t, matrix[double] m, matrix[double] v)
+ return (matrix[double] X, matrix[double] m, matrix[double] v) {
+ /*
+ * Performs an Adam update.
+ *
+ * Reference:
+ * - Adam: A Method for Stochastic Optimization, Kingma, Ba.
+ * - http://arxiv.org/abs/1412.6980
+ *
+ * Inputs:
+ * - X: Parameters to update, of shape (any, any).
+ * - dX: Gradient wrt `X` of a loss function being optimized, of
+ * same shape as `X`.
+ * - lr: Learning rate. Recommended value is 0.001.
+ * - beta1: Exponential decay rate for the 1st moment estimates.
+ * Recommended value is 0.9.
+ * - beta2: Exponential decay rate for the 2nd moment estimates.
+ * Recommended value is 0.999.
+ * - epsilon: Smoothing term to avoid divide by zero errors.
+ * Recommended value is 1e-8.
+ * - t: Timestep, starting at 0.
+ * - m: State containing the 1st moment (mean) estimate by
+ * maintaining exponential moving averages of the gradients, of
+ * same shape as `X`.
+ * - v: State containing the 2nd raw moment (uncentered variance)
+ * estimate by maintaining exponential moving averages of the
+ * squared gradients, of same shape as `X`.
+ *
+ * Outputs:
+ * - X: Updated parameters `X`, of same shape as input `X`.
+ * - m: Updated state containing the 1st moment (mean) estimate by
+ * maintaining exponential moving averages of the gradients, of
+ * same shape as `X`.
+ * - v: Updated state containing the 2nd raw moment (uncentered
+ * variance) estimate by maintaining exponential moving averages
+ * of the squared gradients, of same shape as `X`.
+ */
+ t = t + 1
+ m = beta1*m + (1-beta1)*dX # update biased 1st moment estimate
+ v = beta2*v + (1-beta2)*dX^2 # update biased 2nd raw moment estimate
+ # m = m / (1-beta1^t) # compute bias-corrected 1st moment estimate
+ # v = v / (1-beta2^t) # compute bias-corrected 2nd raw moment estimate
+ # X = X - (lr * m / (sqrt(v)+epsilon)) # param update
+ # Simplified for computational efficiency:
+ lr = lr * sqrt(1-beta2^t) / (1-beta1^t)
+ X = X - (lr * m / (sqrt(v)+epsilon))
+}
+
+init = function(matrix[double] X)
+ return (matrix[double] m, matrix[double] v) {
+ /*
+ * Initialize the state for this optimizer.
+ *
+ * Note: This is just a convenience function, and state
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - X: Parameters to update, of shape (any, any).
+ *
+ * Outputs:
+ * - m: Initial state containing the 1st moment (mean) estimate by
+ * maintaining exponential moving averages of the gradients, of
+ * same shape as `X`.
+ * - v: Initial state containing the 2nd raw moment (uncentered
+ * variance) estimate by maintaining exponential moving averages
+ * of the squared gradients, of same shape as `X`.
+ */
+ m = matrix(0, rows=nrow(X), cols=ncol(X))
+ v = matrix(0, rows=nrow(X), cols=ncol(X))
+}
+
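The "simplified for computational efficiency" rewrite in `update()` folds the two bias corrections into the step size. A sketch checking the algebra for the first step (the hyperparameters are the recommended values; `dX` is arbitrary); the two forms agree up to how `epsilon` is scaled, as noted in the Adam paper:

```
beta1 = 0.9
beta2 = 0.999
lr = 0.001
epsilon = 1e-8
t = 1
dX = rand(rows=2, cols=2)
m = (1-beta1)*dX    # first-step moments, since m and v start at 0
v = (1-beta2)*dX^2

# Explicit form: bias-correct the moments, then step
m_hat = m / (1-beta1^t)
v_hat = v / (1-beta2^t)
step_explicit = lr * m_hat / (sqrt(v_hat)+epsilon)

# Folded form, as in update(): rescale the learning rate instead
lr_t = lr * sqrt(1-beta2^t) / (1-beta1^t)
step_folded = lr_t * m / (sqrt(v)+epsilon)

print(max(abs(step_explicit - step_folded)))  # tiny difference from epsilon scaling
```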
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/rmsprop.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/optim/rmsprop.dml b/scripts/nn/optim/rmsprop.dml
new file mode 100644
index 0000000..1feccaf
--- /dev/null
+++ b/scripts/nn/optim/rmsprop.dml
@@ -0,0 +1,79 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * RMSprop optimizer.
+ */
+
+update = function(matrix[double] X, matrix[double] dX, double lr, double decay_rate,
+ double epsilon, matrix[double] cache)
+ return (matrix[double] X, matrix[double] cache) {
+ /*
+ * Performs an RMSprop update.
+ *
+ * This is an adaptive learning rate optimizer that can be viewed
+ * as an adjustment of the Adagrad method to use an exponentially
+ * decaying moving average of the squared gradients, rather than an
+ * ever-growing sum, in order to improve convergence.
+ *
+ * Reference:
+ * - Neural Networks for Machine Learning, Lecture 6a, Hinton,
+ * slide 29.
+ * - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
+ *
+ * Inputs:
+ * - X: Parameters to update, of shape (any, any).
+ * - dX: Gradient wrt `X` of a loss function being optimized, of
+ * same shape as `X`.
+ * - lr: Learning rate.
+ * - decay_rate: Term controlling the rate of the moving average.
+ * Typical values are in the range of [0.9, 0.999].
+ * - epsilon: Smoothing term to avoid divide by zero errors.
+ * Typical values are in the range of [1e-8, 1e-4].
+ * - cache: State that maintains the moving average of the squared
+ * gradients, of same shape as `X`.
+ *
+ * Outputs:
+ * - X: Updated parameters `X`, of same shape as input `X`.
+ * - cache: Updated state that maintains the moving average of the
+ * squared gradients, of same shape as `X`.
+ */
+ cache = decay_rate*cache + (1-decay_rate)*dX^2
+ X = X - (lr * dX / (sqrt(cache)+epsilon))
+}
+
+init = function(matrix[double] X)
+ return (matrix[double] cache) {
+ /*
+ * Initialize the state for this optimizer.
+ *
+ * Note: This is just a convenience function, and state
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - X: Parameters to update, of shape (any, any).
+ *
+ * Outputs:
+ * - cache: State that maintains the moving average of the squared
+ * gradients, of same shape as `X`.
+ */
+ cache = matrix(0, rows=nrow(X), cols=ncol(X))
+}
+
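A sketch contrasting the cache updates of Adagrad and RMSprop (the decay rate and squared gradient are illustrative): with a fixed gradient, Adagrad's cache grows without bound while RMSprop's converges, so only RMSprop's effective step size levels off.

```
decay_rate = 0.99  # assumed value in the typical [0.9, 0.999] range
dX2 = rand(rows=2, cols=2)^2  # a fixed squared gradient

cache_adagrad = matrix(0, rows=2, cols=2)
cache_rmsprop = matrix(0, rows=2, cols=2)
for (t in 1:1000) {
  cache_adagrad = cache_adagrad + dX2                            # grows without bound
  cache_rmsprop = decay_rate*cache_rmsprop + (1-decay_rate)*dX2  # converges toward dX2
}
print(max(cache_adagrad))  # ~1000x the squared gradient
print(max(cache_rmsprop))  # ~the squared gradient itself
```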
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/sgd.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/optim/sgd.dml b/scripts/nn/optim/sgd.dml
new file mode 100644
index 0000000..3ba7eba
--- /dev/null
+++ b/scripts/nn/optim/sgd.dml
@@ -0,0 +1,42 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Stochastic Gradient Descent (SGD) optimizer.
+ */
+
+update = function(matrix[double] X, matrix[double] dX, double lr)
+ return (matrix[double] X) {
+ /*
+ * Performs a vanilla SGD update.
+ *
+ * Inputs:
+ * - X: Parameters to update, of shape (any, any).
+ * - dX: Gradient wrt `X` of a loss function being optimized, of
+ * same shape as `X`.
+ * - lr: Learning rate.
+ *
+ * Outputs:
+ * - X: Updated parameters `X`, of same shape as input `X`.
+ */
+ X = X - lr*dX
+}
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/sgd_momentum.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/optim/sgd_momentum.dml b/scripts/nn/optim/sgd_momentum.dml
new file mode 100644
index 0000000..85922da
--- /dev/null
+++ b/scripts/nn/optim/sgd_momentum.dml
@@ -0,0 +1,71 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Stochastic Gradient Descent with momentum (SGD-momentum) optimizer.
+ */
+
+update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v)
+ return (matrix[double] X, matrix[double] v) {
+ /*
+ * Performs an SGD update with momentum.
+ *
+ * In SGD with momentum, the parameters are given a velocity that
+ * persists across updates with some momentum and is steered by the
+ * gradient.
+ *
+ * Inputs:
+ * - X: Parameters to update, of shape (any, any).
+ * - dX: Gradient wrt `X` of a loss function being optimized, of
+ * same shape as `X`.
+ * - lr: Learning rate.
+ * - mu: Momentum value.
+ * Typical values are in the range of [0.5, 0.99], usually
+ * started at the lower end and annealed towards the higher end.
+ * - v: State maintaining the velocity of the parameters `X`, of same
+ * shape as `X`.
+ *
+ * Outputs:
+ * - X: Updated parameters `X`, of same shape as input `X`.
+ * - v: Updated velocity of the parameters `X`, of same shape as
+ * input `X`.
+ */
+ v = mu*v - lr*dX # update velocity
+ X = X + v # update position
+}
+
+init = function(matrix[double] X)
+ return (matrix[double] v) {
+ /*
+ * Initialize the state for this optimizer.
+ *
+ * Note: This is just a convenience function, and state
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - X: Parameters to update, of shape (any, any).
+ *
+ * Outputs:
+ * - v: Initial velocity of the parameters `X`, of same shape as `X`.
+ */
+ v = matrix(0, rows=nrow(X), cols=ncol(X))
+}
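A sketch of how the velocity behaves under a constant gradient (lr, mu, and the gradient value are illustrative): `v` converges geometrically to `-lr*dX/(1-mu)`, so with mu = 0.9 momentum amplifies a consistent descent direction by up to 10x relative to plain SGD.

```
source("nn/optim/sgd_momentum.dml") as sgd_momentum

X = matrix(0, rows=1, cols=1)
v = sgd_momentum::init(X)
lr = 0.1                        # assumed value
mu = 0.9                        # assumed value
dX = matrix(1, rows=1, cols=1)  # constant gradient
for (t in 1:50) {
  [X, v] = sgd_momentum::update(X, dX, lr, mu, v)
}
print(as.scalar(v))  # approaches -lr/(1-mu) = -1.0
```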
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/sgd_nesterov.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/optim/sgd_nesterov.dml b/scripts/nn/optim/sgd_nesterov.dml
new file mode 100644
index 0000000..3b62c6e
--- /dev/null
+++ b/scripts/nn/optim/sgd_nesterov.dml
@@ -0,0 +1,81 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Stochastic Gradient Descent with Nesterov momentum (SGD-Nesterov) optimizer.
+ */
+
+update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v)
+ return (matrix[double] X, matrix[double] v) {
+ /*
+ * Performs an SGD update with Nesterov momentum.
+ *
+ * As with regular SGD with momentum, in SGD with Nesterov momentum
+ * the parameters are given a velocity that persists across updates
+ * with some momentum and is steered by the gradient. The difference
+ * is that the position update is performed from the position that the
+ * momentum is about to carry the parameters to, rather than from the
+ * previous position. Additionally, the parameters are always stored
+ * at their position after the momentum step.
+ *
+ * Reference:
+ * - Advances in optimizing Recurrent Networks, Bengio et al.,
+ * section 3.5.
+ * - http://arxiv.org/abs/1212.0901
+ *
+ * Inputs:
+ * - X: Parameters to update, of shape (any, any).
+ * - dX: Gradient wrt `X` of a loss function being optimized, of
+ * same shape as `X`.
+ * - lr: Learning rate.
+ * - mu: Momentum value.
+ * Typical values are in the range of [0.5, 0.99], usually
+ * started at the lower end and annealed towards the higher end.
+ * - v: State maintaining the velocity of the parameters `X`, of same
+ * shape as `X`.
+ *
+ * Outputs:
+ * - X: Updated parameters `X`, of same shape as input `X`.
+ * - v: Updated velocity of the parameters `X`, of same shape as
+ * input `X`.
+ */
+ v_prev = v
+ v = mu*v - lr*dX # update velocity
+ X = X - mu*v_prev + (1+mu)*v # update position, including momentum
+}
+
+init = function(matrix[double] X)
+ return (matrix[double] v) {
+ /*
+ * Initialize the state for this optimizer.
+ *
+ * Note: This is just a convenience function, and state
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - X: Parameters to update, of shape (any, any).
+ *
+ * Outputs:
+ * - v: Initial velocity of the parameters `X`.
+ */
+ v = matrix(0, rows=nrow(X), cols=ncol(X))
+}
+
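The position update above encodes the "store the parameters after momentum" convention: if `X` holds `x + mu*v_prev`, then the underlying position advances by the new velocity, and re-adding the momentum offset gives `X - mu*v_prev + (1+mu)*v`. A numeric sketch of this equivalence (all values illustrative):

```
source("nn/optim/sgd_nesterov.dml") as sgd_nesterov

lr = 0.1
mu = 0.9
x = matrix(0, rows=1, cols=1)    # true parameter position
v = matrix(0.5, rows=1, cols=1)  # existing velocity
X = x + mu*v                     # what the layer stores: position after momentum
dX = matrix(1, rows=1, cols=1)   # gradient (conceptually taken at the lookahead)

[X_new, v_new] = sgd_nesterov::update(X, dX, lr, mu, v)

# Manually advance the true position, then re-apply the momentum offset
v_manual = mu*v - lr*dX
x_new = x + v_manual
print(as.scalar(X_new - (x_new + mu*v_manual)))  # 0: the two views coincide
```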
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/README.md
----------------------------------------------------------------------
diff --git a/scripts/nn/test/README.md b/scripts/nn/test/README.md
new file mode 100644
index 0000000..b714d50
--- /dev/null
+++ b/scripts/nn/test/README.md
@@ -0,0 +1,32 @@
+<!--
+{% comment %}
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements. See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+
+# SystemML-NN Tests
+
+#### This folder contains tests for the *SystemML-NN* (`nn`) deep learning library.
+
+---
+## Tests
+#### All layers are tested for correct derivatives ("gradient-checking"), and many layers also have correctness tests against simpler reference implementations.
+* `grad_check.dml` - Contains gradient-checks for all layers as individual DML functions.
+* `test.dml` - Contains correctness tests for several of the more complicated layers by checking against simple reference implementations, such as `conv_simple.dml`. All tests are formulated as individual DML functions.
+* `tests.dml` - A DML script that runs all of the tests in `grad_check.dml` and `test.dml`.
+
+## Execution
+* Run `spark-submit SystemML.jar -f nn/test/tests.dml` from the base of the project.
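Since each check is an ordinary DML function, individual checks can also be invoked from a small driver script; a hypothetical example (the file name and layer choice are illustrative):

```
# run_one_check.dml -- hypothetical driver script
source("nn/test/grad_check.dml") as grad_check
grad_check::affine()  # gradient-check only the affine layer
```

and then `spark-submit SystemML.jar -f run_one_check.dml` as above.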
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/conv2d_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/conv2d_simple.dml b/scripts/nn/test/conv2d_simple.dml
new file mode 100644
index 0000000..9f126d0
--- /dev/null
+++ b/scripts/nn/test/conv2d_simple.dml
@@ -0,0 +1,213 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Convolutional layer.
+ *
+ * This implementation is intended to be a simple, reference version.
+ */
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
+ int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] out, int Hout, int Wout) {
+ /*
+ * Computes the forward pass for a 2D spatial convolutional layer with
+ * F filters. The input data has N examples, each represented as a 3D
+ * volume unrolled into a single vector.
+ *
+ * This implementation is intended to be a simple, reference version.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * - padw: Padding for left and right sides.
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, F*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ */
+ N = nrow(X)
+ F = nrow(W)
+ Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+ Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+
+ # Create output volume
+ out = matrix(0, rows=N, cols=F*Hout*Wout)
+
+ # Convolution - Simple reference implementation
+ parfor (n in 1:N) { # all examples
+ Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+ # Pad image
+ Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # zeros
+ parfor (c in 1:C) {
+ Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice C reshaped
+ Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+ Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+ Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape
+ }
+ # Convolve image with filters
+ parfor (f in 1:F, check=0) { # all filters
+ parfor (hout in 1:Hout, check=0) { # all output rows
+ h0 = (hout-1)*strideh + 1
+ parfor (wout in 1:Wout, check=0) { # all output columns
+ w0 = (wout-1)*stridew + 1
+ # Create a patch of the input example corresponding spatially to the filter sizes
+ Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf) # zeros
+ parfor (c in 1:C, check=0) {
+ Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw) # reshape
+ Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf], rows=1,
+ cols=Hf*Wf) # reshape
+ }
+ out[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout] =
+ W[f,] %*% matrix(Xn_padded_patch, rows=C*Hf*Wf, cols=1) + b[f,]
+ }
+ }
+ }
+ }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout,
+ matrix[double] X, matrix[double] W, matrix[double] b,
+ int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
+ /*
+ * Computes the backward pass for a 2D spatial convolutional layer
+ * with F filters.
+ *
+ * This implementation is intended to be a simple, reference version.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of
+ * shape (N, F*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * - padw: Padding for left and right sides.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+ * - db: Gradient wrt `b`, of shape (F, 1).
+ */
+ N = nrow(X)
+ F = nrow(W)
+
+ # Create gradient volumes
+ dX = matrix(0, rows=N, cols=C*Hin*Win)
+ dW = matrix(0, rows=F, cols=C*Hf*Wf)
+ db = matrix(0, rows=F, cols=1)
+
+ # Partial derivatives for convolution - Simple reference implementation
+ for (n in 1:N) { # all examples
+ Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+ # Pad image
+ Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # zeros
+ parfor (c in 1:C) {
+ Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice C reshaped
+ Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+ Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+ Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape
+ }
+ dXn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
+ for (f in 1:F) { # all filters
+ for (hout in 1:Hout) { # all output rows
+ h0 = (hout-1) * strideh + 1
+ for (wout in 1:Wout) { # all output columns
+ w0 = (wout-1) * stridew + 1
+ # Create a patch of the input example corresponding spatially to the filter sizes
+ Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf) # zeros
+ dXn_padded_patch = matrix(W[f,] * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout],
+ rows=C, cols=Hf*Wf) # reshape
+ for (c in 1:C) {
+ Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw) # reshape
+ Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf],
+ rows=1, cols=Hf*Wf) # reshape
+ dXn_padded_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
+ dXn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf] = matrix(dXn_padded_patch[c,],
+ rows=Hf, cols=Wf) # reshape
+ dXn_padded[c,] = dXn_padded[c,] + matrix(dXn_padded_slice,
+ rows=1, cols=(Hin+2*padh)*(Win+2*padw))
+ }
+ dW[f,] = dW[f,]
+ + matrix(Xn_padded_patch, rows=1, cols=C*Hf*Wf)
+ * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
+ db[f,] = db[f,] + dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
+ }
+ }
+ }
+ # Unpad derivs on input
+ dXn = matrix(0, rows=C, cols=Hin*Win)
+ parfor (c in 1:C, check=0) {
+ dXn_padded_slice = matrix(dXn_padded[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
+ dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
+ dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
+ }
+ dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
+ }
+}
+
+init = function(int F, int C, int Hf, int Wf)
+ return (matrix[double] W, matrix[double] b) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * We use the heuristic by He et al., which limits the magnification
+ * of inputs/gradients during forward/backward passes by scaling
+ * unit-Gaussian weights by a factor of sqrt(2/n), under the
+ * assumption of relu neurons.
+ * - http://arxiv.org/abs/1502.01852
+ *
+ * Inputs:
+ * - F: Number of filters.
+ * - C: Number of input channels (dimensionality of depth).
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ *
+ * Outputs:
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
+ */
+ W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
+ b = matrix(0, rows=F, cols=1)
+}
+
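The `Hout`/`Wout` computation in `forward` above is the standard convolution output-size arithmetic. A worked instance with the configuration used by the gradient checks below (Hin = 5, Hf = 3, pad = 1, stride = 1):

```
Hin = 5
Hf = 3
padh = 1
strideh = 1
Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
print(Hout)  # 5: a 3x3 filter with stride 1 and padding 1 preserves spatial size
```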
[04/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn`
library to `scripts/nn`
Posted by du...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/grad_check.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/grad_check.dml b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
deleted file mode 100644
index f3bc9a7..0000000
--- a/scripts/staging/SystemML-NN/nn/test/grad_check.dml
+++ /dev/null
@@ -1,1769 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Gradient checks for various architectures.
- */
-source("nn/layers/affine.dml") as affine
-source("nn/layers/batch_norm1d.dml") as batch_norm1d
-source("nn/layers/batch_norm2d.dml") as batch_norm2d
-source("nn/layers/conv2d.dml") as conv2d
-source("nn/layers/conv2d_builtin.dml") as conv2d_builtin
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/dropout.dml") as dropout
-source("nn/layers/l1_loss.dml") as l1_loss
-source("nn/layers/l1_reg.dml") as l1_reg
-source("nn/layers/l2_loss.dml") as l2_loss
-source("nn/layers/l2_reg.dml") as l2_reg
-source("nn/layers/log_loss.dml") as log_loss
-source("nn/layers/lstm.dml") as lstm
-source("nn/layers/max_pool2d.dml") as max_pool2d
-source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
-source("nn/layers/relu.dml") as relu
-source("nn/layers/rnn.dml") as rnn
-source("nn/layers/scale_shift1d.dml") as scale_shift1d
-source("nn/layers/scale_shift2d.dml") as scale_shift2d
-source("nn/layers/sigmoid.dml") as sigmoid
-source("nn/layers/softmax.dml") as softmax
-source("nn/layers/tanh.dml") as tanh
-source("nn/test/conv2d_simple.dml") as conv2d_simple
-source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
-source("nn/test/util.dml") as test_util
-
-affine = function() {
- /*
- * Gradient check for the affine layer.
- */
- print("Grad checking the affine layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- D = 100 # num features
- M = 10 # num neurons
- X = rand(rows=N, cols=D)
- y = rand(rows=N, cols=M)
- [W, b] = affine::init(D, M)
-
- # Compute analytical gradients of loss wrt parameters
- out = affine::forward(X, W, b)
- dout = l2_loss::backward(out, y)
- [dX, dW, db] = affine::backward(dout, X, W, b)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- outmh = affine::forward(X, W, b)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- outph = affine::forward(X, W, b)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking W.")
- for (i in 1:nrow(W)) {
- for (j in 1:ncol(W)) {
- # Compute numerical derivative
- old = as.scalar(W[i,j])
- W[i,j] = old - h
- outmh = affine::forward(X, W, b)
- lossmh = l2_loss::forward(outmh, y)
- W[i,j] = old + h
- outph = affine::forward(X, W, b)
- lossph = l2_loss::forward(outph, y)
- W[i,j] = old # reset
- dW_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking b.")
- for (i in 1:nrow(b)) {
- for (j in 1:ncol(b)) {
- # Compute numerical derivative
- old = as.scalar(b[i,j])
- b[i,j] = old - h
- outmh = affine::forward(X, W, b)
- lossmh = l2_loss::forward(outmh, y)
- b[i,j] = old + h
- outph = affine::forward(X, W, b)
- lossph = l2_loss::forward(outph, y)
- b[i,j] = old # reset
- db_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
- }
- }
-}
-
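Every check below follows the same recipe: perturb one entry by +/-h, recompute the loss, and compare the central difference (lossph-lossmh)/(2*h), whose error is O(h^2), against the analytical gradient via test_util::check_rel_grad_error. A standalone scalar sketch of the idea (the function x^2 is illustrative):

```
# Central-difference check of d/dx (x^2) at x = 3
h = 1e-5
x = 3.0
dx_analytical = 2*x                 # known derivative
lossph = (x+h)^2
lossmh = (x-h)^2
dx_num = (lossph - lossmh) / (2*h)  # central difference
rel_error = abs(dx_analytical - dx_num) / max(abs(dx_analytical), abs(dx_num))
print(rel_error)  # ~1e-11; values far above ~1e-4 would suggest a gradient bug
```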
-batch_norm1d = function() {
- /*
- * Gradient check for the 1D batch normalization layer.
- */
- print("Grad checking the 1D batch normalization layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- D = 100 # num features
- mu = 0.9 # momentum
- eps = 1e-5 # epsilon
- X = rand(rows=N, cols=D)
- y = rand(rows=N, cols=D)
- gamma = rand(rows=1, cols=D)
- beta = rand(rows=1, cols=D)
- ema_mean = rand(rows=1, cols=D)
- ema_var = rand(rows=1, cols=D)
- #[dummy, dummy, ema_mean, ema_var] = batch_norm1d::init(D)
-
- # Check training & testing modes
- for (i in 1:2) {
- if (i == 1)
- mode = 'train'
- else
- mode = 'test'
- print(" - Grad checking the '"+mode+"' mode.")
-
- # Compute analytical gradients of loss wrt parameters
- [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
- dout = l2_loss::backward(out, y)
- [dX, dgamma, dbeta] = batch_norm1d::backward(dout, out, ema_mean_upd, ema_var_upd,
- cache_mean, cache_var, cache_norm,
- X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking gamma.")
- for (i in 1:nrow(gamma)) {
- for (j in 1:ncol(gamma)) {
- # Compute numerical derivative
- old = as.scalar(gamma[i,j])
- gamma[i,j] = old - h
- [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
- lossmh = l2_loss::forward(outmh, y)
- gamma[i,j] = old + h
- [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
- lossph = l2_loss::forward(outph, y)
- gamma[i,j] = old # reset
- dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
- lossph, lossmh)
- }
- }
-
- print(" - Grad checking beta.")
- for (i in 1:nrow(beta)) {
- for (j in 1:ncol(beta)) {
- # Compute numerical derivative
- old = as.scalar(beta[i,j])
- beta[i,j] = old - h
- [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
- lossmh = l2_loss::forward(outmh, y)
- beta[i,j] = old + h
- [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
- lossph = l2_loss::forward(outph, y)
- beta[i,j] = old # reset
- dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
- lossph, lossmh)
- }
- }
- }
-}
-
-batch_norm2d = function() {
- /*
- * Gradient check for the 2D (spatial) batch normalization layer.
- */
- print("Grad checking the 2D (spatial) batch normalization layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- C = 2 # num channels
- Hin = 5 # input height
- Win = 5 # input width
- mu = 0.9 # momentum
- eps = 1e-5 # epsilon
- X = rand(rows=N, cols=C*Hin*Win)
- y = rand(rows=N, cols=C*Hin*Win)
- gamma = rand(rows=C, cols=1)
- beta = rand(rows=C, cols=1)
- ema_mean = rand(rows=C, cols=1)
- ema_var = rand(rows=C, cols=1)
- #[dummy, dummy, ema_mean, ema_var] = batch_norm2d::init(C)
-
- # Check training & testing modes
- for (i in 1:2) {
- if (i == 1)
- mode = 'train'
- else
- mode = 'test'
- print(" - Grad checking the '"+mode+"' mode.")
-
- # Compute analytical gradients of loss wrt parameters
- [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- dout = l2_loss::backward(out, y)
- [dX, dgamma, dbeta] = batch_norm2d::backward(dout, out, ema_mean_upd, ema_var_upd,
- cache_mean, cache_var, cache_norm,
- X, gamma, beta, C, Hin, Win, mode,
- ema_mean, ema_var, mu, eps)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking gamma.")
- for (i in 1:nrow(gamma)) {
- for (j in 1:ncol(gamma)) {
- # Compute numerical derivative
- old = as.scalar(gamma[i,j])
- gamma[i,j] = old - h
- [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossmh = l2_loss::forward(outmh, y)
- gamma[i,j] = old + h
- [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossph = l2_loss::forward(outph, y)
- gamma[i,j] = old # reset
- dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
- lossph, lossmh)
- }
- }
-
- print(" - Grad checking beta.")
- for (i in 1:nrow(beta)) {
- for (j in 1:ncol(beta)) {
- # Compute numerical derivative
- old = as.scalar(beta[i,j])
- beta[i,j] = old - h
- [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossmh = l2_loss::forward(outmh, y)
- beta[i,j] = old + h
- [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossph = l2_loss::forward(outph, y)
- beta[i,j] = old # reset
- dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
- lossph, lossmh)
- }
- }
- }
-}
-
-conv2d = function() {
- /*
- * Gradient check for the 2D convolutional layer using `im2col`.
- */
- print("Grad checking the `im2col` 2D convolutional layer with L2 loss.")
-
- # Generate data
- N = 2 # num examples
- C = 2 # num channels
- Hin = 5 # input height
- Win = 5 # input width
- F = 2 # num filters
- Hf = 3 # filter height
- Wf = 3 # filter width
- stride = 1
- pad = 1
- X = rand(rows=N, cols=C*Hin*Win)
- y = rand(rows=N, cols=F*Hin*Win)
-
- # Create layers
- [W, b] = conv2d::init(F, C, Hf, Wf)
-
- # Compute analytical gradients of loss wrt parameters
- [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- dout = l2_loss::backward(out, y)
- [dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking W.")
- for (i in 1:nrow(W)) {
- for (j in 1:ncol(W)) {
- # Compute numerical derivative
- old = as.scalar(W[i,j])
- W[i,j] = old - h
- [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- W[i,j] = old + h
- [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- lossph = l2_loss::forward(outph, y)
- W[i,j] = old # reset
- dW_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking b.")
- for (i in 1:nrow(b)) {
- for (j in 1:ncol(b)) {
- # Compute numerical derivative
- old = as.scalar(b[i,j])
- b[i,j] = old - h
- [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- b[i,j] = old + h
- [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- lossph = l2_loss::forward(outph, y)
- b[i,j] = old # reset
- db_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
- }
- }
-}
-
-conv2d_builtin = function() {
- /*
- * Gradient check for the 2D convolutional layer using built-in
- * functions.
- */
- print("Grad checking the built-in 2D convolutional layer with L2 loss.")
-
- # Generate data
- N = 2 # num examples
- C = 2 # num channels
- Hin = 5 # input height
- Win = 5 # input width
- F = 2 # num filters
- Hf = 3 # filter height
- Wf = 3 # filter width
- stride = 1
- pad = 1
- X = rand(rows=N, cols=C*Hin*Win)
- y = rand(rows=N, cols=F*Hin*Win)
-
- # Create layers
- [W, b] = conv2d_builtin::init(F, C, Hf, Wf)
-
- # Compute analytical gradients of loss wrt parameters
- [out, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- dout = l2_loss::backward(out, y)
- [dX, dW, db] = conv2d_builtin::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
- stride, stride, pad, pad)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking W.")
- for (i in 1:nrow(W)) {
- for (j in 1:ncol(W)) {
- # Compute numerical derivative
- old = as.scalar(W[i,j])
- W[i,j] = old - h
- [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- W[i,j] = old + h
- [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossph = l2_loss::forward(outph, y)
- W[i,j] = old # reset
- dW_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking b.")
- for (i in 1:nrow(b)) {
- for (j in 1:ncol(b)) {
- # Compute numerical derivative
- old = as.scalar(b[i,j])
- b[i,j] = old - h
- [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- b[i,j] = old + h
- [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossph = l2_loss::forward(outph, y)
- b[i,j] = old # reset
- db_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
- }
- }
-}
-
-conv2d_simple = function() {
- /*
- * Gradient check for the simple reference 2D convolutional layer.
- */
- print("Grad checking the simple reference 2D convolutional layer with L2 loss.")
-
- # Generate data
- N = 2 # num examples
- C = 2 # num channels
- Hin = 5 # input height
- Win = 5 # input width
- F = 2 # num filters
- Hf = 3 # filter height
- Wf = 3 # filter width
- stride = 1
- pad = 1
- X = rand(rows=N, cols=C*Hin*Win)
- y = rand(rows=N, cols=F*Hin*Win)
-
- # Create layers
- [W, b] = conv2d_simple::init(F, C, Hf, Wf)
-
- # Compute analytical gradients of loss wrt parameters
- [out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- dout = l2_loss::backward(out, y)
- [dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
- stride, stride, pad, pad)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking W.")
- for (i in 1:nrow(W)) {
- for (j in 1:ncol(W)) {
- # Compute numerical derivative
- old = as.scalar(W[i,j])
- W[i,j] = old - h
- [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- W[i,j] = old + h
- [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossph = l2_loss::forward(outph, y)
- W[i,j] = old # reset
- dW_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking b.")
- for (i in 1:nrow(b)) {
- for (j in 1:ncol(b)) {
- # Compute numerical derivative
- old = as.scalar(b[i,j])
- b[i,j] = old - h
- [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- b[i,j] = old + h
- [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossph = l2_loss::forward(outph, y)
- b[i,j] = old # reset
- db_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
- }
- }
-}
-
-cross_entropy_loss = function() {
- /*
- * Gradient check for the cross-entropy loss function.
- */
- print("Grad checking the cross-entropy loss function.")
-
- # Generate data
- N = 3 # num examples
- K = 10 # num targets
- pred = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
- pred = pred / rowSums(pred) # normalized probs
- y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
- y = y / rowSums(y) # normalized probs
-
- # Compute analytical gradient
- dpred = cross_entropy_loss::backward(pred, y)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(pred)) {
- for (j in 1:ncol(pred)) {
- # Compute numerical derivative
- old = as.scalar(pred[i,j])
- pred[i,j] = old - h
- lossmh = cross_entropy_loss::forward(pred, y)
- pred[i,j] = old + h
- lossph = cross_entropy_loss::forward(pred, y)
- pred[i,j] = old # reset pred[i,j]
- dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
- }
- }
-}
-
-dropout = function() {
- /*
- * Gradient check for the (inverted) dropout layer.
- */
- print("Grad checking the (inverted) dropout layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- M = 100 # num neurons
- p = 0.5 # probability of dropping neuron output
- seed = as.integer(floor(as.scalar(rand(rows=1, cols=1, min=1, max=100000)))) # random seed
- X = rand(rows=N, cols=M)
- y = rand(rows=N, cols=M)
-
- # Compute analytical gradients of loss wrt parameters
- [out, mask] = dropout::forward(X, p, seed)
- dout = l2_loss::backward(out, y)
- dX = dropout::backward(dout, X, p, mask)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, mask] = dropout::forward(X, p, seed)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, mask] = dropout::forward(X, p, seed)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-}
-
-l1_loss = function() {
- /*
- * Gradient check for the L1 loss function.
- */
- print("Grad checking the L1 loss function.")
-
- # Generate data
- N = 3 # num examples
- D = 2 # num targets
- pred = rand(rows=N, cols=D)
- y = rand(rows=N, cols=D)
-
- # Compute analytical gradient
- dpred = l1_loss::backward(pred, y)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(pred)) {
- for (j in 1:ncol(pred)) {
- # Compute numerical derivative
- old = as.scalar(pred[i,j])
- pred[i,j] = old - h
- lossmh = l1_loss::forward(pred, y)
- pred[i,j] = old + h
- lossph = l1_loss::forward(pred, y)
- pred[i,j] = old # reset pred[i,j]
- dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
- }
- }
-}
-
-l1_reg = function() {
- /*
- * Gradient check for the L1 regularization function.
- */
- print("Grad checking the L1 regularization function.")
-
- # Generate data
- D = 5 # num features
- M = 3 # num neurons
- lambda = 0.01
- W = rand(rows=D, cols=M)
-
- # Compute analytical gradient
- dW = l1_reg::backward(W, lambda)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(W)) {
- for (j in 1:ncol(W)) {
- # Compute numerical derivative
- old = as.scalar(W[i,j])
- W[i,j] = old - h
- reg_lossmh = l1_reg::forward(W, lambda)
- W[i,j] = old + h
- reg_lossph = l1_reg::forward(W, lambda)
- W[i,j] = old # reset W[i,j]
- dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
- reg_lossph, reg_lossmh)
- }
- }
-}
-
-l2_loss = function() {
- /*
- * Gradient check for the L2 loss function.
- */
- print("Grad checking the L2 loss function.")
-
- # Generate data
- N = 3 # num examples
- D = 2 # num targets
- pred = rand(rows=N, cols=D)
- y = rand(rows=N, cols=D)
-
- # Compute analytical gradient
- dpred = l2_loss::backward(pred, y)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(pred)) {
- for (j in 1:ncol(pred)) {
- # Compute numerical derivative
- old = as.scalar(pred[i,j])
- pred[i,j] = old - h
- lossmh = l2_loss::forward(pred, y)
- pred[i,j] = old + h
- lossph = l2_loss::forward(pred, y)
- pred[i,j] = old # reset pred[i,j]
- dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
- }
- }
-}
-
-l2_reg = function() {
- /*
- * Gradient check for the L2 regularization function.
- */
- print("Grad checking the L2 regularization function.")
-
- # Generate data
- D = 5 # num features
- M = 3 # num neurons
- lambda = 0.01
- W = rand(rows=D, cols=M)
-
- # Compute analytical gradient
- dW = l2_reg::backward(W, lambda)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(W)) {
- for (j in 1:ncol(W)) {
- # Compute numerical derivative
- old = as.scalar(W[i,j])
- W[i,j] = old - h
- reg_lossmh = l2_reg::forward(W, lambda)
- W[i,j] = old + h
- reg_lossph = l2_reg::forward(W, lambda)
- W[i,j] = old # reset W[i,j]
- dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
- reg_lossph, reg_lossmh)
- }
- }
-}
-
-log_loss = function() {
- /*
- * Gradient check for the log loss function.
- */
- print("Grad checking the log loss function.")
-
- # Generate data
- N = 20 # num examples
- D = 1 # num targets
- pred = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
- y = round(rand(rows=N, cols=D, min=0, max=1, pdf="uniform"))
-
- # Compute analytical gradient
- dpred = log_loss::backward(pred, y)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(pred)) {
- for (j in 1:ncol(pred)) {
- # Compute numerical derivative
- old = as.scalar(pred[i,j])
- pred[i,j] = old - h
- lossmh = log_loss::forward(pred, y)
- pred[i,j] = old + h
- lossph = log_loss::forward(pred, y)
- pred[i,j] = old # reset pred[i,j]
- dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
- }
- }
-}
-
-lstm = function() {
- /*
- * Gradient check for the LSTM layer.
- */
- print("Grad checking the LSTM layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- D = 10 # num features
- T = 15 # num timesteps (sequence length)
- M = 5 # num neurons
- return_seq = TRUE
- X = rand(rows=N, cols=T*D)
- y = rand(rows=N, cols=T*M)
- yc = rand(rows=N, cols=M)
- out0 = rand(rows=N, cols=M)
- c0 = rand(rows=N, cols=M)
- [W, b, dummy, dummy2] = lstm::init(N, D, M)
-
- # Compute analytical gradients of loss wrt parameters
- [out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- dout = l2_loss::backward(out, y)
- dc = l2_loss::backward(c, yc)
- [dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, return_seq, out0, c0,
- cache_out, cache_c, cache_ifog)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outmh = l2_loss::forward(outmh, y)
- loss_cmh = l2_loss::forward(cmh, yc)
- lossmh = loss_outmh + loss_cmh
- X[i,j] = old + h
- [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outph = l2_loss::forward(outph, y)
- loss_cph = l2_loss::forward(cph, yc)
- lossph = loss_outph + loss_cph
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking W.")
- for (i in 1:nrow(W)) {
- for (j in 1:ncol(W)) {
- # Compute numerical derivative
- old = as.scalar(W[i,j])
- W[i,j] = old - h
- [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outmh = l2_loss::forward(outmh, y)
- loss_cmh = l2_loss::forward(cmh, yc)
- lossmh = loss_outmh + loss_cmh
- W[i,j] = old + h
- [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outph = l2_loss::forward(outph, y)
- loss_cph = l2_loss::forward(cph, yc)
- lossph = loss_outph + loss_cph
- W[i,j] = old # reset
- dW_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking b.")
- for (i in 1:nrow(b)) {
- for (j in 1:ncol(b)) {
- # Compute numerical derivative
- old = as.scalar(b[i,j])
- b[i,j] = old - h
- [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outmh = l2_loss::forward(outmh, y)
- loss_cmh = l2_loss::forward(cmh, yc)
- lossmh = loss_outmh + loss_cmh
- b[i,j] = old + h
- [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outph = l2_loss::forward(outph, y)
- loss_cph = l2_loss::forward(cph, yc)
- lossph = loss_outph + loss_cph
- b[i,j] = old # reset
- db_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking out0.")
- for (i in 1:nrow(out0)) {
- for (j in 1:ncol(out0)) {
- # Compute numerical derivative
- old = as.scalar(out0[i,j])
- out0[i,j] = old - h
- [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outmh = l2_loss::forward(outmh, y)
- loss_cmh = l2_loss::forward(cmh, yc)
- lossmh = loss_outmh + loss_cmh
- out0[i,j] = old + h
- [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outph = l2_loss::forward(outph, y)
- loss_cph = l2_loss::forward(cph, yc)
- lossph = loss_outph + loss_cph
- out0[i,j] = old # reset
- dout0_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking c0.")
- for (i in 1:nrow(c0)) {
- for (j in 1:ncol(c0)) {
- # Compute numerical derivative
- old = as.scalar(c0[i,j])
- c0[i,j] = old - h
- [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outmh = l2_loss::forward(outmh, y)
- loss_cmh = l2_loss::forward(cmh, yc)
- lossmh = loss_outmh + loss_cmh
- c0[i,j] = old + h
- [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
- loss_outph = l2_loss::forward(outph, y)
- loss_cph = l2_loss::forward(cph, yc)
- lossph = loss_outph + loss_cph
- c0[i,j] = old # reset
- dc0_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh)
- }
- }
-}
-
-max_pool2d = function() {
- /*
- * Gradient check for the 2D max pooling layer.
- */
- print("Grad checking the 2D max pooling layer with L2 loss.")
-
- # Generate data
- N = 2 # num examples
- C = 2 # num channels
- Hin = 4 # input height
- Win = 4 # input width
- Hf = 2 # pool filter height
- Wf = 2 # pool filter width
- stride = 2
- X = rand(rows=N, cols=C*Hin*Win)
-
- for (pad in 0:1) {
- print(" - Grad checking w/ pad="+pad+".")
- Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
- Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
- y = rand(rows=N, cols=C*Hout*Wout)
-
- # Compute analytical gradients of loss wrt parameters
- [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- dout = l2_loss::backward(out, y)
- dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
- }
-}
-
-max_pool2d_builtin = function() {
- /*
- * Gradient check for the 2D max pooling layer.
- */
- print("Grad checking the built-in 2D max pooling layer with L2 loss.")
-
- # Generate data
- N = 2 # num examples
- C = 2 # num channels
- Hin = 4 # input height
- Win = 4 # input width
- Hf = 2 # pool filter height
- Wf = 2 # pool filter width
- stride = 2
- X = rand(rows=N, cols=C*Hin*Win)
-
- for (pad in 0:1) {
- print(" - Grad checking w/ pad="+pad+".")
- Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
- Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
- y = rand(rows=N, cols=C*Hout*Wout)
-
- # Compute analytical gradients of loss wrt parameters
- [out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- dout = l2_loss::backward(out, y)
- dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
- }
-}
-
-max_pool2d_simple = function() {
- /*
- * Gradient check for the simple reference 2D max pooling layer.
- */
- print("Grad checking the simple reference 2D max pooling layer with L2 loss.")
-
- # Generate data
- N = 2 # num examples
- C = 2 # num channels
- Hin = 4 # input height
- Win = 4 # input width
- Hf = 2 # pool filter height
- Wf = 2 # pool filter width
- stride = 2
- X = rand(rows=N, cols=C*Hin*Win)
-
- for (pad in 0:1) {
- print(" - Grad checking w/ pad="+pad+".")
- Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
- Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
- y = rand(rows=N, cols=C*Hout*Wout)
-
- # Compute analytical gradients of loss wrt parameters
- [out, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
- dout = l2_loss::backward(out, y)
- dX = max_pool2d_simple::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
- }
-}
-
-relu = function() {
- /*
- * Gradient check for the ReLU nonlinearity layer.
- *
- * NOTE: This could result in a false-negative in which the test
- * fails due to a kink being crossed in the nonlinearity. This
- * occurs when the tests, f(x-h) and f(x+h), end up on opposite
- * sides of the zero threshold of max(0, x). For now, just run
- * the tests again. In the future, we can explicitly check for
- * this and rerun the test automatically.
- */
- print("Grad checking the ReLU nonlinearity layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- M = 10 # num neurons
- X = rand(rows=N, cols=M, min=-5, max=5)
- y = rand(rows=N, cols=M)
-
- # Compute analytical gradients of loss wrt parameters
- out = relu::forward(X)
- dout = l2_loss::backward(out, y)
- dX = relu::backward(dout, X)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- outmh = relu::forward(X)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- outph = relu::forward(X)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-}
-
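To make the kink caveat in the note above concrete: if an entry of X lands within h of zero, f(x-h) and f(x+h) straddle the kink and the centered difference is biased even though the analytical gradient is correct. A toy illustration (values chosen purely for exposition):

x = 2e-6                                        # entry within h of the kink at 0
h = 1e-5
num = (max(0, x + h) - max(0, x - h)) / (2*h)   # = 0.6
# The analytical derivative of max(0, x) at x = 2e-6 is 1.0, so the
# relative error is large despite the analytical gradient being correct.
print("biased numerical estimate: " + num)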
-rnn = function() {
- /*
- * Gradient check for the simple RNN layer.
- */
- print("Grad checking the simple RNN layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- D = 10 # num features
- T = 15 # num timesteps (sequence length)
- M = 5 # num neurons
- return_seq = TRUE
- X = rand(rows=N, cols=T*D)
- y = rand(rows=N, cols=T*M)
- out0 = rand(rows=N, cols=M)
- [W, b, dummy] = rnn::init(N, D, M)
-
- # Compute analytical gradients of loss wrt parameters
- [out, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- dout = l2_loss::backward(out, y)
- [dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, return_seq, out0, cache_out)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking W.")
- for (i in 1:nrow(W)) {
- for (j in 1:ncol(W)) {
- # Compute numerical derivative
- old = as.scalar(W[i,j])
- W[i,j] = old - h
- [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- lossmh = l2_loss::forward(outmh, y)
- W[i,j] = old + h
- [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- lossph = l2_loss::forward(outph, y)
- W[i,j] = old # reset
- dW_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking b.")
- for (i in 1:nrow(b)) {
- for (j in 1:ncol(b)) {
- # Compute numerical derivative
- old = as.scalar(b[i,j])
- b[i,j] = old - h
- [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- lossmh = l2_loss::forward(outmh, y)
- b[i,j] = old + h
- [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- lossph = l2_loss::forward(outph, y)
- b[i,j] = old # reset
- db_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking out0.")
- for (i in 1:nrow(out0)) {
- for (j in 1:ncol(out0)) {
- # Compute numerical derivative
- old = as.scalar(out0[i,j])
- out0[i,j] = old - h
- [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- lossmh = l2_loss::forward(outmh, y)
- out0[i,j] = old + h
- [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
- lossph = l2_loss::forward(outph, y)
- out0[i,j] = old # reset
- dout0_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
- }
- }
-}
-
-scale_shift1d = function() {
- /*
- * Gradient check for the 1D scale & shift layer.
- */
- print("Grad checking the 1D scale & shift layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- D = 100 # num features
- X = rand(rows=N, cols=D)
- y = rand(rows=N, cols=D)
- [gamma, beta] = scale_shift1d::init(D)
-
- # Compute analytical gradients of loss wrt parameters
- out = scale_shift1d::forward(X, gamma, beta)
- dout = l2_loss::backward(out, y)
- [dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- outmh = scale_shift1d::forward(X, gamma, beta)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- outph = scale_shift1d::forward(X, gamma, beta)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking gamma.")
- for (i in 1:nrow(gamma)) {
- for (j in 1:ncol(gamma)) {
- # Compute numerical derivative
- old = as.scalar(gamma[i,j])
- gamma[i,j] = old - h
- outmh = scale_shift1d::forward(X, gamma, beta)
- lossmh = l2_loss::forward(outmh, y)
- gamma[i,j] = old + h
- outph = scale_shift1d::forward(X, gamma, beta)
- lossph = l2_loss::forward(outph, y)
- gamma[i,j] = old # reset
- dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
- lossph, lossmh)
- }
- }
-
- print(" - Grad checking beta.")
- for (i in 1:nrow(beta)) {
- for (j in 1:ncol(beta)) {
- # Compute numerical derivative
- old = as.scalar(beta[i,j])
- beta[i,j] = old - h
- outmh = scale_shift1d::forward(X, gamma, beta)
- lossmh = l2_loss::forward(outmh, y)
- beta[i,j] = old + h
- outph = scale_shift1d::forward(X, gamma, beta)
- lossph = l2_loss::forward(outph, y)
- beta[i,j] = old # reset
- dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
- lossph, lossmh)
- }
- }
-}
-
-scale_shift2d = function() {
- /*
- * Gradient check for the 2D scale & shift layer.
- */
- print("Grad checking the 2D scale & shift layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- C = 2 # num channels
- Hin = 5 # input height
- Win = 5 # input width
- X = rand(rows=N, cols=C*Hin*Win)
- y = rand(rows=N, cols=C*Hin*Win)
- [gamma, beta] = scale_shift2d::init(C)
-
- # Compute analytical gradients of loss wrt parameters
- out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
- dout = l2_loss::backward(out, y)
- [dX, dgamma, dbeta] = scale_shift2d::backward(dout, out, X, gamma, beta, C, Hin, Win)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking gamma.")
- for (i in 1:nrow(gamma)) {
- for (j in 1:ncol(gamma)) {
- # Compute numerical derivative
- old = as.scalar(gamma[i,j])
- gamma[i,j] = old - h
- outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
- lossmh = l2_loss::forward(outmh, y)
- gamma[i,j] = old + h
- outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
- lossph = l2_loss::forward(outph, y)
- gamma[i,j] = old # reset
- dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
- lossph, lossmh)
- }
- }
-
- print(" - Grad checking beta.")
- for (i in 1:nrow(beta)) {
- for (j in 1:ncol(beta)) {
- # Compute numerical derivative
- old = as.scalar(beta[i,j])
- beta[i,j] = old - h
- outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
- lossmh = l2_loss::forward(outmh, y)
- beta[i,j] = old + h
- outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
- lossph = l2_loss::forward(outph, y)
- beta[i,j] = old # reset
- dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
- lossph, lossmh)
- }
- }
-}
-
-sigmoid = function() {
- /*
- * Gradient check for the sigmoid nonlinearity layer.
- */
- print("Grad checking the sigmoid nonlinearity layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- M = 10 # num neurons
- X = rand(rows=N, cols=M)
- y = rand(rows=N, cols=M)
-
- # Compute analytical gradients of loss wrt parameters
- out = sigmoid::forward(X)
- dout = l2_loss::backward(out, y)
- dX = sigmoid::backward(dout, X)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- outmh = sigmoid::forward(X)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- outph = sigmoid::forward(X)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-}
-
-softmax = function() {
- /*
- * Gradient check for the softmax layer.
- */
- print("Grad checking the softmax layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- D = 10 # num classes
- X = rand(rows=N, cols=D)
- y = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
- y = y / rowSums(y)
-
- # Compute analytical gradients of loss wrt parameters
- out = softmax::forward(X)
- dout = l2_loss::backward(out, y)
- dX = softmax::backward(dout, X)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- outmh = softmax::forward(X)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- outph = softmax::forward(X)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-}
-
-tanh = function() {
- /*
- * Gradient check for the hyperbolic tangent (tanh) nonlinearity
- * layer.
- */
- print("Grad checking the tanh nonlinearity layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- M = 10 # num neurons
- X = rand(rows=N, cols=M)
- y = rand(rows=N, cols=M)
-
- # Compute analytical gradients of loss wrt parameters
- out = tanh::forward(X)
- dout = l2_loss::backward(out, y)
- dX = tanh::backward(dout, X)
-
- # Grad check
- h = 1e-5
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- outmh = tanh::forward(X)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- outph = tanh::forward(X)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-}
-
-two_layer_affine_l2_net = function() {
- /*
- * Gradient check for a two-layer, fully-connected, feed-forward
- * network with ReLU nonlinearity and L2 loss.
- *
- * NOTE: This could result in a false-negative in which the test
- * fails due to a kink being crossed in the ReLU nonlinearity. This
- * occurs when the tests, f(x-h) and f(x+h), end up on opposite
- * sides of the zero threshold of max(0, x). For now, just run
- * the tests again. In the future, we can explicitly check for
- * this and rerun the test automatically.
- */
- print("Grad checking a two-layer, fully-connected, feed-forward network with a ReLU " +
- "nonlinearity, and an L2 loss function.")
-
- # Generate input data
- N = 1000 # num examples
- D = 100 # num features
- yD = 5 # num targets
- X = rand(rows=N, cols=D, pdf="normal")
- y = rand(rows=N, cols=yD)
-
- # Create 2-layer, fully-connected network
- M = 10 # number of hidden neurons
- [W1, b1] = affine::init(D, M)
- [W2, b2] = affine::init(M, yD)
-
- # Optimize for short "burn-in" time to move to characteristic
- # mode of operation and unmask any real issues.
- print(" - Burn-in:")
- lr = 0.0001
- decay = 0.99
- for(i in 1:5) {
- # Compute forward and backward passes of net
- [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
- print(" - L2 loss: " + loss)
-
- # Optimize with basic SGD
- W1 = W1 - lr * dW1
- b1 = b1 - lr * db1
- W2 = W2 - lr * dW2
- b2 = b2 - lr * db2
- lr = lr * decay
- }
-
- # Compute analytical gradients
- [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:2) { # check only a couple of rows of X for speed
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old_x = as.scalar(X[i,j])
- X[i,j] = old_x - h
- [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- X[i,j] = old_x + h
- [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- X[i,j] = old_x # reset X[i,j]
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking W1.")
- for (i in 1:nrow(W1)) {
- for (j in 1:ncol(W1)) {
- # Compute numerical derivative
- old_w = as.scalar(W1[i,j])
- W1[i,j] = old_w - h
- [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- W1[i,j] = old_w + h
- [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- W1[i,j] = old_w # reset W1[i,j]
- dWij_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW1[i,j]), dWij_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking W2.")
- for (i in 1:nrow(W2)) {
- for (j in 1:ncol(W2)) {
- # Compute numerical derivative
- old_w = as.scalar(W2[i,j])
- W2[i,j] = old_w - h
- [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- W2[i,j] = old_w + h
- [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- W2[i,j] = old_w # reset W2[i,j]
- dWij_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dW2[i,j]), dWij_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking b1.")
- for (i in 1:nrow(b1)) {
- for (j in 1:ncol(b1)) {
- # Compute numerical derivative
- old_b = as.scalar(b1[i,j])
- b1[i,j] = old_b - h
- [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- b1[i,j] = old_b + h
- [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- b1[i,j] = old_b # reset b1[i,j]
- dbij_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(db1[i,j]), dbij_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking b2.")
- for (i in 1:nrow(b2)) {
- for (j in 1:ncol(b2)) {
- # Compute numerical derivative
- old_b = as.scalar(b2[i,j])
- b2[i,j] = old_b - h
- [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- b2[i,j] = old_b + h
- [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
- b2[i,j] = old_b # reset b2[i,j]
- dbij_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(db2[i,j]), dbij_num, lossph, lossmh)
- }
- }
-}
-
-/*
- * Test network with forward/backward functions.
- */
-two_layer_affine_l2_net_run = function(matrix[double] X, matrix[double] y,
- matrix[double] W1, matrix[double] b1,
- matrix[double] W2, matrix[double] b2)
- return (matrix[double] pred, double loss,
- matrix[double] dX,
- matrix[double] dW1, matrix[double] db1,
- matrix[double] dW2, matrix[double] db2) {
- # Compute forward pass
- [loss, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
-
- # Compute backward pass
- [dX, dpred, daout, dhout, dW1, db1, dW2, db2] =
- two_layer_affine_l2_net_backward(X, y, pred, aout, hout, W1, b1, W2, b2)
-}
-
-two_layer_affine_l2_net_forward = function(matrix[double] X, matrix[double] y,
- matrix[double] W1, matrix[double] b1,
- matrix[double] W2, matrix[double] b2)
- return (double loss, matrix[double] pred, matrix[double] aout, matrix[double] hout) {
- # Compute forward pass
- hout = affine::forward(X, W1, b1)
- aout = relu::forward(hout)
- pred = affine::forward(aout, W2, b2)
-
- # Compute loss
- loss = l2_loss::forward(pred, y)
-}
-
-two_layer_affine_l2_net_backward = function(matrix[double] X, matrix[double] y, matrix[double] pred,
- matrix[double] aout, matrix[double] hout,
- matrix[double] W1, matrix[double] b1,
- matrix[double] W2, matrix[double] b2)
- return (matrix[double] dX, matrix[double] dpred,
- matrix[double] daout, matrix[double] dhout,
- matrix[double] dW1, matrix[double] db1, matrix[double] dW2, matrix[double] db2) {
- # Compute backward pass
- dpred = l2_loss::backward(pred, y)
- [daout, dW2, db2] = affine::backward(dpred, aout, W2, b2)
- dhout = relu::backward(daout, hout)
- [dX, dW1, db1] = affine::backward(dhout, X, W1, b1)
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml b/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml
deleted file mode 100644
index 188bd6e..0000000
--- a/scripts/staging/SystemML-NN/nn/test/max_pool2d_simple.dml
+++ /dev/null
@@ -1,172 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Max Pooling layer.
- *
- * This implementation is intended to be a simple, reference version.
- */
-
-forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] out, int Hout, int Wout) {
- /*
- * Computes the forward pass for a 2D spatial max pooling layer.
- * The input data has N examples, each represented as a 3D volume
- * unrolled into a single vector.
- *
- * This implementation is intended to be a simple, reference version.
- *
- * Inputs:
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * A typical value is 0.
- * - padw: Padding for left and right sides.
- * A typical value is 0.
- *
- * Outputs:
- * - out: Outputs, of shape (N, C*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- */
- N = nrow(X)
- Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
- Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
- # Create output volume
- out = matrix(0, rows=N, cols=C*Hout*Wout)
-
- # Max pooling
- parfor (n in 1:N, check=0) { # all examples
- Xn = matrix(X[n,], rows=C, cols=Hin*Win)
-
- # Pad image
- pad_value = -1/0 # -infinity
- Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # -inf padding
- parfor (c in 1:C) {
- Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice C reshaped
- Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
- Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
- Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape
- }
- img = Xn_padded # shape (C, (Hin+2*padh)*(Win+2*padw))
-
- parfor (c in 1:C, check=0) { # all channels
- img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
- parfor (hout in 1:Hout, check=0) { # all output rows
- hin = (hout-1) * strideh + 1
- parfor (wout in 1:Wout, check=0) { # all output columns
- win = (wout-1) * stridew + 1
- out[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout] = max(img_slice[hin:hin+Hf-1,
- win:win+Wf-1])
- }
- }
- }
- }
-}
-
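The output dimensions above follow the standard pooling formula, Hout = floor((Hin + 2*padh - Hf)/strideh + 1). For instance, with the settings used by the gradient checks (Hin = Win = 4, Hf = Wf = 2, stride = 2), a quick sanity check:

Hout_pad0 = floor((4 + 2*0 - 2)/2 + 1)   # = 2, so `out` has C*2*2 columns per row
Hout_pad1 = floor((4 + 2*1 - 2)/2 + 1)   # = 3, so `out` has C*3*3 columns per row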
-backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
- int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] dX) {
- /*
- * Computes the backward pass for a 2D spatial max pooling layer.
- * The input data has N examples, each represented as a 3D volume
- * unrolled into a single vector.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of
- * shape (N, C*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * A typical value is 0.
- * - padw: Padding for left and right sides.
- * A typical value is 0.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
- */
- N = nrow(X)
-
- # Create gradient volume
- dX = matrix(0, rows=N, cols=C*Hin*Win)
-
- # Gradient of max pooling
- for (n in 1:N) { # all examples
- Xn = matrix(X[n,], rows=C, cols=Hin*Win)
-
- # Pad image
- pad_value = -1/0 # -infinity
- Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # -inf padding
- parfor (c in 1:C) {
- Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice C reshaped
- Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
- Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
- Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape
- }
- img = Xn_padded
-
- dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
- for (c in 1:C) { # all channels
- img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
- dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
- for (hout in 1:Hout) { # all output rows
- hin = (hout-1) * strideh + 1
- for (wout in 1:Wout) { # all output columns
- win = (wout-1) * stridew + 1
- img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
- max_val_ind = img_slice_patch == max(img_slice_patch) # max value indicator matrix
- # gradient passes through only for the max value(s) in this patch
- dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
- dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
- + dimg_slice_patch
- }
- }
- dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
- }
-
- # Unpad derivs on input
- dXn = matrix(0, rows=C, cols=Hin*Win)
- parfor (c in 1:C, check=0) {
- dXn_padded_slice = matrix(dimg[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
- dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
- dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
- }
- dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
- }
-}
-
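One subtlety in the backward pass above: max_val_ind is an indicator matrix, so when a patch contains ties, every maximal element receives the full upstream gradient. A small sketch of that routing for a single 2x2 patch:

patch = matrix("1 3 3 2", rows=2, cols=2)
max_val_ind = patch == max(patch)   # indicator matrix: [[0, 1], [1, 0]]
dpatch = max_val_ind * 5.0          # an upstream gradient of 5.0 flows to both maxima
print(toString(dpatch))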
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/run_tests.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/run_tests.dml b/scripts/staging/SystemML-NN/nn/test/run_tests.dml
deleted file mode 100644
index d8173a9..0000000
--- a/scripts/staging/SystemML-NN/nn/test/run_tests.dml
+++ /dev/null
@@ -1,90 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Script to run tests.
- */
-source("nn/test/grad_check.dml") as grad_check
-source("nn/test/test.dml") as test
-
-print("")
-print("Starting grad checks.")
-print("---")
-
-# Loss & loss-related functions
-grad_check::cross_entropy_loss()
-grad_check::l1_loss()
-grad_check::l1_reg()
-grad_check::l2_loss()
-grad_check::l2_reg()
-grad_check::log_loss()
-print("")
-
-# Core layers
-grad_check::affine()
-grad_check::batch_norm1d()
-grad_check::batch_norm2d()
-grad_check::conv2d()
-grad_check::conv2d_builtin()
-grad_check::conv2d_simple()
-grad_check::dropout()
-grad_check::lstm()
-grad_check::max_pool2d()
-grad_check::max_pool2d_builtin()
-grad_check::max_pool2d_simple()
-grad_check::relu()
-grad_check::rnn()
-grad_check::scale_shift1d()
-grad_check::scale_shift2d()
-grad_check::sigmoid()
-grad_check::softmax()
-grad_check::tanh()
-print("")
-
-# Example model
-grad_check::two_layer_affine_l2_net()
-print("")
-
-print("---")
-print("Grad checks complete -- look for any ERRORs or WARNINGs.")
-print("If any tests involving ReLUs failed, try a few times " +
- "to ensure that they were not false negatives due to " +
- "kinks being crossed.")
-print("")
-
-print("")
-print("Starting other tests.")
-print("---")
-
-test::batch_norm1d()
-test::batch_norm2d()
-test::conv2d()
-test::cross_entropy_loss()
-test::im2col()
-test::max_pool2d()
-test::padding()
-test::tanh()
-
-print("---")
-print("Other tests complete -- look for any ERRORs or WARNINGs.")
-print("")
-print("")
-
[11/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn`
library to `scripts/nn`
Posted by du...@apache.org.
[SYSTEMML-1524] Graduate `nn` library to `scripts/nn`
This graduates the SystemML `nn` deep learning library from the staging
directory to the top-level `scripts` directory. The aim is to have the
library ready for full release by the 1.0 release, alongside Caffe2DML,
GPU support, and native BLAS.
Closes #472.
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/43c321d1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/43c321d1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/43c321d1
Branch: refs/heads/master
Commit: 43c321d18675d9b76483e0d1d8b156196172efdb
Parents: 1f5cf69
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Wed Apr 26 14:40:46 2017 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Wed Apr 26 14:40:46 2017 -0700
----------------------------------------------------------------------
scripts/nn/README.md | 183 ++
scripts/nn/examples/Example - MNIST LeNet.ipynb | 189 ++
.../Example - MNIST Softmax Classifier.ipynb | 179 ++
scripts/nn/examples/README.md | 74 +
scripts/nn/examples/get_mnist_data.sh | 28 +
scripts/nn/examples/mnist_lenet-predict.dml | 91 +
scripts/nn/examples/mnist_lenet-train.dml | 123 ++
scripts/nn/examples/mnist_lenet.dml | 331 ++++
scripts/nn/examples/mnist_softmax-predict.dml | 77 +
scripts/nn/examples/mnist_softmax-train.dml | 110 ++
scripts/nn/examples/mnist_softmax.dml | 178 ++
scripts/nn/layers/affine.dml | 92 +
scripts/nn/layers/batch_norm1d.dml | 210 +++
scripts/nn/layers/batch_norm2d.dml | 238 +++
scripts/nn/layers/conv2d.dml | 194 ++
scripts/nn/layers/conv2d_builtin.dml | 160 ++
scripts/nn/layers/cross_entropy_loss.dml | 78 +
scripts/nn/layers/dropout.dml | 76 +
scripts/nn/layers/l1_loss.dml | 72 +
scripts/nn/layers/l1_reg.dml | 56 +
scripts/nn/layers/l2_loss.dml | 72 +
scripts/nn/layers/l2_reg.dml | 56 +
scripts/nn/layers/log_loss.dml | 76 +
scripts/nn/layers/lstm.dml | 260 +++
scripts/nn/layers/max_pool2d.dml | 159 ++
scripts/nn/layers/max_pool2d_builtin.dml | 103 +
scripts/nn/layers/relu.dml | 59 +
scripts/nn/layers/rnn.dml | 183 ++
scripts/nn/layers/scale_shift1d.dml | 95 +
scripts/nn/layers/scale_shift2d.dml | 107 ++
scripts/nn/layers/sigmoid.dml | 62 +
scripts/nn/layers/softmax.dml | 87 +
scripts/nn/layers/tanh.dml | 65 +
scripts/nn/optim/adagrad.dml | 77 +
scripts/nn/optim/adam.dml | 97 +
scripts/nn/optim/rmsprop.dml | 79 +
scripts/nn/optim/sgd.dml | 42 +
scripts/nn/optim/sgd_momentum.dml | 71 +
scripts/nn/optim/sgd_nesterov.dml | 81 +
scripts/nn/test/README.md | 32 +
scripts/nn/test/conv2d_simple.dml | 213 +++
scripts/nn/test/grad_check.dml | 1769 ++++++++++++++++++
scripts/nn/test/max_pool2d_simple.dml | 172 ++
scripts/nn/test/run_tests.dml | 90 +
scripts/nn/test/test.dml | 549 ++++++
scripts/nn/test/util.dml | 155 ++
scripts/nn/util.dml | 202 ++
scripts/staging/SystemML-NN/README.md | 183 --
.../nn/examples/Example - MNIST LeNet.ipynb | 189 --
.../Example - MNIST Softmax Classifier.ipynb | 179 --
.../staging/SystemML-NN/nn/examples/README.md | 74 -
.../SystemML-NN/nn/examples/get_mnist_data.sh | 28 -
.../nn/examples/mnist_lenet-predict.dml | 91 -
.../nn/examples/mnist_lenet-train.dml | 123 --
.../SystemML-NN/nn/examples/mnist_lenet.dml | 331 ----
.../nn/examples/mnist_softmax-predict.dml | 77 -
.../nn/examples/mnist_softmax-train.dml | 110 --
.../SystemML-NN/nn/examples/mnist_softmax.dml | 178 --
.../staging/SystemML-NN/nn/layers/affine.dml | 92 -
.../SystemML-NN/nn/layers/batch_norm1d.dml | 210 ---
.../SystemML-NN/nn/layers/batch_norm2d.dml | 238 ---
.../staging/SystemML-NN/nn/layers/conv2d.dml | 194 --
.../SystemML-NN/nn/layers/conv2d_builtin.dml | 160 --
.../nn/layers/cross_entropy_loss.dml | 78 -
.../staging/SystemML-NN/nn/layers/dropout.dml | 76 -
.../staging/SystemML-NN/nn/layers/l1_loss.dml | 72 -
.../staging/SystemML-NN/nn/layers/l1_reg.dml | 56 -
.../staging/SystemML-NN/nn/layers/l2_loss.dml | 72 -
.../staging/SystemML-NN/nn/layers/l2_reg.dml | 56 -
.../staging/SystemML-NN/nn/layers/log_loss.dml | 76 -
scripts/staging/SystemML-NN/nn/layers/lstm.dml | 260 ---
.../SystemML-NN/nn/layers/max_pool2d.dml | 159 --
.../nn/layers/max_pool2d_builtin.dml | 103 -
scripts/staging/SystemML-NN/nn/layers/relu.dml | 59 -
scripts/staging/SystemML-NN/nn/layers/rnn.dml | 183 --
.../SystemML-NN/nn/layers/scale_shift1d.dml | 95 -
.../SystemML-NN/nn/layers/scale_shift2d.dml | 107 --
.../staging/SystemML-NN/nn/layers/sigmoid.dml | 62 -
.../staging/SystemML-NN/nn/layers/softmax.dml | 87 -
scripts/staging/SystemML-NN/nn/layers/tanh.dml | 65 -
.../staging/SystemML-NN/nn/optim/adagrad.dml | 77 -
scripts/staging/SystemML-NN/nn/optim/adam.dml | 97 -
.../staging/SystemML-NN/nn/optim/rmsprop.dml | 79 -
scripts/staging/SystemML-NN/nn/optim/sgd.dml | 42 -
.../SystemML-NN/nn/optim/sgd_momentum.dml | 71 -
.../SystemML-NN/nn/optim/sgd_nesterov.dml | 81 -
scripts/staging/SystemML-NN/nn/test/README.md | 32 -
.../SystemML-NN/nn/test/conv2d_simple.dml | 213 ---
.../staging/SystemML-NN/nn/test/grad_check.dml | 1769 ------------------
.../SystemML-NN/nn/test/max_pool2d_simple.dml | 172 --
.../staging/SystemML-NN/nn/test/run_tests.dml | 90 -
scripts/staging/SystemML-NN/nn/test/test.dml | 549 ------
scripts/staging/SystemML-NN/nn/test/util.dml | 155 --
scripts/staging/SystemML-NN/nn/util.dml | 202 --
94 files changed, 7752 insertions(+), 7752 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/README.md
----------------------------------------------------------------------
diff --git a/scripts/nn/README.md b/scripts/nn/README.md
new file mode 100644
index 0000000..b80f2c6
--- /dev/null
+++ b/scripts/nn/README.md
@@ -0,0 +1,183 @@
+<!--
+{% comment %}
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements. See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+
+# SystemML-NN
+
+### A deep learning library for [Apache SystemML](https://github.com/apache/incubator-systemml).
+
+## Examples:
+#### Please see the [`examples`](examples) folder for more detailed examples, or view the following two quick examples.
+### Neural net for regression with vanilla SGD:
+```python
+# Imports
+source("nn/layers/affine.dml") as affine
+source("nn/layers/l2_loss.dml") as l2_loss
+source("nn/layers/relu.dml") as relu
+source("nn/optim/sgd.dml") as sgd
+
+# Generate input data
+N = 1024 # num examples
+D = 100 # num features
+t = 1 # num targets
+X = rand(rows=N, cols=D, pdf="normal")
+y = rand(rows=N, cols=t)
+
+# Create 2-layer network:
+## affine1 -> relu1 -> affine2
+M = 64 # number of neurons
+[W1, b1] = affine::init(D, M)
+[W2, b2] = affine::init(M, t)
+
+# Initialize optimizer
+lr = 0.05 # learning rate
+mu = 0.9 # momentum
+decay = 0.99 # learning rate decay constant
+
+# Optimize
+print("Starting optimization")
+batch_size = 32
+epochs = 5
+iters = 1024 / batch_size
+for (e in 1:epochs) {
+ for(i in 1:iters) {
+ # Get next batch
+ X_batch = X[(i-1)*batch_size+1:i*batch_size,]
+ y_batch = y[(i-1)*batch_size+1:i*batch_size,]
+
+ # Compute forward pass
+ out1 = affine::forward(X_batch, W1, b1)
+ outr1 = relu::forward(out1)
+ out2 = affine::forward(outr1, W2, b2)
+
+ # Compute loss
+ loss = l2_loss::forward(out2, y_batch)
+ print("L2 loss: " + loss)
+
+ # Compute backward pass
+ dout2 = l2_loss::backward(out2, y_batch)
+ [doutr1, dW2, db2] = affine::backward(dout2, outr1, W2, b2)
+ dout1 = relu::backward(doutr1, out1)
+ [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1)
+
+ # Optimize with vanilla SGD
+ W1 = sgd::update(W1, dW1, lr)
+ b1 = sgd::update(b1, db1, lr)
+ W2 = sgd::update(W2, dW2, lr)
+ b2 = sgd::update(b2, db2, lr)
+ }
+ # Decay learning rate
+ lr = lr * decay
+}
+```
+
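After training, predictions for held-out data reuse the same forward functions. A minimal sketch, where `X_new` is a placeholder for new inputs (not defined in the example above):

# Sketch: predict with the trained parameters
X_new = rand(rows=8, cols=D, pdf="normal")   # placeholder for held-out data
out1 = affine::forward(X_new, W1, b1)
outr1 = relu::forward(out1)
y_pred = affine::forward(outr1, W2, b2)      # predictions, shape (8, t)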
+### Neural net for multi-class classification with dropout and SGD w/ Nesterov momentum:
+```python
+# Imports
+source("nn/layers/affine.dml") as affine
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/dropout.dml") as dropout
+source("nn/layers/relu.dml") as relu
+source("nn/layers/softmax.dml") as softmax
+source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
+
+# Generate input data
+N = 1024 # num examples
+D = 100 # num features
+t = 5 # num targets
+X = rand(rows=N, cols=D, pdf="normal")
+classes = round(rand(rows=N, cols=1, min=1, max=t, pdf="uniform"))
+y = matrix(0, rows=N, cols=t)
+parfor (i in 1:N) {
+ y[i, as.scalar(classes[i,1])] = 1 # one-hot encoding
+}
+
+# Create network:
+# affine1 -> relu1 -> dropout1 -> affine2 -> relu2 -> dropout2 -> affine3 -> softmax
+H1 = 64 # number of neurons in 1st hidden layer
+H2 = 64 # number of neurons in 2nd hidden layer
+p = 0.5 # dropout probability
+[W1, b1] = affine::init(D, H1)
+[W2, b2] = affine::init(H1, H2)
+[W3, b3] = affine::init(H2, t)
+
+# Initialize SGD w/ Nesterov momentum optimizer
+lr = 0.05 # learning rate
+mu = 0.5 # momentum
+decay = 0.99 # learning rate decay constant
+vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
+vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
+vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
+
+# Optimize
+print("Starting optimization")
+batch_size = 64
+epochs = 10
+iters = 1024 / batch_size
+for (e in 1:epochs) {
+ for(i in 1:iters) {
+ # Get next batch
+ X_batch = X[(i-1)*batch_size+1:i*batch_size,]
+ y_batch = y[(i-1)*batch_size+1:i*batch_size,]
+
+ # Compute forward pass
+ ## layer 1:
+ out1 = affine::forward(X_batch, W1, b1)
+ outr1 = relu::forward(out1)
+ [outd1, maskd1] = dropout::forward(outr1, p, -1)
+ ## layer 2:
+ out2 = affine::forward(outd1, W2, b2)
+ outr2 = relu::forward(out2)
+ [outd2, maskd2] = dropout::forward(outr2, p, -1)
+ ## layer 3:
+ out3 = affine::forward(outd2, W3, b3)
+ probs = softmax::forward(out3)
+
+ # Compute loss
+ loss = cross_entropy_loss::forward(probs, y_batch)
+ print("Cross entropy loss: " + loss)
+
+ # Compute backward pass
+ ## loss:
+ dprobs = cross_entropy_loss::backward(probs, y_batch)
+ ## layer 3:
+ dout3 = softmax::backward(dprobs, out3)
+ [doutd2, dW3, db3] = affine::backward(dout3, outd2, W3, b3)
+ ## layer 2:
+ doutr2 = dropout::backward(doutd2, outr2, p, maskd2)
+ dout2 = relu::backward(doutr2, out2)
+ [doutd1, dW2, db2] = affine::backward(dout2, outd1, W2, b2)
+ ## layer 1:
+ doutr1 = dropout::backward(doutd1, outr1, p, maskd1)
+ dout1 = relu::backward(doutr1, out1)
+ [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1)
+
+ # Optimize with SGD w/ Nesterov momentum
+ [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1)
+ [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1)
+ [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2)
+ [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2)
+ [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3)
+ [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3)
+ }
+ # Anneal momentum towards 0.999
+ mu = mu + (0.999 - mu)/(1+epochs-e)
+ # Decay learning rate
+ lr = lr * decay
+}
+```
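For the classifier above, accuracy can be estimated after training with a plain forward pass, omitting the dropout layers since dropout is a train-time regularizer. A sketch:

# Sketch: evaluate accuracy on the training set (dropout skipped at eval time)
out1 = affine::forward(X, W1, b1)
outr1 = relu::forward(out1)
out2 = affine::forward(outr1, W2, b2)
outr2 = relu::forward(out2)
out3 = affine::forward(outr2, W3, b3)
probs = softmax::forward(out3)
accuracy = mean(rowIndexMax(probs) == rowIndexMax(y))
print("Training accuracy: " + accuracy)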
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/Example - MNIST LeNet.ipynb
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/Example - MNIST LeNet.ipynb b/scripts/nn/examples/Example - MNIST LeNet.ipynb
new file mode 100644
index 0000000..0423269
--- /dev/null
+++ b/scripts/nn/examples/Example - MNIST LeNet.ipynb
@@ -0,0 +1,189 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Quick Setup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create a SystemML MLContext object\n",
+ "from systemml import MLContext, dml\n",
+ "ml = MLContext(sc)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Download Data - MNIST"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9]. Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%sh\n",
+ "mkdir -p data/mnist/\n",
+ "cd data/mnist/\n",
+ "curl -O https://pjreddie.com/media/files/mnist_train.csv\n",
+ "curl -O https://pjreddie.com/media/files/mnist_test.csv"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## SystemML \"LeNet\" Neural Network"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1. Train"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "script_string = \"\"\"\n",
+ "source(\"nn/examples/mnist_lenet.dml\") as mnist_lenet\n",
+ "\n",
+ "# Read training data\n",
+ "data = read($data, format=\"csv\")\n",
+ "n = nrow(data)\n",
+ "\n",
+ "# Extract images and labels\n",
+ "images = data[,2:ncol(data)]\n",
+ "labels = data[,1]\n",
+ "\n",
+ "# Scale images to [-1,1], and one-hot encode the labels\n",
+ "images = (images / 255.0) * 2 - 1\n",
+ "labels = table(seq(1, n), labels+1, n, 10)\n",
+ "\n",
+ "# Split into training (55,000 examples) and validation (5,000 examples)\n",
+ "X = images[5001:nrow(images),]\n",
+ "X_val = images[1:5000,]\n",
+ "y = labels[5001:nrow(images),]\n",
+ "y_val = labels[1:5000,]\n",
+ "\n",
+ "# Train\n",
+ "epochs = 10\n",
+ "[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, epochs)\n",
+ "\"\"\"\n",
+ "script = (dml(script_string).input(\"$data\", \"data/mnist/mnist_train.csv\")\n",
+ " .input(C=1, Hin=28, Win=28)\n",
+ " .output(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))\n",
+ "W1, b1, W2, b2, W3, b3, W4, b4 = (ml.execute(script)\n",
+ " .get(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2. Compute Test Accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "script_string = \"\"\"\n",
+ "source(\"nn/examples/mnist_lenet.dml\") as mnist_lenet\n",
+ "\n",
+ "# Read test data\n",
+ "data = read($data, format=\"csv\")\n",
+ "n = nrow(data)\n",
+ "\n",
+ "# Extract images and labels\n",
+ "X_test = data[,2:ncol(data)]\n",
+ "y_test = data[,1]\n",
+ "\n",
+ "# Scale images to [-1,1], and one-hot encode the labels\n",
+ "X_test = (X_test / 255.0) * 2 - 1\n",
+ "y_test = table(seq(1, n), y_test+1, n, 10)\n",
+ "\n",
+ "# Eval on test set\n",
+ "probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)\n",
+ "[loss, accuracy] = mnist_lenet::eval(probs, y_test)\n",
+ "\n",
+ "print(\"Test Accuracy: \" + accuracy)\n",
+ "\"\"\"\n",
+ "script = dml(script_string).input(**{\"$data\": \"data/mnist/mnist_train.csv\",\n",
+ " \"C\": 1, \"Hin\": 28, \"Win\": 28,\n",
+ " \"W1\": W1, \"b1\": b1,\n",
+ " \"W2\": W2, \"b2\": b2,\n",
+ " \"W3\": W3, \"b3\": b3,\n",
+ " \"W4\": W4, \"b4\": b4})\n",
+ "ml.execute(script)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3. Extract Model Into Spark DataFrames For Future Use"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "W1_df = W1.toDF()\n",
+ "b1_df = b1.toDF()\n",
+ "W2_df = W2.toDF()\n",
+ "b2_df = b2.toDF()\n",
+ "W3_df = W3.toDF()\n",
+ "b3_df = b3.toDF()\n",
+ "W4_df = W4.toDF()\n",
+ "b4_df = b4.toDF()\n",
+ "W1_df, b1_df, W2_df, b2_df, W3_df, b3_df, W4_df, b4_df"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 + Spark 2.x + SystemML",
+ "language": "python",
+ "name": "pyspark3_2.x"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/Example - MNIST Softmax Classifier.ipynb
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/Example - MNIST Softmax Classifier.ipynb b/scripts/nn/examples/Example - MNIST Softmax Classifier.ipynb
new file mode 100644
index 0000000..5e7182a
--- /dev/null
+++ b/scripts/nn/examples/Example - MNIST Softmax Classifier.ipynb
@@ -0,0 +1,179 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Quick Setup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "# Create a SystemML MLContext object\n",
+ "from systemml import MLContext, dml\n",
+ "ml = MLContext(sc)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Download Data - MNIST"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9]. Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "%%sh\n",
+ "mkdir -p data/mnist/\n",
+ "cd data/mnist/\n",
+ "curl -O https://pjreddie.com/media/files/mnist_train.csv\n",
+ "curl -O https://pjreddie.com/media/files/mnist_test.csv"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## SystemML Softmax Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1. Train"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "training = \"\"\"\n",
+ "source(\"nn/examples/mnist_softmax.dml\") as mnist_softmax\n",
+ "\n",
+ "# Read training data\n",
+ "data = read($data, format=\"csv\")\n",
+ "n = nrow(data)\n",
+ "\n",
+ "# Extract images and labels\n",
+ "images = data[,2:ncol(data)]\n",
+ "labels = data[,1]\n",
+ "\n",
+ "# Scale images to [0,1], and one-hot encode the labels\n",
+ "images = images / 255.0\n",
+ "labels = table(seq(1, n), labels+1, n, 10)\n",
+ "\n",
+ "# Split into training (55,000 examples) and validation (5,000 examples)\n",
+ "X = images[5001:nrow(images),]\n",
+ "X_val = images[1:5000,]\n",
+ "y = labels[5001:nrow(images),]\n",
+ "y_val = labels[1:5000,]\n",
+ "\n",
+ "# Train\n",
+ "epochs = 1\n",
+ "[W, b] = mnist_softmax::train(X, y, X_val, y_val, epochs)\n",
+ "\"\"\"\n",
+ "script = dml(training).input(\"$data\", \"data/mnist/mnist_train.csv\").output(\"W\", \"b\")\n",
+ "W, b = ml.execute(script).get(\"W\", \"b\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2. Compute Test Accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "testing = \"\"\"\n",
+ "source(\"nn/examples/mnist_softmax.dml\") as mnist_softmax\n",
+ "\n",
+ "# Read test data\n",
+ "data = read($data, format=\"csv\")\n",
+ "n = nrow(data)\n",
+ "\n",
+ "# Extract images and labels\n",
+ "X_test = data[,2:ncol(data)]\n",
+ "y_test = data[,1]\n",
+ "\n",
+ "# Scale images to [0,1], and one-hot encode the labels\n",
+ "X_test = X_test / 255.0\n",
+ "y_test = table(seq(1, n), y_test+1, n, 10)\n",
+ "\n",
+ "# Eval on test set\n",
+ "probs = mnist_softmax::predict(X_test, W, b)\n",
+ "[loss, accuracy] = mnist_softmax::eval(probs, y_test)\n",
+ "\n",
+ "print(\"Test Accuracy: \" + accuracy)\n",
+ "\"\"\"\n",
+ "script = dml(testing).input(\"$data\", \"data/mnist/mnist_test.csv\", W=W, b=b)\n",
+ "ml.execute(script)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3. Extract Model Into Spark DataFrames For Future Use"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "W_df = W.toDF()\n",
+ "b_df = b.toDF()\n",
+ "W_df, b_df"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/README.md
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/README.md b/scripts/nn/examples/README.md
new file mode 100644
index 0000000..d5e9d04
--- /dev/null
+++ b/scripts/nn/examples/README.md
@@ -0,0 +1,74 @@
+<!--
+{% comment %}
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements. See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+
+# SystemML-NN Examples
+
+#### This folder contains scripts and PySpark Jupyter notebooks serving as examples of using the *SystemML-NN* (`nn`) deep learning library.
+
+---
+
+# Examples
+### MNIST Softmax Classifier
+
+* This example trains a softmax classifier, which is essentially a multi-class logistic regression model, on the MNIST data. The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images.
+* Notebook: `Example - MNIST Softmax Classifier.ipynb`.
+* DML Functions: `mnist_softmax.dml`
+* Training script: `mnist_softmax-train.dml`
+* Prediction script: `mnist_softmax-predict.dml`
+
+### MNIST "LeNet" Neural Net
+
+* This example trains a neural network on the MNIST data using a ["LeNet" architecture](http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf). The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images.
+* Notebook: `Example - MNIST LeNet.ipynb`.
+* DML Functions: `mnist_lenet.dml`
+* Training script: `mnist_lenet-train.dml`
+* Prediction script: `mnist_lenet-predict.dml`
+
+---
+
+# Setup
+## Code
+* To run the examples, please first download and unzip the project from GitHub using the "Clone or download" button on the [homepage of the project](https://github.com/dusenberrymw/systemml-nn), *or* clone it via the following command:
+
+ ```
+ git clone https://github.com/dusenberrymw/systemml-nn.git
+ ```
+
+* Then, move into the `systemml-nn` folder via:
+ ```
+ cd systemml-nn
+ ```
+
+## Data
+* These examples use the classic [MNIST](http://yann.lecun.com/exdb/mnist/) dataset, which contains labeled 28x28 pixel images of handwritten digits in the range of 0-9. There are 60,000 training images, and 10,000 testing images. Of the 60,000 training images, 5,000 will be used as validation images.
+* **Download**:
+ * **Notebooks**: The data will be automatically downloaded as a step in either of the example notebooks.
+ * **Training scripts**: Please run `get_mnist_data.sh` to download the data separately.
+
+## Execution
+* These examples contain scripts written in SystemML's R-like language (`*.dml`), as well as PySpark Jupyter notebooks (`*.ipynb`). The scripts contain the math for the algorithms, enclosed in functions, and the notebooks serve as full, end-to-end examples of reading in data, training models using the functions within the scripts, and evaluating final performance.
+* **Notebooks**: To run the notebook examples, please install the SystemML Python package with `pip install systemml`, and then start up Jupyter from this directory in the following manner (or, for more information, please see [this great blog post](http://spark.tc/0-to-life-changing-application-with-apache-systemml/)):
+
+ ```
+ PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook" pyspark --master local[*] --driver-memory 3G --driver-class-path SystemML.jar --jars SystemML.jar
+ ```
+
+ Note that all printed output, such as training statistics, from the SystemML scripts will be sent to the terminal in which Jupyter was started (for now...).
+
+* **Scripts**: To run the scripts from the command line using `spark-submit`, please see the comments located at the top of the `-train` and `-predict` scripts, or adapt the direct-call sketch below.
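+
+* **Library functions directly**: The example functions can also be invoked from a small DML driver script. The following is a minimal, hypothetical sketch (not one of the provided scripts); every function name and signature in it comes from `mnist_softmax.dml`:
+
+  ```
+  source("nn/examples/mnist_softmax.dml") as mnist_softmax
+
+  # Generate dummy train & validation data with MNIST-like shapes
+  [X, y, C, Hin, Win] = mnist_softmax::generate_dummy_data()
+  [X_val, y_val, C, Hin, Win] = mnist_softmax::generate_dummy_data()
+
+  # Train for one epoch, then evaluate on the training data
+  [W, b] = mnist_softmax::train(X, y, X_val, y_val, 1)
+  probs = mnist_softmax::predict(X, W, b)
+  [loss, accuracy] = mnist_softmax::eval(probs, y)
+  print("Dummy-data accuracy: " + accuracy)
+  ```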
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/get_mnist_data.sh
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/get_mnist_data.sh b/scripts/nn/examples/get_mnist_data.sh
new file mode 100755
index 0000000..deb0c40
--- /dev/null
+++ b/scripts/nn/examples/get_mnist_data.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+DIR="$(cd "$(dirname "$0")" && pwd)"
+mkdir -p "$DIR/data/mnist/"
+cd "$DIR/data/mnist/"
+curl -O https://pjreddie.com/media/files/mnist_train.csv
+curl -O https://pjreddie.com/media/files/mnist_test.csv
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/mnist_lenet-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/mnist_lenet-predict.dml b/scripts/nn/examples/mnist_lenet-predict.dml
new file mode 100644
index 0000000..85a5307
--- /dev/null
+++ b/scripts/nn/examples/mnist_lenet-predict.dml
@@ -0,0 +1,91 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# MNIST LeNet - Predict
+#
+# This script computes the class probability predictions of a
+# trained convolutional net using the "LeNet" architecture on
+# images of handwritten digits.
+#
+# Inputs:
+# - X: File containing images for prediction.
+# The format is "pixel_1, pixel_2, ..., pixel_n".
+# - C: Number of color channels in the images.
+# - Hin: Input image height.
+# - Win: Input image width.
+# - model_dir: Directory containing the trained weights and biases
+# of the model.
+# - out_dir: Directory to store class probability predictions for
+# each image.
+# - fmt: [DEFAULT: "csv"] File format of `X` and output predictions.
+# Options include: "csv", "mm", "text", and "binary".
+#
+# Outputs:
+# - probs: File containing class probability predictions for each
+# image.
+#
+# Data:
+# The X file should contain images of handwritten digits,
+# where each example is a 28x28 pixel image of grayscale values in
+# the range [0,255] stretched out as 784 pixels.
+#
+# Sample Invocation (running from outside the `nn` folder):
+# 1. Download images.
+#
+# For example, save images to `nn/examples/data/mnist/images.csv`.
+#
+# 2. Execute using Spark
+# ```
+# spark-submit --master local[*] --driver-memory 5G
+# --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
+# $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_lenet-predict.dml
+# -nvargs X=nn/examples/data/mnist/images.csv C=1 Hin=28 Win=28
+# model_dir=nn/examples/model/mnist_lenet out_dir=nn/examples/data/mnist
+# ```
+#
+source("nn/examples/mnist_lenet.dml") as mnist_lenet
+
+# Read prediction data & settings
+fmt = ifdef($fmt, "csv")
+X = read($X, format=fmt)
+C = $C
+Hin = $Hin
+Win = $Win
+
+# Scale images to [-1,1]
+X = (X / 255.0) * 2 - 1
+
+# Read model coefficients
+W1 = read($model_dir+"/W1")
+b1 = read($model_dir+"/b1")
+W2 = read($model_dir+"/W2")
+b2 = read($model_dir+"/b2")
+W3 = read($model_dir+"/W3")
+b3 = read($model_dir+"/b3")
+W4 = read($model_dir+"/W4")
+b4 = read($model_dir+"/b4")
+
+# Predict classes
+probs = mnist_lenet::predict(X, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
+
+# Output results
+write(probs, $out_dir+"/probs."+fmt, format=fmt)
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/mnist_lenet-train.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/mnist_lenet-train.dml b/scripts/nn/examples/mnist_lenet-train.dml
new file mode 100644
index 0000000..0fc733e
--- /dev/null
+++ b/scripts/nn/examples/mnist_lenet-train.dml
@@ -0,0 +1,123 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# MNIST LeNet - Train
+#
+# This script trains a convolutional net using the "LeNet" architecture
+# on images of handwritten digits.
+#
+# Inputs:
+# - train: File containing labeled MNIST training images.
+# The format is "label, pixel_1, pixel_2, ..., pixel_n".
+# - test: File containing labeled MNIST test images.
+# The format is "label, pixel_1, pixel_2, ..., pixel_n".
+# - C: Number of color channels in the images.
+# - Hin: Input image height.
+# - Win: Input image width.
+# - epochs: [DEFAULT: 10] Total number of full training loops over
+# the full data set.
+# - out_dir: [DEFAULT: "."] Directory to store weights and bias
+# matrices of trained model, as well as final test accuracy.
+# - fmt: [DEFAULT: "csv"] File format of `train` and `test` data.
+# Options include: "csv", "mm", "text", and "binary".
+#
+# Outputs:
+# - W1, W2, W3, W4: Files containing the trained weights of the model.
+# - b1, b2, b3, b4: Files containing the trained biases of the model.
+# - accuracy: File containing the final accuracy on the test data.
+#
+# Data:
+# The MNIST dataset contains labeled images of handwritten digits,
+# where each example is a 28x28 pixel image of grayscale values in
+# the range [0,255] stretched out as 784 pixels, and each label is
+# one of 10 possible digits in [0,9].
+#
+# Sample Invocation (running from outside the `nn` folder):
+# 1. Download data (60,000 training examples, and 10,000 test examples)
+# ```
+# nn/examples/get_mnist_data.sh
+# ```
+#
+# 2. Execute using Spark
+# ```
+# spark-submit --master local[*] --driver-memory 10G
+# --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
+# $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_lenet-train.dml
+# -nvargs train=nn/examples/data/mnist/mnist_train.csv test=nn/examples/data/mnist/mnist_test.csv
+# C=1 Hin=28 Win=28 epochs=10 out_dir=nn/examples/model/mnist_lenet
+# ```
+#
+source("nn/examples/mnist_lenet.dml") as mnist_lenet
+
+# Read training data & settings
+fmt = ifdef($fmt, "csv")
+train = read($train, format=fmt)
+test = read($test, format=fmt)
+C = $C
+Hin = $Hin
+Win = $Win
+epochs = ifdef($epochs, 10)
+out_dir = ifdef($out_dir, ".")
+
+# Extract images and labels
+images = train[,2:ncol(train)]
+labels = train[,1]
+X_test = test[,2:ncol(test)]
+y_test = test[,1]
+
+# Scale images to [-1,1], and one-hot encode the labels
+n = nrow(train)
+n_test = nrow(test)
+images = (images / 255.0) * 2 - 1
+labels = table(seq(1, n), labels+1, n, 10)
+X_test = (X_test / 255.0) * 2 - 1
+y_test = table(seq(1, n_test), y_test+1, n_test, 10)
+
+# Split into training (55,000 examples) and validation (5,000 examples)
+X = images[5001:nrow(images),]
+X_val = images[1:5000,]
+y = labels[5001:nrow(labels),]
+y_val = labels[1:5000,]
+
+# Train
+[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, epochs)
+
+# Write model out
+write(W1, out_dir+"/W1")
+write(b1, out_dir+"/b1")
+write(W2, out_dir+"/W2")
+write(b2, out_dir+"/b2")
+write(W3, out_dir+"/W3")
+write(b3, out_dir+"/b3")
+write(W4, out_dir+"/W4")
+write(b4, out_dir+"/b4")
+
+# Eval on test set
+probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
+[loss, accuracy] = mnist_lenet::eval(probs, y_test)
+
+# Output results
+print("Test Accuracy: " + accuracy)
+write(accuracy, out_dir+"/accuracy")
+
+print("")
+print("")
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/mnist_lenet.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/mnist_lenet.dml b/scripts/nn/examples/mnist_lenet.dml
new file mode 100644
index 0000000..e5755c4
--- /dev/null
+++ b/scripts/nn/examples/mnist_lenet.dml
@@ -0,0 +1,331 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * MNIST LeNet Example
+ */
+# Imports
+source("nn/layers/affine.dml") as affine
+source("nn/layers/conv2d_builtin.dml") as conv2d
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/dropout.dml") as dropout
+source("nn/layers/l2_reg.dml") as l2_reg
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
+source("nn/layers/relu.dml") as relu
+source("nn/layers/softmax.dml") as softmax
+source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
+
+train = function(matrix[double] X, matrix[double] y,
+ matrix[double] X_val, matrix[double] y_val,
+ int C, int Hin, int Win, int epochs)
+ return (matrix[double] W1, matrix[double] b1,
+ matrix[double] W2, matrix[double] b2,
+ matrix[double] W3, matrix[double] b3,
+ matrix[double] W4, matrix[double] b4) {
+ /*
+ * Trains a convolutional net using the "LeNet" architecture.
+ *
+ * The input matrix, X, has N examples, each represented as a 3D
+ * volume unrolled into a single vector. The targets, y, have K
+ * classes, and are one-hot encoded.
+ *
+ * Inputs:
+ * - X: Input data matrix, of shape (N, C*Hin*Win).
+ * - y: Target matrix, of shape (N, K).
+ * - X_val: Input validation data matrix, of shape (N, C*Hin*Win).
+ * - y_val: Target validation matrix, of shape (N, K).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - epochs: Total number of full training loops over the full data set.
+ *
+ * Outputs:
+ * - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
+ * - b1: 1st layer biases vector, of shape (F1, 1).
+ * - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
+ * - b2: 2nd layer biases vector, of shape (F2, 1).
+ * - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
+ * - b3: 3rd layer biases vector, of shape (1, N3).
+ * - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
+ * - b4: 4th layer biases vector, of shape (1, K).
+ */
+ N = nrow(X)
+ K = ncol(y)
+
+ # Create network:
+ # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
+ Hf = 5 # filter height
+ Wf = 5 # filter width
+ stride = 1
+ pad = 2 # For same dimensions, (Hf - stride) / 2
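+ # e.g., with Hf=5 and stride=1, pad=2 gives Hout = (Hin + 2*pad - Hf)/stride + 1 = Hin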
+
+ F1 = 32 # num conv filters in conv1
+ F2 = 64 # num conv filters in conv2
+ N3 = 512 # num nodes in affine3
+ # Note: affine4 has K nodes, which is equal to the number of target dimensions (num classes)
+
+ [W1, b1] = conv2d::init(F1, C, Hf, Wf) # inputs: (N, C*Hin*Win)
+ [W2, b2] = conv2d::init(F2, F1, Hf, Wf) # inputs: (N, F1*(Hin/2)*(Win/2))
+ [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3) # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
+ [W4, b4] = affine::init(N3, K) # inputs: (N, N3)
+ W4 = W4 / sqrt(2) # different initialization, since being fed into softmax, instead of relu
+
+ # Initialize SGD w/ Nesterov momentum optimizer
+ lr = 0.01 # learning rate
+ mu = 0.9 # momentum
+ decay = 0.95 # learning rate decay constant
+ vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
+ vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
+ vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
+ vW4 = sgd_nesterov::init(W4); vb4 = sgd_nesterov::init(b4)
+
+ # Regularization
+ lambda = 5e-04
+
+ # Optimize
+ print("Starting optimization")
+ batch_size = 64
+ iters = ceil(N / batch_size)
+ for (e in 1:epochs) {
+ for(i in 1:iters) {
+ # Get next batch
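+ # (slice bounds are 1-based and inclusive; the %% N wrap keeps beg within [1, N])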
+ beg = ((i-1) * batch_size) %% N + 1
+ end = min(N, beg + batch_size - 1)
+ X_batch = X[beg:end,]
+ y_batch = y[beg:end,]
+
+ # Compute forward pass
+ ## layer 1: conv1 -> relu1 -> pool1
+ [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ outr1 = relu::forward(outc1)
+ [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+ strideh=2, stridew=2, padh=0, padw=0)
+ ## layer 2: conv2 -> relu2 -> pool2
+ [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
+ stride, stride, pad, pad)
+ outr2 = relu::forward(outc2)
+ [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+ strideh=2, stridew=2, padh=0, padw=0)
+ ## layer 3: affine3 -> relu3 -> dropout
+ outa3 = affine::forward(outp2, W3, b3)
+ outr3 = relu::forward(outa3)
+ [outd3, maskd3] = dropout::forward(outr3, 0.5, -1)
+ ## layer 4: affine4 -> softmax
+ outa4 = affine::forward(outd3, W4, b4)
+ probs = softmax::forward(outa4)
+
+ # Compute loss & accuracy for training & validation data every 100 iterations.
+ if (i %% 100 == 0) {
+ # Compute training loss & accuracy
+ loss_data = cross_entropy_loss::forward(probs, y_batch)
+ loss_reg_W1 = l2_reg::forward(W1, lambda)
+ loss_reg_W2 = l2_reg::forward(W2, lambda)
+ loss_reg_W3 = l2_reg::forward(W3, lambda)
+ loss_reg_W4 = l2_reg::forward(W4, lambda)
+ loss = loss_data + loss_reg_W1 + loss_reg_W2 + loss_reg_W3 + loss_reg_W4
+ accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
+
+ # Compute validation loss & accuracy
+ probs_val = predict(X_val, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
+ loss_val = cross_entropy_loss::forward(probs_val, y_val)
+ accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
+
+ # Output results
+ print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
+ + accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
+ }
+
+ # Compute data backward pass
+ ## loss:
+ dprobs = cross_entropy_loss::backward(probs, y_batch)
+ ## layer 4: affine4 -> softmax
+ douta4 = softmax::backward(dprobs, outa4)
+ [doutd3, dW4, db4] = affine::backward(douta4, outr3, W4, b4)
+ ## layer 3: affine3 -> relu3 -> dropout
+ doutr3 = dropout::backward(doutd3, outr3, 0.5, maskd3)
+ douta3 = relu::backward(doutr3, outa3)
+ [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
+ ## layer 2: conv2 -> relu2 -> pool2
+ doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+ strideh=2, stridew=2, padh=0, padw=0)
+ doutc2 = relu::backward(doutr2, outc2)
+ [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
+ Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
+ ## layer 1: conv1 -> relu1 -> pool1
+ doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+ strideh=2, stridew=2, padh=0, padw=0)
+ doutc1 = relu::backward(doutr1, outc1)
+ [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win,
+ Hf, Wf, stride, stride, pad, pad)
+
+ # Compute regularization backward pass
+ dW1_reg = l2_reg::backward(W1, lambda)
+ dW2_reg = l2_reg::backward(W2, lambda)
+ dW3_reg = l2_reg::backward(W3, lambda)
+ dW4_reg = l2_reg::backward(W4, lambda)
+ dW1 = dW1 + dW1_reg
+ dW2 = dW2 + dW2_reg
+ dW3 = dW3 + dW3_reg
+ dW4 = dW4 + dW4_reg
+
+ # Optimize with SGD w/ Nesterov momentum
+ [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1)
+ [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1)
+ [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2)
+ [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2)
+ [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3)
+ [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3)
+ [W4, vW4] = sgd_nesterov::update(W4, dW4, lr, mu, vW4)
+ [b4, vb4] = sgd_nesterov::update(b4, db4, lr, mu, vb4)
+ }
+ # Anneal momentum towards 0.999
+ #mu = mu + (0.999 - mu)/(1+epochs-e)
+ # Decay learning rate
+ lr = lr * decay
+ }
+}
+
+predict = function(matrix[double] X, int C, int Hin, int Win,
+ matrix[double] W1, matrix[double] b1,
+ matrix[double] W2, matrix[double] b2,
+ matrix[double] W3, matrix[double] b3,
+ matrix[double] W4, matrix[double] b4)
+ return (matrix[double] probs) {
+ /*
+ * Computes the class probability predictions of a convolutional
+ * net using the "LeNet" architecture.
+ *
+ * The input matrix, X, has N examples, each represented as a 3D
+ * volume unrolled into a single vector.
+ *
+ * Inputs:
+ * - X: Input data matrix, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
+ * - b1: 1st layer biases vector, of shape (F1, 1).
+ * - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
+ * - b2: 2nd layer biases vector, of shape (F2, 1).
+ * - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
+ * - b3: 3rd layer biases vector, of shape (1, N3).
+ * - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
+ * - b4: 4th layer biases vector, of shape (1, K).
+ *
+ * Outputs:
+ * - probs: Class probabilities, of shape (N, K).
+ */
+ N = nrow(X)
+
+ # Network:
+ # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
+ Hf = 5 # filter height
+ Wf = 5 # filter width
+ stride = 1
+ pad = 2 # For same dimensions, (Hf - stride) / 2
+
+ F1 = nrow(W1) # num conv filters in conv1
+ F2 = nrow(W2) # num conv filters in conv2
+ N3 = ncol(W3) # num nodes in affine3
+ K = ncol(W4) # num nodes in affine4, equal to number of target dimensions (num classes)
+
+ # Compute predictions over mini-batches
+ probs = matrix(0, rows=N, cols=K)
+ batch_size = 64
+ iters = ceil(N / batch_size)
+ for(i in 1:iters) {
+ # Get next batch
+ beg = ((i-1) * batch_size) %% N + 1
+ end = min(N, beg + batch_size - 1)
+ X_batch = X[beg:end,]
+
+ # Compute forward pass
+ ## layer 1: conv1 -> relu1 -> pool1
+ [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
+ pad, pad)
+ outr1 = relu::forward(outc1)
+ [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+ strideh=2, stridew=2, padh=0, padw=0)
+ ## layer 2: conv2 -> relu2 -> pool2
+ [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
+ stride, stride, pad, pad)
+ outr2 = relu::forward(outc2)
+ [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+ strideh=2, stridew=2, padh=0, padw=0)
+ ## layer 3: affine3 -> relu3
+ outa3 = affine::forward(outp2, W3, b3)
+ outr3 = relu::forward(outa3)
+ ## layer 4: affine4 -> softmax
+ outa4 = affine::forward(outr3, W4, b4)
+ probs_batch = softmax::forward(outa4)
+
+ # Store predictions
+ probs[beg:end,] = probs_batch
+ }
+}
+
+eval = function(matrix[double] probs, matrix[double] y)
+ return (double loss, double accuracy) {
+ /*
+ * Evaluates a convolutional net using the "LeNet" architecture.
+ *
+ * The probs matrix contains the class probability predictions
+ * of K classes over N examples. The targets, y, have K classes,
+ * and are one-hot encoded.
+ *
+ * Inputs:
+ * - probs: Class probabilities, of shape (N, K).
+ * - y: Target matrix, of shape (N, K).
+ *
+ * Outputs:
+ * - loss: Scalar loss, of shape (1).
+ * - accuracy: Scalar accuracy, of shape (1).
+ */
+ # Compute loss & accuracy
+ loss = cross_entropy_loss::forward(probs, y)
+ correct_pred = rowIndexMax(probs) == rowIndexMax(y)
+ accuracy = mean(correct_pred)
+}
+
+generate_dummy_data = function()
+ return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
+ /*
+ * Generate a dummy dataset similar to the MNIST dataset.
+ *
+ * Outputs:
+ * - X: Input data matrix, of shape (N, D).
+ * - y: Target matrix, of shape (N, K).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ */
+ # Generate dummy input data
+ N = 1024 # num examples
+ C = 1 # num input channels
+ Hin = 28 # input height
+ Win = 28 # input width
+ K = 10 # num target classes
+ X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
+ classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
+ y = table(seq(1, N), classes, N, K) # one-hot encoding, guaranteed shape (N, K)
+}
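+
+# Hypothetical smoke-test sketch (not part of this file; it uses only the
+# functions defined above with their documented signatures):
+#   [X, y, C, Hin, Win] = generate_dummy_data()
+#   [X_val, y_val, C, Hin, Win] = generate_dummy_data()
+#   [W1, b1, W2, b2, W3, b3, W4, b4] = train(X, y, X_val, y_val, C, Hin, Win, 1)
+#   probs = predict(X, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
+#   [loss, accuracy] = eval(probs, y)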
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/mnist_softmax-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/mnist_softmax-predict.dml b/scripts/nn/examples/mnist_softmax-predict.dml
new file mode 100644
index 0000000..4c8c434
--- /dev/null
+++ b/scripts/nn/examples/mnist_softmax-predict.dml
@@ -0,0 +1,77 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# MNIST Softmax - Predict
+#
+# This script computes the class probability predictions of a
+# trained softmax classifier on images of handwritten digits.
+#
+# Inputs:
+# - X: File containing images for prediction.
+# The format is "pixel_1, pixel_2, ..., pixel_n".
+# - model_dir: Directory containing the trained weights and biases
+# of the model.
+# - out_dir: Directory to store class probability predictions for
+# each image.
+# - fmt: [DEFAULT: "csv"] File format of `X` and output predictions.
+# Options include: "csv", "mm", "text", and "binary".
+#
+# Outputs:
+# - probs: File containing class probability predictions for each
+# image.
+#
+# Data:
+# The X file should contain images of handwritten digits,
+# where each example is a 28x28 pixel image of grayscale values in
+# the range [0,255] stretched out as 784 pixels.
+#
+# Sample Invocation:
+# 1. Download images.
+#
+# For example, save images to `nn/examples/data/mnist/images.csv`.
+#
+# 2. Execute using Spark
+# ```
+# spark-submit --master local[*] --driver-memory 5G
+# --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
+# $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_softmax-predict.dml
+# -nvargs X=nn/examples/data/mnist/images.csv
+# model_dir=nn/examples/model/mnist_softmax out_dir=nn/examples/data/mnist
+# ```
+#
+source("nn/examples/mnist_softmax.dml") as mnist_softmax
+
+# Read prediction data
+fmt = ifdef($fmt, "csv")
+X = read($X, format=fmt)
+
+# Scale images to [0,1]
+X = X / 255.0
+
+# Read model coefficients
+W = read($model_dir+"/W")
+b = read($model_dir+"/b")
+
+# Predict classes
+probs = mnist_softmax::predict(X, W, b)
+
+# Output results
+write(probs, $out_dir+"/probs."+fmt, format=fmt)
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/mnist_softmax-train.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/mnist_softmax-train.dml b/scripts/nn/examples/mnist_softmax-train.dml
new file mode 100644
index 0000000..09970f0
--- /dev/null
+++ b/scripts/nn/examples/mnist_softmax-train.dml
@@ -0,0 +1,110 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# MNIST Softmax - Train
+#
+# This script trains a softmax classifier on images of handwritten
+# digits.
+#
+# Inputs:
+# - train: File containing labeled MNIST training images.
+# The format is "label, pixel_1, pixel_2, ..., pixel_n".
+# - test: File containing labeled MNIST test images.
+# The format is "label, pixel_1, pixel_2, ..., pixel_n".
+# - epochs: [DEFAULT: 1] Total number of full training loops over
+#     the full data set.
+# - out_dir: [DEFAULT: "."] Directory to store weights and bias
+#     matrices of trained model, as well as final test accuracy.
+# - fmt: [DEFAULT: "csv"] File format of `train` and `test` data.
+# Options include: "csv", "mm", "text", and "binary".
+#
+# Outputs:
+# - W: File containing the trained weights of the model.
+# - b: File containing the trained biases of the model.
+# - accuracy: File containing the final accuracy on the test data.
+#
+# Data:
+# The MNIST dataset contains labeled images of handwritten digits,
+# where each example is a 28x28 pixel image of grayscale values in
+# the range [0,255] stretched out as 784 pixels, and each label is
+# one of 10 possible digits in [0,9].
+#
+# Sample Invocation (running from outside the `nn` folder):
+# 1. Download data (60,000 training examples, and 10,000 test examples)
+# ```
+# nn/examples/get_mnist_data.sh
+# ```
+#
+# 2. Execute using Spark
+# ```
+# spark-submit --master local[*] --driver-memory 10G
+# --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
+# $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_softmax-train.dml
+# -nvargs train=nn/examples/data/mnist/mnist_train.csv test=nn/examples/data/mnist/mnist_test.csv
+# epochs=1 out_dir=nn/examples/model/mnist_softmax
+# ```
+#
+source("nn/examples/mnist_softmax.dml") as mnist_softmax
+
+# Read training data & settings
+fmt = ifdef($fmt, "csv")
+train = read($train, format=fmt)
+test = read($test, format=fmt)
+epochs = ifdef($epochs, 1)
+out_dir = ifdef($out_dir, ".")
+
+# Extract images and labels
+images = train[,2:ncol(train)]
+labels = train[,1]
+X_test = test[,2:ncol(test)]
+y_test = test[,1]
+
+# Scale images to [0,1], and one-hot encode the labels
+n = nrow(train)
+n_test = nrow(test)
+classes = 10
+images = images / 255.0
+labels = table(seq(1, n), labels+1, n, classes)
+X_test = X_test / 255.0
+y_test = table(seq(1, n_test), y_test+1, n_test, classes)
+
+# Split into training (55,000 examples) and validation (5,000 examples)
+X = images[5001:nrow(images),]
+X_val = images[1:5000,]
+y = labels[5001:nrow(labels),]
+y_val = labels[1:5000,]
+
+# Train
+[W, b] = mnist_softmax::train(X, y, X_val, y_val, epochs)
+
+# Write model out
+write(W, out_dir+"/W")
+write(b, out_dir+"/b")
+
+# Eval on test set
+probs = mnist_softmax::predict(X_test, W, b)
+[loss, accuracy] = mnist_softmax::eval(probs, y_test)
+
+# Output results
+print("Test Accuracy: " + accuracy)
+write(accuracy, out_dir+"/accuracy")
+
+print("")
+print("")
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/examples/mnist_softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/examples/mnist_softmax.dml b/scripts/nn/examples/mnist_softmax.dml
new file mode 100644
index 0000000..a529a12
--- /dev/null
+++ b/scripts/nn/examples/mnist_softmax.dml
@@ -0,0 +1,178 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * MNIST Softmax Example
+ */
+# Imports
+source("nn/layers/affine.dml") as affine
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/softmax.dml") as softmax
+source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
+
+train = function(matrix[double] X, matrix[double] y,
+ matrix[double] X_val, matrix[double] y_val,
+ int epochs)
+ return (matrix[double] W, matrix[double] b) {
+ /*
+ * Trains a softmax classifier.
+ *
+ * The input matrix, X, has N examples, each with D features.
+ * The targets, y, have K classes, and are one-hot encoded.
+ *
+ * Inputs:
+ * - X: Input data matrix, of shape (N, D).
+ * - y: Target matrix, of shape (N, K).
+ * - X_val: Input validation data matrix, of shape (N, D).
+ * - y_val: Target validation matrix, of shape (N, K).
+ * - epochs: Total number of full training loops over the full data set.
+ *
+ * Outputs:
+ * - W: Weights (parameters) matrix, of shape (D, K).
+ * - b: Biases vector, of shape (1, K).
+ */
+ N = nrow(X) # num examples
+ D = ncol(X) # num features
+ K = ncol(y) # num classes
+
+ # Create softmax classifier:
+ # affine -> softmax
+ [W, b] = affine::init(D, K)
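+ # Rescale: affine::init draws W ~ N(0,1) * sqrt(2/D) (He init, intended for relu);
+ # dividing by sqrt(2/D) and multiplying by sqrt(1/D) leaves W ~ N(0,1) * sqrt(1/D),
+ # a smaller scale better suited to a single linear layer feeding softmax.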
+ W = W / sqrt(2.0/(D)) * sqrt(1/(D))
+
+ # Initialize SGD w/ Nesterov momentum optimizer
+ lr = 0.2 # learning rate
+ mu = 0 # momentum
+ decay = 0.99 # learning rate decay constant
+ vW = sgd_nesterov::init(W) # optimizer momentum state for W
+ vb = sgd_nesterov::init(b) # optimizer momentum state for b
+
+ # Optimize
+ print("Starting optimization")
+ batch_size = 50
+ iters = 1000 # fixed iteration count per epoch (instead of ceil(N / batch_size))
+ for (e in 1:epochs) {
+ for(i in 1:iters) {
+ # Get next batch
+ beg = ((i-1) * batch_size) %% N + 1
+ end = min(N, beg + batch_size - 1)
+ X_batch = X[beg:end,]
+ y_batch = y[beg:end,]
+
+ # Compute forward pass
+ ## affine & softmax:
+ out = affine::forward(X_batch, W, b)
+ probs = softmax::forward(out)
+
+ # Compute loss & accuracy for training & validation data
+ loss = cross_entropy_loss::forward(probs, y_batch)
+ accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
+ probs_val = predict(X_val, W, b)
+ loss_val = cross_entropy_loss::forward(probs_val, y_val)
+ accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
+ print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: " +
+ accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
+
+ # Compute backward pass
+ ## loss:
+ dprobs = cross_entropy_loss::backward(probs, y_batch)
+ ## affine & softmax:
+ dout = softmax::backward(dprobs, out)
+ [dX_batch, dW, db] = affine::backward(dout, X_batch, W, b)
+
+ # Optimize with SGD w/ Nesterov momentum
+ [W, vW] = sgd_nesterov::update(W, dW, lr, mu, vW)
+ [b, vb] = sgd_nesterov::update(b, db, lr, mu, vb)
+ }
+ # Anneal momentum towards 0.999
+ mu = mu + (0.999 - mu)/(1+epochs-e)
+ # Decay learning rate
+ lr = lr * decay
+ }
+}
+
+predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
+ return (matrix[double] probs) {
+ /*
+ * Computes the class probability predictions of a softmax classifier.
+ *
+ * The input matrix, X, has N examples, each with D features.
+ *
+ * Inputs:
+ * - X: Input data matrix, of shape (N, D).
+ * - W: Weights (parameters) matrix, of shape (D, K).
+ * - b: Biases vector, of shape (1, K).
+ *
+ * Outputs:
+ * - probs: Class probabilities, of shape (N, K).
+ */
+ # Compute forward pass
+ ## affine & softmax:
+ out = affine::forward(X, W, b)
+ probs = softmax::forward(out)
+}
+
+eval = function(matrix[double] probs, matrix[double] y)
+ return (double loss, double accuracy) {
+ /*
+ * Evaluates a softmax classifier.
+ *
+ * The probs matrix contains the class probability predictions
+ * of K classes over N examples. The targets, y, have K classes,
+ * and are one-hot encoded.
+ *
+ * Inputs:
+ * - probs: Class probabilities, of shape (N, K).
+ * - y: Target matrix, of shape (N, K).
+ *
+ * Outputs:
+ * - loss: Scalar loss, of shape (1).
+ * - accuracy: Scalar accuracy, of shape (1).
+ */
+ # Compute loss & accuracy
+ loss = cross_entropy_loss::forward(probs, y)
+ correct_pred = rowIndexMax(probs) == rowIndexMax(y)
+ accuracy = mean(correct_pred)
+}
+
+generate_dummy_data = function()
+ return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
+ /*
+ * Generate a dummy dataset similar to the MNIST dataset.
+ *
+ * Outputs:
+ * - X: Input data matrix, of shape (N, D).
+ * - y: Target matrix, of shape (N, K).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ */
+ # Generate dummy input data
+ N = 1024 # num examples
+ C = 1 # num input channels
+ Hin = 28 # input height
+ Win = 28 # input width
+ T = 10 # num targets
+ X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
+ classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))
+ y = table(seq(1, N), classes, N, T) # one-hot encoding, guaranteed shape (N, T)
+}
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/affine.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/affine.dml b/scripts/nn/layers/affine.dml
new file mode 100644
index 0000000..c9a740b
--- /dev/null
+++ b/scripts/nn/layers/affine.dml
@@ -0,0 +1,92 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Affine (fully-connected) layer.
+ */
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b)
+ return (matrix[double] out) {
+ /*
+ * Computes the forward pass for an affine (fully-connected) layer
+ * with M neurons. The input data has N examples, each with D
+ * features.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, D).
+ * - W: Weights, of shape (D, M).
+ * - b: Biases, of shape (1, M).
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, M).
+ */
+ out = X %*% W + b
+}
+
+backward = function(matrix[double] dout, matrix[double] X,
+ matrix[double] W, matrix[double] b)
+ return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
+ /*
+ * Computes the backward pass for a fully-connected (affine) layer
+ * with M neurons.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of shape (N, M).
+ * - X: Inputs, of shape (N, D).
+ * - W: Weights, of shape (D, M).
+ * - b: Biases, of shape (1, M).
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, D).
+ * - dW: Gradient wrt `W`, of shape (D, M).
+ * - db: Gradient wrt `b`, of shape (1, M).
+ */
+ dX = dout %*% t(W)
+ dW = t(X) %*% dout
+ db = colSums(dout)
+}
+
+init = function(int D, int M)
+ return (matrix[double] W, matrix[double] b) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * We use the heuristic by He et al., which limits the magnification
+ * of inputs/gradients during forward/backward passes by scaling
+ * unit-Gaussian weights by a factor of sqrt(2/n), under the
+ * assumption of relu neurons.
+ * - http://arxiv.org/abs/1502.01852
+ *
+ * Inputs:
+ * - D: Dimensionality of the input features (number of features).
+ * - M: Number of neurons in this layer.
+ *
+ * Outputs:
+ * - W: Weights, of shape (D, M).
+ * - b: Biases, of shape (1, M).
+ */
+ W = rand(rows=D, cols=M, pdf="normal") * sqrt(2.0/D)
+ b = matrix(0, rows=1, cols=M)
+}
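+
+# Hypothetical usage sketch (not part of the layer API; shapes only, mirroring
+# the docs above):
+#   [W, b] = init(D, M)                     # W: (D, M), b: (1, M)
+#   out = forward(X, W, b)                  # X: (N, D) -> out: (N, M)
+#   [dX, dW, db] = backward(dout, X, W, b)  # dout: (N, M)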
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/batch_norm1d.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/batch_norm1d.dml b/scripts/nn/layers/batch_norm1d.dml
new file mode 100644
index 0000000..2ccffdb
--- /dev/null
+++ b/scripts/nn/layers/batch_norm1d.dml
@@ -0,0 +1,210 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 1D Batch Normalization layer.
+ */
+
+forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
+ string mode, matrix[double] ema_mean, matrix[double] ema_var,
+ double mu, double epsilon)
+ return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
+ matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
+ /*
+ * Computes the forward pass for a 1D batch normalization layer.
+ * The input data has N examples, each with D features.
+ *
+ * A batch normalization layer uses the per-feature sample mean and
+ * per-feature uncorrected sample variance during training to
+ * normalize each feature of the input data. Additionally, it
+ * introduces learnable parameters (gamma, beta) to control the
+ * amount of normalization.
+ *
+ * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
+ *
+ * This implementation maintains exponential moving averages of the
+ * mean and variance during training for use during testing.
+ *
+ * Reference:
+ * - Batch Normalization: Accelerating Deep Network Training by
+ * Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
+ * - https://arxiv.org/abs/1502.03167
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, D).
+ * - gamma: Scale parameters, of shape (1, D).
+ * - beta: Shift parameters, of shape (1, D).
+ * - mode: 'train' or 'test' to indicate if the model is currently
+ * being trained or tested. During training, the current batch
+ * mean and variance will be used to normalize the inputs, while
+ * during testing, the exponential average of the mean and
+ * variance over all previous batches will be used.
+ * - ema_mean: Exponential moving average of the mean, of
+ * shape (1, D).
+ * - ema_var: Exponential moving average of the variance, of
+ * shape (1, D).
+ * - mu: Momentum value for moving averages.
+ * Typical values are in the range of [0.9, 0.999].
+ * - epsilon: Smoothing term to avoid divide by zero errors.
+ * Typical values are in the range of [1e-5, 1e-3].
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, D).
+ * - ema_mean_upd: Updated exponential moving average of the mean,
+ * of shape (1, D).
+ * - ema_var_upd: Updated exponential moving average of the variance,
+ * of shape (1, D).
+ * - cache_mean: Cache of the batch mean, of shape (1, D).
+ * Note: This is used for performance during training.
+ * - cache_var: Cache of the batch variance, of shape (1, D).
+ * Note: This is used for performance during training.
+ * - cache_norm: Cache of the normalized inputs, of shape (N, D).
+ * Note: This is used for performance during training.
+ */
+ N = nrow(X)
+
+ if (mode == 'train') {
+ # Compute feature-wise mean and variance
+ mean = colMeans(X) # shape (1, D)
+ # var = (1/N) * colSums((X-mean)^2)
+ var = colVars(X) * ((N-1)/N) # compute uncorrected variance, of shape (1, D)
+ # Update moving averages
+ ema_mean_upd = mu*ema_mean + (1-mu)*mean
+ ema_var_upd = mu*ema_var + (1-mu)*var
+ }
+ else {
+ # Use moving averages of mean and variance during testing
+ mean = ema_mean
+ var = ema_var
+ ema_mean_upd = ema_mean
+ ema_var_upd = ema_var
+ }
+
+ # Normalize, shift, and scale
+ # norm = (X-mean)*(var+epsilon)^(-1/2)
+ norm = (X-mean) / sqrt(var+epsilon) # shape (N, D)
+ out = norm*gamma + beta # shape (N, D)
+
+ # Save variables for backward pass
+ cache_mean = mean
+ cache_var = var
+ cache_norm = norm
+}
+
+backward = function(matrix[double] dout, matrix[double] out,
+ matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
+ matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
+ matrix[double] X, matrix[double] gamma, matrix[double] beta,
+ string mode, matrix[double] ema_mean, matrix[double] ema_var,
+ double mu, double epsilon)
+ return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
+ /*
+ * Computes the backward pass for a 1D batch normalization layer.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of shape (N, D).
+ * - out: Outputs from the forward pass, of shape (N, D).
+ * - ema_mean_upd: Updated exponential moving average of the mean
+ * from the forward pass, of shape (1, D).
+ * - ema_var_upd: Updated exponential moving average of the variance
+ * from the forward pass, of shape (1, D).
+ * - cache_mean: Cache of the batch mean from the forward pass, of
+ * shape (1, D). Note: This is used for performance during
+ * training.
+ * - cache_var: Cache of the batch variance from the forward pass,
+ * of shape (1, D). Note: This is used for performance during
+ * training.
+ * - cache_norm: Cache of the normalized inputs from the forward
+ * pass, of shape (N, D). Note: This is used for performance
+ * during training.
+ * - X: Inputs, of shape (N, D).
+ * - gamma: Scale parameters, of shape (1, D).
+ * - beta: Shift parameters, of shape (1, D).
+ * - mode: 'train' or 'test' to indicate if the model is currently
+ * being trained or tested. During training, the current batch
+ * mean and variance will be used to normalize the inputs, while
+ * during testing, the exponential average of the mean and
+ * variance over all previous batches will be used.
+ * - ema_mean: Exponential moving average of the mean, of
+ * shape (1, D).
+ * - ema_var: Exponential moving average of the variance, of
+ * shape (1, D).
+ * - mu: Momentum value for moving averages.
+ * Typical values are in the range of [0.9, 0.999].
+ * - epsilon: Smoothing term to avoid divide by zero errors.
+ * Typical values are in the range of [1e-5, 1e-3].
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, D).
+ * - dgamma: Gradient wrt `gamma`, of shape (1, D).
+ * - dbeta: Gradient wrt `beta`, of shape (1, D).
+ *
+ */
+ N = nrow(X)
+ mean = cache_mean
+ var = cache_var
+ norm = cache_norm
+ centered = X-mean
+
+ if (mode == 'train') {
+ # Compute gradients during training
+ dgamma = colSums(dout*norm) # shape (1, D)
+ dbeta = colSums(dout) # shape (1, D)
+ dnorm = dout * gamma # shape (N, D)
+ dvar = (-1/2) * colSums(centered * (var+epsilon)^(-3/2) * dnorm) # shape (1, D)
+ dmean = colSums((-dnorm/sqrt(var+epsilon)) + ((-2/N)*centered*dvar)) # shape (1, D)
+ dX = (dnorm/sqrt(var+epsilon)) + ((2/N)*centered*dvar) + ((1/N)*dmean) # shape (N, D)
+ }
+ else {
+ # Compute gradients during testing
+ dgamma = colSums(dout*norm) # shape (1, D)
+ dbeta = colSums(dout) # shape (1, D)
+ dnorm = dout * gamma # shape (N, D)
+ dX = dnorm / sqrt(var+epsilon) # shape (N, D)
+ }
+}
+
+init = function(int D)
+ return (matrix[double] gamma, matrix[double] beta,
+ matrix[double] ema_mean, matrix[double] ema_var) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - D: Dimensionality of the input features (number of features).
+ *
+ * Outputs:
+ * - gamma: Scale parameters, of shape (1, D).
+ * - beta: Shift parameters, of shape (1, D).
+ * - ema_mean: Exponential moving average of the mean, of
+ * shape (1, D).
+ * - ema_var: Exponential moving average of the variance, of
+ * shape (1, D).
+ */
+ gamma = matrix(1, rows=1, cols=D)
+ beta = matrix(0, rows=1, cols=D)
+ ema_mean = matrix(0, rows=1, cols=D)
+ ema_var = matrix(1, rows=1, cols=D)
+}
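+
+# Hypothetical usage sketch (argument order mirrors the signatures above):
+#   [gamma, beta, ema_mean, ema_var] = init(D)
+#   [out, ema_mean, ema_var, c_mean, c_var, c_norm] =
+#       forward(X, gamma, beta, 'train', ema_mean, ema_var, 0.9, 1e-5)
+#   [dX, dgamma, dbeta] = backward(dout, out, ema_mean, ema_var, c_mean, c_var,
+#                                  c_norm, X, gamma, beta, 'train', ema_mean,
+#                                  ema_var, 0.9, 1e-5)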
+
[07/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn`
library to `scripts/nn`
Posted by du...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/test.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/test.dml b/scripts/nn/test/test.dml
new file mode 100644
index 0000000..a5cb497
--- /dev/null
+++ b/scripts/nn/test/test.dml
@@ -0,0 +1,549 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Various tests, not including gradient checks.
+ */
+source("nn/layers/batch_norm1d.dml") as batch_norm1d
+source("nn/layers/batch_norm2d.dml") as batch_norm2d
+source("nn/layers/conv2d.dml") as conv2d
+source("nn/layers/conv2d_builtin.dml") as conv2d_builtin
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/max_pool2d.dml") as max_pool2d
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
+source("nn/layers/tanh.dml") as tanh
+source("nn/test/conv2d_simple.dml") as conv2d_simple
+source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
+source("nn/test/util.dml") as test_util
+source("nn/util.dml") as util
+
+batch_norm1d = function() {
+ /*
+ * Test for the 1D batch normalization function.
+ */
+ print("Testing the 1D batch normalization function.")
+
+ # Generate data
+ N = 4 # Number of examples
+ D = 4 # Number of features
+ mode = 'train' # execution mode
+ mu = 0.9 # momentum of moving averages
+ eps = 1e-5 # smoothing term
+ X = matrix(seq(1,16), rows=N, cols=D)
+
+ # Create layer
+ [gamma, beta, ema_mean, ema_var] = batch_norm1d::init(D)
+
+ # Forward
+ [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+
+ # Equivalency check
+ target = matrix("-1.34160721 -1.34160721 -1.34160733 -1.34160709
+ -0.44720244 -0.44720244 -0.44720244 -0.44720232
+ 0.44720244 0.44720232 0.44720244 0.44720244
+ 1.34160733 1.34160721 1.34160733 1.34160733", rows=1, cols=N*D)
+ out = matrix(out, rows=1, cols=N*D)
+ for (i in 1:length(out)) {
+ rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
+ as.scalar(target[1,i]), 1e-3, 1e-4)
+ }
+}
+
+conv2d = function() {
+ /*
+ * Test for the 2D convolution functions.
+ */
+ print("Testing the 2D convolution functions.")
+
+ # Generate data
+ N = 2 # num examples
+ C = 3 # num channels
+ Hin = 5 # input height
+ Win = 5 # input width
+ F = 2 # num filters
+ Hf = 3 # filter height
+ Wf = 3 # filter width
+ stride = 1
+ pad = 1
+ X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
+
+ # Create layer
+ [W, b] = conv2d::init(F, C, Hf, Wf)
+
+ # Forward
+ [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ [out_simple, Hout_simple, Wout_simple] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+ [out_builtin, Hout_builtin, Wout_builtin] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+
+ # Equivalency check
+ out = matrix(out, rows=1, cols=N*F*Hout*Wout)
+ out_simple = matrix(out_simple, rows=1, cols=N*F*Hout*Wout)
+ out_builtin = matrix(out_builtin, rows=1, cols=N*F*Hout*Wout)
+ for (i in 1:length(out)) {
+ rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
+ as.scalar(out_simple[1,i]), 1e-10, 1e-12)
+ rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
+ as.scalar(out_builtin[1,i]), 1e-10, 1e-12)
+ }
+}
+
+cross_entropy_loss = function() {
+ /*
+ * Test for the cross-entropy loss function.
+ *
+ * Here we make sure that the cross-entropy loss function does
+ * not propagate `infinity` values in the case that a prediction is
+ * exactly equal to 0.
+ */
+ print("Testing the cross-entropy loss function with zero-valued predictions.")
+
+ # Generate data
+ N = 3 # num examples
+ K = 10 # num targets
+ pred = matrix(0, rows=N, cols=K)
+ y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
+ y = y / rowSums(y) # normalized probs
+
+ loss = cross_entropy_loss::forward(pred, y)
+
+ inf = 1/0
+ if (loss == inf) {
+ print("ERROR: The cross-entropy loss function ouptuts infinity for all-zero predictions.")
+ }
+}
+
+im2col = function() {
+ /*
+ * Test for the `im2col` and `col2im` functions.
+ */
+ print("Testing the im2col and col2im functions.")
+
+ # Generate data
+ C = 3 # num channels
+ Hin = 5 # input height
+ Win = 5 # input width
+ Hf = 3 # filter height
+ Wf = 3 # filter width
+ stride = 2
+ pad = (Hin * stride - Hin + Hf - stride) / 2
+ Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
+ Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
+ x = rand(rows=C, cols=Hin*Win)
+
+ # pad
+ x_pad = util::pad_image(x, Hin, Win, pad, pad, 0)
+
+ # im2col
+ x_cols = util::im2col(x_pad, Hin+2*pad, Win+2*pad, Hf, Wf, stride, stride)
+
+ if (ncol(x_cols) != Hout*Wout) {
+ print("ERROR: im2col does not yield the correct output size: "
+ + ncol(x_cols)+" (actual) vs. "+Hout*Wout+" (correct).")
+ }
+
+ # col2im
+ x_pad2 = util::col2im(x_cols, C, Hin+2*pad, Win+2*pad, Hf, Wf, stride, stride, "none")
+
+ # Equivalency check
+ equivalent = test_util::all_equal(x_pad, x_pad2)
+ if (!equivalent) {
+ print("ERROR: im2col and then col2im does not yield the original image.")
+ }
+}
+
+padding = function() {
+ /*
+ * Test for the `pad_image` and `unpad_image` functions.
+ */
+ print("Testing the padding and unpadding functions.")
+
+ # Generate data
+ C = 3 # num channels
+ Hin = 5 # input height
+ Win = 5 # input width
+ pad = 3 # padding
+ x = rand(rows=C, cols=Hin*Win)
+
+ # Pad image
+ x_pad = util::pad_image(x, Hin, Win, pad, pad, 0)
+
+ # Check for padded rows & columns
+ for (c in 1:C) {
+ x_pad_slice = matrix(x_pad[c,], rows=Hin+2*pad, cols=Win+2*pad)
+ for (i in 1:pad) {
+ rowsum = sum(x_pad_slice[i,])
+ colsum = sum(x_pad_slice[,i])
+ if (rowsum != 0)
+ print("ERROR: Padding was not applied to row " + i + ".")
+ if (colsum != 0)
+ print("ERROR: Padding was not applied to column " + i + ".")
+ }
+ }
+
+ # Unpad image
+ x1 = util::unpad_image(x_pad, Hin, Win, pad, pad)
+
+ # Equivalency check
+ equivalent = test_util::all_equal(x, x1)
+ if (!equivalent) {
+ print("ERROR: Padding and then unpadding does not yield the original image.")
+ }
+}
+
+max_pool2d = function() {
+ /*
+ * Test for the 2D max pooling functions.
+ */
+ print("Testing the 2D max pooling functions.")
+
+ # Generate data
+ N = 2 # num examples
+ C = 3 # num channels
+ Hin = 8 # input height
+ Win = 8 # input width
+ Hf = 2 # filter height
+ Wf = 2 # filter width
+ stride = 2
+ X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
+
+ for (padh in 0:3) {
+ for (padw in 0:3) {
+ print(" - Testing w/ padh="+padh+" & padw="+padw+".")
+ #if (1==1) {} # force correct printing
+ #print(" - Testing forward")
+ [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, padh, padw)
+ [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
+ stride, stride,
+ padh, padw)
+ [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win,
+ Hf, Wf,
+ stride, stride,
+ padh, padw)
+
+ # Equivalency check
+ out = matrix(out, rows=1, cols=N*C*Hout*Wout)
+ out_simple = matrix(out_simple, rows=1, cols=N*C*Hout*Wout)
+ out_builtin = matrix(out_builtin, rows=1, cols=N*C*Hout*Wout)
+ for (i in 1:length(out)) {
+ rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
+ as.scalar(out_simple[1,i]), 1e-10, 1e-12)
+ rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
+ as.scalar(out_builtin[1,i]), 1e-10, 1e-12)
+ }
+
+ #print(" - Testing backward")
+ dout = rand(rows=N, cols=C*Hout*Wout, pdf="normal")
+ dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
+ padh, padw)
+ dX_simple = max_pool2d_simple::backward(dout, Hout_simple, Wout_simple, X, C, Hin, Win,
+ Hf, Wf, stride, stride, padh, padw)
+ dX_builtin = max_pool2d_builtin::backward(dout, Hout_builtin, Wout_builtin, X, C, Hin, Win,
+ Hf, Wf, stride, stride, padh, padw)
+
+ # Equivalency check
+ dX = matrix(dX, rows=1, cols=N*C*Hin*Win)
+ dX_simple = matrix(dX_simple, rows=1, cols=N*C*Hin*Win)
+ dX_builtin = matrix(dX_builtin, rows=1, cols=N*C*Hin*Win)
+ for (i in 1:length(dX)) {
+ rel_error = test_util::check_rel_error(as.scalar(dX[1,i]),
+ as.scalar(dX_simple[1,i]), 1e-10, 1e-12)
+ rel_error = test_util::check_rel_error(as.scalar(dX[1,i]),
+ as.scalar(dX_builtin[1,i]), 1e-10, 1e-12)
+ }
+ }
+ }
+
+ # ---
+ print(" - Testing for correct behavior against known answer w/ pad=0.")
+ # generate data
+ # -- channel 1
+ # 1 2 3 4
+ # 5 6 7 8
+ # 9 10 11 12
+ # 13 14 15 16
+ # -- channel 2
+ # 1 5 9 13
+ # 2 6 10 14
+ # 3 7 11 15
+ # 4 8 12 16
+ C = 2 # num channels
+ Hin = 4 # input height
+ Win = 4 # input width
+ X = matrix(seq(1,16,1), rows=Hin, cols=Win)
+ X = matrix(rbind(X, t(X)), rows=1, cols=C*Hin*Win) # C=2
+ X = rbind(X, X) # n=2
+ pad = 0
+
+ # forward
+ [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+ [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+
+ # equivalency check
+ # -- channel 1
+ # 6 8
+ # 14 16
+ # -- channel 2
+ # 6 14
+ # 8 16
+ target = matrix("6 8 14 16 6 14 8 16", rows=1, cols=C*Hout*Wout)
+ target = rbind(target, target) # n=2
+ tmp = test_util::check_all_equal(out, target)
+ tmp = test_util::check_all_equal(out_simple, target)
+ tmp = test_util::check_all_equal(out_builtin, target)
+
+ print(" - Testing for correct behavior against known answer w/ pad=1.")
+ # generate data
+ # -- channel 1
+ # 0 0 0 0 0 0
+ # 0 1 2 3 4 0
+ # 0 5 6 7 8 0
+ # 0 9 10 11 12 0
+ # 0 13 14 15 16 0
+ # 0 0 0 0 0 0
+ # -- channel 2
+ # 0 0 0 0 0 0
+ # 0 1 5 9 13 0
+ # 0 2 6 10 14 0
+ # 0 3 7 11 15 0
+ # 0 4 8 12 16 0
+ # 0 0 0 0 0 0
+ pad = 1
+
+ # forward
+ [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+ [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+
+ # equivalency check
+ # -- channel 1
+ # 1 3 4
+ # 9 11 12
+ # 13 15 16
+ # -- channel 2
+ # 1 9 13
+ # 3 11 15
+ # 4 12 16
+ target = matrix("1 3 4 9 11 12 13 15 16 1 9 13 3 11 15 4 12 16", rows=1, cols=C*Hout*Wout)
+ target = rbind(target, target) # n=2
+ tmp = test_util::check_all_equal(out, target)
+ tmp = test_util::check_all_equal(out_simple, target)
+ tmp = test_util::check_all_equal(out_builtin, target)
+
+ print(" - Testing for correct behavior against known answer w/ all negative matrix w/ pad=0.")
+ # generate data
+ # -- channel 1
+ # -1 -2 -3 -4
+ # -5 -6 -7 -8
+ # -9 -10 -11 -12
+ # -13 -14 -15 -16
+ # -- channel 2
+ # -1 -5 -9 -13
+ # -2 -6 -10 -14
+ # -3 -7 -11 -15
+ # -4 -8 -12 -16
+ X = X * -1
+ pad = 0
+
+ # forward
+ [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+ [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+
+ # equivalency check
+ # -- channel 1
+ # -1 -3
+ # -9 -11
+ # -- channel 2
+ # -1 -9
+ # -3 -11
+ target = matrix("-1 -3 -9 -11 -1 -9 -3 -11", rows=1, cols=C*Hout*Wout)
+ target = rbind(target, target) # n=2
+ tmp = test_util::check_all_equal(out, target)
+ tmp = test_util::check_all_equal(out_simple, target)
+ tmp = test_util::check_all_equal(out_builtin, target)
+
+
+ print(" - Testing for correct behavior against known answer w/ all negative matrix w/ pad=1.")
+ # generate data
+ # -- channel 1
+ # 0 0 0 0 0 0
+ # 0 -1 -2 -3 -4 0
+ # 0 -5 -6 -7 -8 0
+ # 0 -9 -10 -11 -12 0
+ # 0 -13 -14 -15 -16 0
+ # 0 0 0 0 0 0
+ # -- channel 2
+ # 0 0 0 0 0 0
+ # 0 -1 -5 -9 -13 0
+ # 0 -2 -6 -10 -14 0
+ # 0 -3 -7 -11 -15 0
+ # 0 -4 -8 -12 -16 0
+ # 0 0 0 0 0 0
+ pad = 1
+
+ # forward
+ [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+ [out_simple, Hout_simple, Wout_simple] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+ [out_builtin, Hout_builtin, Wout_builtin] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+
+ # equivalency check
+ # (the max ignores padded cells, so every output comes from the data itself)
+ # -- channel 1
+ # -1 -2 -4
+ # -5 -6 -8
+ # -13 -14 -16
+ # -- channel 2
+ # -1 -5 -13
+ # -2 -6 -14
+ # -4 -8 -16
+ target = matrix("-1 -2 -4 -5 -6 -8 -13 -14 -16 -1 -5 -13 -2 -6 -14 -4 -8 -16",
+ rows=1, cols=C*Hout*Wout)
+ target = rbind(target, target) # n=2
+ tmp = test_util::check_all_equal(out, target)
+ tmp = test_util::check_all_equal(out_simple, target)
+ tmp = test_util::check_all_equal(out_builtin, target)
+}
+
+batch_norm2d = function() {
+ /*
+ * Test for the 2D (spatial) batch normalization function.
+ */
+ print("Testing the 2D (spatial) batch normalization function.")
+
+ # Generate data
+ N = 2 # Number of examples
+ C = 3 # num channels
+ Hin = 4 # input height
+ Win = 5 # input width
+ mode = 'train' # execution mode
+ mu = 0.9 # momentum of moving averages
+ eps = 1e-5 # smoothing term
+ X = matrix("70 29 23 55 72
+ 42 98 68 48 39
+ 34 73 44 6 40
+ 74 18 18 53 53
+
+ 63 85 72 61 72
+ 32 36 23 29 63
+ 9 43 43 49 43
+ 31 43 89 94 50
+
+ 62 12 32 41 87
+ 25 48 99 52 61
+ 12 83 60 55 34
+ 30 42 68 88 51
+
+
+ 67 59 62 67 84
+ 8 76 24 19 57
+ 10 89 63 72 2
+ 59 56 16 15 70
+
+ 32 69 55 39 93
+ 84 36 4 30 40
+ 70 100 36 76 59
+ 69 15 40 24 34
+
+ 51 67 11 13 32
+ 66 85 55 85 38
+ 32 35 17 83 34
+ 55 58 52 0 99", rows=N, cols=C*Hin*Win)
+
+ # Create layer
+ [gamma, beta, ema_mean, ema_var] = batch_norm2d::init(C)
+
+ # Forward
+ [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+
+ # Equivalency check
+ target = matrix("0.86215019 -0.76679718 -1.00517964 0.26619387 0.94161105
+ -0.25030172 1.97460198 0.78268933 -0.01191914 -0.36949289
+ -0.56814504 0.98134136 -0.17084086 -1.68059683 -0.32976246
+ 1.02107191 -1.20383179 -1.20383179 0.18673301 0.18673301
+
+ 0.50426388 1.41921711 0.87856293 0.42108631 0.87856293
+ -0.78498828 -0.61863315 -1.15928721 -0.90975463 0.50426388
+ -1.74153018 -0.32751167 -0.32751167 -0.07797909 -0.32751167
+ -0.82657707 -0.32751167 1.58557224 1.79351616 -0.0363903
+
+ 0.4607178 -1.49978399 -0.71558321 -0.36269283 1.44096887
+ -0.99005347 -0.08822262 1.91148913 0.06861746 0.42150795
+ -1.49978399 1.28412855 0.38229787 0.18624771 -0.63716316
+ -0.79400325 -0.32348287 0.69597805 1.48017895 0.0294075
+
+
+ 0.74295878 0.42511559 0.54430676 0.74295878 1.41837597
+ -1.60113597 1.10053277 -0.96544927 -1.16410136 0.34565473
+ -1.52167511 1.61702824 0.5840373 0.94161105 -1.83951855
+ 0.42511559 0.30592418 -1.28329265 -1.32302308 0.86215019
+
+ -0.78498828 0.75379658 0.17155361 -0.4938668 1.75192738
+ 1.37762833 -0.61863315 -1.9494741 -0.86816585 -0.45227802
+ 0.79538536 2.04304862 -0.61863315 1.04491806 0.33790874
+ 0.75379658 -1.49199748 -0.45227802 -1.11769855 -0.70181072
+
+ 0.0294075 0.65676796 -1.53899395 -1.46057391 -0.71558321
+ 0.61755812 1.36254871 0.18624771 1.36254871 -0.48032296
+ -0.71558321 -0.59795308 -1.30373383 1.28412855 -0.63716316
+ 0.18624771 0.30387771 0.06861746 -1.97030437 1.91148913",
+ rows=1, cols=N*C*Hin*Win)
+ out = matrix(out, rows=1, cols=N*C*Hin*Win)
+ for (i in 1:length(out)) {
+ rel_error = test_util::check_rel_error(as.scalar(out[1,i]),
+ as.scalar(target[1,i]), 1e-3, 1e-4)
+ }
+}
+
+tanh = function() {
+ /*
+ * Test for the `tanh` forward function.
+ */
+ print("Testing the tanh forward function.")
+
+ # Generate data
+ N = 2 # num examples
+ C = 3 # num channels
+ X = rand(rows=N, cols=C, pdf="normal")
+
+ out = tanh::forward(X)
+ out_ref = (exp(X) - exp(-X)) / (exp(X) + exp(-X))
+
+ # Equivalency check
+ for (i in 1:nrow(out)) {
+ for (j in 1:ncol(out)) {
+ rel_error = test_util::check_rel_error(as.scalar(out[i,j]), as.scalar(out_ref[i,j]),
+ 1e-10, 1e-12)
+ }
+ }
+}
+
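The test functions above return nothing and report problems by printing "ERROR"/"WARNING" lines, so a driver only needs to source this file and invoke them in turn. A minimal, illustrative driver (the library's actual test runner is not part of this excerpt):

```
source("nn/test/test.dml") as test

print("Running tests.")
test::batch_norm1d()
test::conv2d()
test::cross_entropy_loss()
test::im2col()
test::padding()
test::max_pool2d()
test::batch_norm2d()
test::tanh()
print("Tests complete -- grep the output for ERROR or WARNING.")
```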
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/util.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/test/util.dml b/scripts/nn/test/util.dml
new file mode 100644
index 0000000..e32a885
--- /dev/null
+++ b/scripts/nn/test/util.dml
@@ -0,0 +1,155 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Test utility functions.
+ */
+
+all_equal = function(matrix[double] X1, matrix[double] X2)
+ return(boolean equivalent) {
+ /*
+ * Determine if two matrices are equivalent.
+ *
+ * Inputs:
+ * - X1: Inputs, of shape (any, any).
+ * - X2: Inputs, of same shape as X1.
+ *
+ * Outputs:
+ * - equivalent: Whether or not the two matrices are equivalent.
+ */
+ equivalent = as.logical(prod(X1 == X2))
+}
+
+check_all_equal = function(matrix[double] X1, matrix[double] X2)
+ return(boolean equivalent) {
+ /*
+ * Check if two matrices are equivalent, and report any issues.
+ *
+ * Issues an "ERROR" statement if elements of the two matrices are
+ * not equal.
+ *
+ * Inputs:
+ * - X1: Inputs, of shape (any, any).
+ * - X2: Inputs, of same shape as X1.
+ *
+ * Outputs:
+ * - equivalent: Whether or not the two matrices are equivalent.
+ */
+ # Determine if matrices are equivalent
+ equivalent = all_equal(X1, X2)
+
+ # Evaluate relative error
+ if (!equivalent) {
+ print("ERROR: The two matrices are not equivalent.")
+ }
+}
+
+compute_rel_error = function(double x1, double x2)
+ return (double rel_error) {
+ /*
+ * Relative error measure between two values.
+ *
+ * Uses smoothing to avoid divide-by-zero errors.
+ *
+ * Inputs:
+ * - x1: First value.
+ * - x2: Second value.
+ *
+ * Outputs:
+ * - rel_error: Relative error measure between the two values.
+ */
+ rel_error = abs(x1-x2) / max(1e-8, abs(x1)+abs(x2))
+}
+
+check_rel_error = function(double x1, double x2, double thresh_error, double thresh_warn)
+ return (double rel_error) {
+ /*
+ * Check and report any issues with the relative error measure between
+ * two values.
+ *
+ * Issues an "ERROR" statement for relative errors > thresh_error,
+ * indicating that the implementation is likely incorrect.
+ *
+ * Issues a "WARNING" statement for relative errors < thresh_error
+ * but > thresh_warn, indicating that the implementation may be
+ * incorrect.
+ *
+ * Inputs:
+ * - x1: First value.
+ * - x2: Second value.
+ * - thresh_error: Error threshold.
+ * - thresh_warn: Warning threshold.
+ *
+ * Outputs:
+ * - rel_error: Relative error measure between the two values.
+ */
+ # Compute relative error
+ rel_error = compute_rel_error(x1, x2)
+
+ # Evaluate relative error
+ if (rel_error > thresh_error) {
+ print("ERROR: Relative error " + rel_error + " > " + thresh_error + " with " + x1 +
+ " vs " + x2 + ".")
+ }
+ else if (rel_error > thresh_warn & rel_error <= thresh_error) {
+ print("WARNING: Relative error " + rel_error + " > " + thresh_warn + " & <= " + thresh_error +
+ " with " + x1 + " vs " + x2 + ".")
+ }
+}
+
+check_rel_grad_error = function(double dw_a, double dw_n, double lossph, double lossmh)
+ return (double rel_error) {
+ /*
+ * Check and report any issues with the relative error measure between
+ * the analytical and numerical partial derivatives.
+ *
+ * - Issues an "ERROR" statement for relative errors > 1e-2,
+ * indicating that the gradient is likely incorrect.
+ * - Issues a "WARNING" statement for relative errors < 1e-2
+ * but > 1e-4, indicating that the gradient may be incorrect.
+ *
+ * Inputs:
+ * - dw_a: Analytical partial derivative wrt w.
+ * - dw_n: Numerical partial derivative wrt w.
+ * - lossph: Loss evaluated with w set to w+h.
+ * - lossmh: Loss evaluated with w set to w-h.
+ *
+ * Outputs:
+ * - rel_error: Relative error measure between the two derivatives.
+ */
+ # Compute relative error
+ rel_error = compute_rel_error(dw_a, dw_n)
+
+ # Evaluate relative error
+ thresh_error = 1e-2
+ thresh_warn = 1e-4
+ if (rel_error > thresh_error) {
+ print("ERROR: Relative error " + rel_error + " > " + thresh_error + " with " + dw_a +
+ " analytical vs " + dw_n + " numerical, with lossph " + lossph +
+ " and lossmh " + lossmh)
+ }
+ else if (rel_error > thresh_warn & rel_error <= thresh_error) {
+ print("WARNING: Relative error " + rel_error + " > " + thresh_warn + " & <= " + thresh_error +
+ " with " + dw_a + " analytical vs " + dw_n + " numerical, with lossph " + lossph +
+ " and lossmh " + lossmh)
+ }
+}
+
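As a worked example of the measure above: for x1 = 1.0 and x2 = 1.001, rel_error = |1.0 - 1.001| / max(1e-8, |1.0| + |1.001|) = 0.001 / 2.001 ≈ 5.0e-4, which falls below a 1e-3 error threshold but above a 1e-4 warning threshold. A minimal sketch with these hypothetical values:

```
source("nn/test/util.dml") as test_util

# rel_error ~= 5.0e-4, so this prints a WARNING but not an ERROR
rel_error = test_util::check_rel_error(1.0, 1.001, 1e-3, 1e-4)
print("relative error: " + rel_error)
```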
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/util.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/util.dml b/scripts/nn/util.dml
new file mode 100644
index 0000000..3a73f08
--- /dev/null
+++ b/scripts/nn/util.dml
@@ -0,0 +1,202 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Utility functions.
+ */
+
+channel_sums = function(matrix[double] X, int C, int Hin, int Win)
+ return (matrix[double] out) {
+ /*
+ * Computes a channel-wise summation over a 4D input.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ *
+ * Outputs:
+ * - out: Outputs, of shape (C, 1).
+ */
+ # Here we sum each column, reshape to (C, Hin*Win), and sum each row to result in the summation
+ # for each channel.
+ out = rowSums(matrix(colSums(X), rows=C, cols=Hin*Win)) # shape (C, 1)
+}
+
+im2col = function(matrix[double] img, int Hin, int Win, int Hf, int Wf, int strideh, int stridew)
+ return (matrix[double] img_cols) {
+ /*
+ * Rearrange local image regions (patches) into columns.
+ *
+ * Assumes image has already been padded as necessary.
+ *
+ * Inputs:
+ * - img: Input image, of shape (C, Hin*Win), where C is the number
+ * of input channels (depth).
+ * - Hin: Input height, including padding.
+ * - Win: Input width, including padding.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ *
+ * Outputs:
+ * - img_cols: Local spatial regions (patches) of the image stretched
+ * out into columns, of shape (C*Hf*Wf, Hout*Wout).
+ */
+ C = nrow(img)
+ Hout = as.integer(floor((Hin-Hf)/strideh + 1))
+ Wout = as.integer(floor((Win-Wf)/stridew + 1))
+
+ # Note: We start with `img_cols` transposed to allow for row-major
+ # left-indexing inside the loop, which is more performant.
+ img_cols = matrix(0, rows=Hout*Wout, cols=C*Hf*Wf) # zeros
+ parfor (hout in 1:Hout, check=0) { # all output rows
+ hin = (hout-1)*strideh + 1
+ parfor (wout in 1:Wout, check=0) { # all output columns
+ win = (wout-1)*stridew + 1
+ # Extract a local patch of the input image corresponding spatially to the filter sizes.
+ img_patch = matrix(0, rows=C, cols=Hf*Wf) # zeros
+ parfor (c in 1:C) { # all channels
+ img_slice = matrix(img[c,], rows=Hin, cols=Win) # reshape
+ img_patch[c,] = matrix(img_slice[hin:hin+Hf-1, win:win+Wf-1], rows=1, cols=Hf*Wf)
+ }
+ img_cols[(hout-1)*Wout + wout,] = t(matrix(img_patch, rows=C*Hf*Wf, cols=1)) # reshape
+ }
+ }
+ img_cols = t(img_cols)
+}
+
+col2im = function(matrix[double] img_cols, int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, string reduction)
+ return (matrix[double] img) {
+ /*
+ * Create an image from columns of local image regions (patches).
+ *
+ * The reduction strategy determines how to deal with overlapping
+ * patches. If it is set to "add", any overlapping patches will be
+ * added together when creating the image. This is useful when
+ * computing gradients on the original image given gradients on the
+ * patches. Otherwise, if "none" is provided, any overlapping
+ * patches will just override previous ones when creating the image.
+ * This is useful when recreating an image from the output of
+ * `im2col`.
+ *
+ * Assumes original image was already padded as necessary.
+ *
+ * Inputs:
+ * - img_cols: Local spatial regions (patches) of the image stretched
+ * out into columns, of shape (C*Hf*Wf, Hout*Wout).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height, including padding.
+ * - Win: Input width, including padding.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - reduction: The reduction strategy to use for overlapping
+ * patches. Valid options are "add" and "none".
+ *
+ * Outputs:
+ * - img: Input image, of shape (C, Hin*Win).
+ */
+ Hout = as.integer(floor((Hin-Hf)/strideh + 1))
+ Wout = as.integer(floor((Win-Wf)/stridew + 1))
+
+ img = matrix(0, rows=C, cols=Hin*Win) # zeros
+ for (hout in 1:Hout) { # all output rows
+ hin = (hout-1)*strideh + 1
+ for (wout in 1:Wout) { # all output columns
+ win = (wout-1)*stridew + 1
+ # Extract a local patch of the input image corresponding spatially to the filter sizes.
+ img_patch = matrix(img_cols[,(hout-1)*Wout + wout], rows=C, cols=Hf*Wf) # reshape column into (C, Hf*Wf) patch
+ parfor (c in 1:C) { # all channels
+ img_patch_slice = matrix(img_patch[c,], rows=Hf, cols=Wf) # reshape
+ if (reduction == "add") {
+ img_slice = matrix(0, rows=Hin, cols=Win)
+ img_slice[hin:hin+Hf-1, win:win+Wf-1] = img_patch_slice
+ img[c,] = img[c,] + matrix(img_slice, rows=1, cols=Hin*Win)
+ } else {
+ img_slice = matrix(img[c,], rows=Hin, cols=Win)
+ img_slice[hin:hin+Hf-1, win:win+Wf-1] = img_patch_slice
+ img[c,] = matrix(img_slice, rows=1, cols=Hin*Win)
+ }
+ }
+ }
+ }
+}
+
+pad_image = function(matrix[double] img, int Hin, int Win, int padh, int padw, double pad_value)
+ return (matrix[double] img_padded) {
+ /*
+ * Pads an image along the height and width dimensions with `pad_value`.
+ *
+ * Inputs:
+ * - img: Input image, of shape (C, Hin*Win), where C is the number
+ * of input channels (depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - padh: Padding for top and bottom sides.
+ * - padw: Padding for left and right sides.
+ * - pad_value: Value to use for the padding.
+ * A typical value is 0.
+ *
+ * Outputs:
+ * - img_padded: The input image padded along the height and width
+ * dimensions, of shape (C, (Hin+2*padh)*(Win+2*padw)).
+ */
+ C = nrow(img)
+ img_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # zeros
+ parfor (c in 1:C) {
+ img_slice = matrix(img[c,], rows=Hin, cols=Win) # depth slice C reshaped
+ img_padded_slice = matrix(pad_value, rows=Hin+2*padh, cols=Win+2*padw)
+ img_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = img_slice
+ img_padded[c,] = matrix(img_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape
+ }
+}
+
+unpad_image = function(matrix[double] img_padded, int Hin, int Win, int padh, int padw)
+ return (matrix[double] img) {
+ /*
+ * Unpads an image along the height and width dimensions.
+ *
+ * Inputs:
+ * - img_padded: The input image padded along the height and width
+ * dimensions, of shape (C, (Hin+2*padh)*(Win+2*padw)).
+ * - Hin: Input height of unpadded image.
+ * - Win: Input width of unpadded image.
+ * - padh: Padding for top and bottom sides.
+ * - padw: Padding for left and right sides.
+ *
+ * Outputs:
+ * - img: Input image, of shape (C, Hin*Win), where C is the number
+ * of input channels (depth).
+ */
+ C = nrow(img_padded)
+ img = matrix(0, rows=C, cols=Hin*Win)
+ parfor (c in 1:C) {
+ img_padded_slice = matrix(img_padded[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
+ img_slice = img_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
+ img[c,] = matrix(img_slice, rows=1, cols=Hin*Win)
+ }
+}
+
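Condensing the `im2col` test from earlier in this commit, the intended composition of these utilities is a pad -> im2col -> col2im round trip. With the "none" reduction the padded image is recovered exactly; the "add" reduction would instead sum overlapping patches, which is the behavior needed when accumulating gradients wrt the original image. The sketch below reuses the shapes from that test:

```
source("nn/util.dml") as util

C = 3       # num channels
Hin = 5     # input height
Win = 5     # input width
Hf = 3      # filter height
Wf = 3      # filter width
stride = 2
pad = (Hin*stride - Hin + Hf - stride) / 2  # "same" padding
x = rand(rows=C, cols=Hin*Win)

# pad -> im2col -> col2im ("none" reduction) recovers the padded image
x_pad = util::pad_image(x, Hin, Win, pad, pad, 0)
x_cols = util::im2col(x_pad, Hin+2*pad, Win+2*pad, Hf, Wf, stride, stride)
x_pad2 = util::col2im(x_cols, C, Hin+2*pad, Win+2*pad, Hf, Wf, stride, stride, "none")
```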
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/README.md
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/README.md b/scripts/staging/SystemML-NN/README.md
deleted file mode 100644
index b80f2c6..0000000
--- a/scripts/staging/SystemML-NN/README.md
+++ /dev/null
@@ -1,183 +0,0 @@
-<!--
-{% comment %}
-Licensed to the Apache Software Foundation (ASF) under one or more
-contributor license agreements. See the NOTICE file distributed with
-this work for additional information regarding copyright ownership.
-The ASF licenses this file to you under the Apache License, Version 2.0
-(the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-{% endcomment %}
--->
-
-# SystemML-NN
-
-### A deep learning library for [Apache SystemML](https://github.com/apache/incubator-systemml).
-
-## Examples:
-#### Please see the [`examples`](nn/examples) folder for more detailed examples, or view the following two quick examples.
-### Neural net for regression with vanilla SGD:
-```python
-# Imports
-source("nn/layers/affine.dml") as affine
-source("nn/layers/l2_loss.dml") as l2_loss
-source("nn/layers/relu.dml") as relu
-source("nn/optim/sgd.dml") as sgd
-
-# Generate input data
-N = 1024 # num examples
-D = 100 # num features
-t = 1 # num targets
-X = rand(rows=N, cols=D, pdf="normal")
-y = rand(rows=N, cols=t)
-
-# Create 2-layer network:
-## affine1 -> relu1 -> affine2
-M = 64 # number of neurons
-[W1, b1] = affine::init(D, M)
-[W2, b2] = affine::init(M, t)
-
-# Initialize optimizer
-lr = 0.05 # learning rate
-mu = 0.9 # momentum
-decay = 0.99 # learning rate decay constant
-
-# Optimize
-print("Starting optimization")
-batch_size = 32
-epochs = 5
-iters = 1024 / batch_size
-for (e in 1:epochs) {
- for(i in 1:iters) {
- # Get next batch
- X_batch = X[i:i+batch_size-1,]
- y_batch = y[i:i+batch_size-1,]
-
- # Compute forward pass
- out1 = affine::forward(X_batch, W1, b1)
- outr1 = relu::forward(out1)
- out2 = affine::forward(outr1, W2, b2)
-
- # Compute loss
- loss = l2_loss::forward(out2, y_batch)
- print("L2 loss: " + loss)
-
- # Compute backward pass
- dout2 = l2_loss::backward(out2, y_batch)
- [doutr1, dW2, db2] = affine::backward(dout2, outr1, W2, b2)
- dout1 = relu::backward(doutr1, out1)
- [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1)
-
- # Optimize with vanilla SGD
- W1 = sgd::update(W1, dW1, lr)
- b1 = sgd::update(b1, db1, lr)
- W2 = sgd::update(W2, dW2, lr)
- b2 = sgd::update(b2, db2, lr)
- }
- # Decay learning rate
- lr = lr * decay
-}
-```
-
-### Neural net for multi-class classification with dropout and SGD w/ Nesterov momentum:
-```python
-# Imports
-source("nn/layers/affine.dml") as affine
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/dropout.dml") as dropout
-source("nn/layers/relu.dml") as relu
-source("nn/layers/softmax.dml") as softmax
-source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
-
-# Generate input data
-N = 1024 # num examples
-D = 100 # num features
-t = 5 # num targets
-X = rand(rows=N, cols=D, pdf="normal")
-classes = round(rand(rows=N, cols=1, min=1, max=t, pdf="uniform"))
-y = matrix(0, rows=N, cols=t)
-parfor (i in 1:N) {
- y[i, as.scalar(classes[i,1])] = 1 # one-hot encoding
-}
-
-# Create network:
-# affine1 -> relu1 -> dropout1 -> affine2 -> relu2 -> dropout2 -> affine3 -> softmax
-H1 = 64 # number of neurons in 1st hidden layer
-H2 = 64 # number of neurons in 2nd hidden layer
-p = 0.5 # dropout probability
-[W1, b1] = affine::init(D, H1)
-[W2, b2] = affine::init(H1, H2)
-[W3, b3] = affine::init(H2, t)
-
-# Initialize SGD w/ Nesterov momentum optimizer
-lr = 0.05 # learning rate
-mu = 0.5 # momentum
-decay = 0.99 # learning rate decay constant
-vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
-vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
-vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
-
-# Optimize
-print("Starting optimization")
-batch_size = 64
-epochs = 10
-iters = 1024 / batch_size
-for (e in 1:epochs) {
- for(i in 1:iters) {
- # Get next batch
- X_batch = X[i:i+batch_size-1,]
- y_batch = y[i:i+batch_size-1,]
-
- # Compute forward pass
- ## layer 1:
- out1 = affine::forward(X_batch, W1, b1)
- outr1 = relu::forward(out1)
- [outd1, maskd1] = dropout::forward(outr1, p, -1)
- ## layer 2:
- out2 = affine::forward(outd1, W2, b2)
- outr2 = relu::forward(out2)
- [outd2, maskd2] = dropout::forward(outr2, p, -1)
- ## layer 3:
- out3 = affine::forward(outd2, W3, b3)
- probs = softmax::forward(out3)
-
- # Compute loss
- loss = cross_entropy_loss::forward(probs, y_batch)
- print("Cross entropy loss: " + loss)
-
- # Compute backward pass
- ## loss:
- dprobs = cross_entropy_loss::backward(probs, y_batch)
- ## layer 3:
- dout3 = softmax::backward(dprobs, out3)
- [doutd2, dW3, db3] = affine::backward(dout3, outd2, W3, b3)
- ## layer 2:
- doutr2 = dropout::backward(doutd2, outr2, p, maskd2)
- dout2 = relu::backward(doutr2, out2)
- [doutd1, dW2, db2] = affine::backward(dout2, outd1, W2, b2)
- ## layer 1:
- doutr1 = dropout::backward(doutd1, outr1, p, maskd1)
- dout1 = relu::backward(doutr1, out1)
- [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1)
-
- # Optimize with SGD w/ Nesterov momentum
- [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1)
- [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1)
- [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2)
- [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2)
- [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3)
- [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3)
- }
- # Anneal momentum towards 0.999
- mu = mu + (0.999 - mu)/(1+epochs-e)
- # Decay learning rate
- lr = lr * decay
-}
-```
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/Example - MNIST LeNet.ipynb
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/Example - MNIST LeNet.ipynb b/scripts/staging/SystemML-NN/nn/examples/Example - MNIST LeNet.ipynb
deleted file mode 100644
index 0423269..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/Example - MNIST LeNet.ipynb
+++ /dev/null
@@ -1,189 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Quick Setup"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create a SystemML MLContext object\n",
- "from systemml import MLContext, dml\n",
- "ml = MLContext(sc)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Download Data - MNIST"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9]. Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%sh\n",
- "mkdir -p data/mnist/\n",
- "cd data/mnist/\n",
- "curl -O https://pjreddie.com/media/files/mnist_train.csv\n",
- "curl -O https://pjreddie.com/media/files/mnist_test.csv"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## SystemML \"LeNet\" Neural Network"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 1. Train"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "script_string = \"\"\"\n",
- "source(\"nn/examples/mnist_lenet.dml\") as mnist_lenet\n",
- "\n",
- "# Read training data\n",
- "data = read($data, format=\"csv\")\n",
- "n = nrow(data)\n",
- "\n",
- "# Extract images and labels\n",
- "images = data[,2:ncol(data)]\n",
- "labels = data[,1]\n",
- "\n",
- "# Scale images to [-1,1], and one-hot encode the labels\n",
- "images = (images / 255.0) * 2 - 1\n",
- "labels = table(seq(1, n), labels+1, n, 10)\n",
- "\n",
- "# Split into training (55,000 examples) and validation (5,000 examples)\n",
- "X = images[5001:nrow(images),]\n",
- "X_val = images[1:5000,]\n",
- "y = labels[5001:nrow(images),]\n",
- "y_val = labels[1:5000,]\n",
- "\n",
- "# Train\n",
- "epochs = 10\n",
- "[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, epochs)\n",
- "\"\"\"\n",
- "script = (dml(script_string).input(\"$data\", \"data/mnist/mnist_train.csv\")\n",
- " .input(C=1, Hin=28, Win=28)\n",
- " .output(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))\n",
- "W1, b1, W2, b2, W3, b3, W4, b4 = (ml.execute(script)\n",
- " .get(\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 2. Compute Test Accuracy"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "script_string = \"\"\"\n",
- "source(\"nn/examples/mnist_lenet.dml\") as mnist_lenet\n",
- "\n",
- "# Read test data\n",
- "data = read($data, format=\"csv\")\n",
- "n = nrow(data)\n",
- "\n",
- "# Extract images and labels\n",
- "X_test = data[,2:ncol(data)]\n",
- "y_test = data[,1]\n",
- "\n",
- "# Scale images to [-1,1], and one-hot encode the labels\n",
- "X_test = (X_test / 255.0) * 2 - 1\n",
- "y_test = table(seq(1, n), y_test+1, n, 10)\n",
- "\n",
- "# Eval on test set\n",
- "probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)\n",
- "[loss, accuracy] = mnist_lenet::eval(probs, y_test)\n",
- "\n",
- "print(\"Test Accuracy: \" + accuracy)\n",
- "\"\"\"\n",
- "script = dml(script_string).input(**{\"$data\": \"data/mnist/mnist_train.csv\",\n",
- " \"C\": 1, \"Hin\": 28, \"Win\": 28,\n",
- " \"W1\": W1, \"b1\": b1,\n",
- " \"W2\": W2, \"b2\": b2,\n",
- " \"W3\": W3, \"b3\": b3,\n",
- " \"W4\": W4, \"b4\": b4})\n",
- "ml.execute(script)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 3. Extract Model Into Spark DataFrames For Future Use"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "W1_df = W1.toDF()\n",
- "b1_df = b1.toDF()\n",
- "W2_df = W2.toDF()\n",
- "b2_df = b2.toDF()\n",
- "W3_df = W3.toDF()\n",
- "b3_df = b3.toDF()\n",
- "W4_df = W4.toDF()\n",
- "b4_df = b4.toDF()\n",
- "W1_df, b1_df, W2_df, b2_df, W3_df, b3_df, W4_df, b4_df"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 + Spark 2.x + SystemML",
- "language": "python",
- "name": "pyspark3_2.x"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.1"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 1
-}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/Example - MNIST Softmax Classifier.ipynb
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/Example - MNIST Softmax Classifier.ipynb b/scripts/staging/SystemML-NN/nn/examples/Example - MNIST Softmax Classifier.ipynb
deleted file mode 100644
index 5e7182a..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/Example - MNIST Softmax Classifier.ipynb
+++ /dev/null
@@ -1,179 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Quick Setup"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": false
- },
- "outputs": [],
- "source": [
- "# Create a SystemML MLContext object\n",
- "from systemml import MLContext, dml\n",
- "ml = MLContext(sc)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Download Data - MNIST"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9]. Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "%%sh\n",
- "mkdir -p data/mnist/\n",
- "cd data/mnist/\n",
- "curl -O https://pjreddie.com/media/files/mnist_train.csv\n",
- "curl -O https://pjreddie.com/media/files/mnist_test.csv"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## SystemML Softmax Model"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 1. Train"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "training = \"\"\"\n",
- "source(\"nn/examples/mnist_softmax.dml\") as mnist_softmax\n",
- "\n",
- "# Read training data\n",
- "data = read($data, format=\"csv\")\n",
- "n = nrow(data)\n",
- "\n",
- "# Extract images and labels\n",
- "images = data[,2:ncol(data)]\n",
- "labels = data[,1]\n",
- "\n",
- "# Scale images to [0,1], and one-hot encode the labels\n",
- "images = images / 255.0\n",
- "labels = table(seq(1, n), labels+1, n, 10)\n",
- "\n",
- "# Split into training (55,000 examples) and validation (5,000 examples)\n",
- "X = images[5001:nrow(images),]\n",
- "X_val = images[1:5000,]\n",
- "y = labels[5001:nrow(images),]\n",
- "y_val = labels[1:5000,]\n",
- "\n",
- "# Train\n",
- "epochs = 1\n",
- "[W, b] = mnist_softmax::train(X, y, X_val, y_val, epochs)\n",
- "\"\"\"\n",
- "script = dml(training).input(\"$data\", \"data/mnist/mnist_train.csv\").output(\"W\", \"b\")\n",
- "W, b = ml.execute(script).get(\"W\", \"b\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 2. Compute Test Accuracy"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "testing = \"\"\"\n",
- "source(\"nn/examples/mnist_softmax.dml\") as mnist_softmax\n",
- "\n",
- "# Read test data\n",
- "data = read($data, format=\"csv\")\n",
- "n = nrow(data)\n",
- "\n",
- "# Extract images and labels\n",
- "X_test = data[,2:ncol(data)]\n",
- "y_test = data[,1]\n",
- "\n",
- "# Scale images to [0,1], and one-hot encode the labels\n",
- "X_test = X_test / 255.0\n",
- "y_test = table(seq(1, n), y_test+1, n, 10)\n",
- "\n",
- "# Eval on test set\n",
- "probs = mnist_softmax::predict(X_test, W, b)\n",
- "[loss, accuracy] = mnist_softmax::eval(probs, y_test)\n",
- "\n",
- "print(\"Test Accuracy: \" + accuracy)\n",
- "\"\"\"\n",
- "script = dml(testing).input(\"$data\", \"data/mnist/mnist_test.csv\", W=W, b=b)\n",
- "ml.execute(script)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 3. Extract Model Into Spark DataFrames For Future Use"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "W_df = W.toDF()\n",
- "b_df = b.toDF()\n",
- "W_df, b_df"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.1"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 1
-}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/README.md
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/README.md b/scripts/staging/SystemML-NN/nn/examples/README.md
deleted file mode 100644
index d5e9d04..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/README.md
+++ /dev/null
@@ -1,74 +0,0 @@
-<!--
-{% comment %}
-Licensed to the Apache Software Foundation (ASF) under one or more
-contributor license agreements. See the NOTICE file distributed with
-this work for additional information regarding copyright ownership.
-The ASF licenses this file to you under the Apache License, Version 2.0
-(the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-{% endcomment %}
--->
-
-# SystemML-NN Examples
-
-#### This folder contains scripts and PySpark Jupyter notebooks serving as examples of using the *SystemML-NN* (`nn`) deep learning library.
-
----
-
-# Examples
-### MNIST Softmax Classifier
-
-* This example trains a softmax classifier, which is essentially a multi-class logistic regression model, on the MNIST data. The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images.
-* Notebook: `Example - MNIST Softmax Classifier.ipynb`.
-* DML Functions: `mnist_softmax.dml`
-* Training script: `mnist_softmax-train.dml`
-* Prediction script: `mnist_softmax-predict.dml`
-
-### MNIST "LeNet" Neural Net
-
-* This example trains a neural network on the MNIST data using a ["LeNet" architecture](http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf). The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images.
-* Notebook: `Example - MNIST LeNet.ipynb`.
-* DML Functions: `mnist_lenet.dml`
-* Training script: `mnist_lenet-train.dml`
-* Prediction script: `mnist_lenet-predict.dml`
-
----
-
-# Setup
-## Code
-* To run the examples, please first download and unzip the project via GitHub using the "Clone or download" button on the [homepage of the project](https://github.com/dusenberrymw/systemml-nn), *or* via the following commands:
-
- ```
- git clone https://github.com/dusenberrymw/systemml-nn.git
- ```
-
-* Then, move into the `systemml-nn` folder via:
- ```
- cd systemml-nn
- ```
-
-## Data
-* These examples use the classic [MNIST](http://yann.lecun.com/exdb/mnist/) dataset, which contains labeled 28x28 pixel images of handwritten digits in the range of 0-9. There are 60,000 training images, and 10,000 testing images. Of the 60,000 training images, 5,000 will be used as validation images.
-* **Download**:
- * **Notebooks**: The data will be automatically downloaded as a step in either of the example notebooks.
- * **Training scripts**: Please run `get_mnist_data.sh` to download the data separately.
-
-## Execution
-* These examples contain scripts written in SystemML's R-like language (`*.dml`), as well as PySpark Jupyter notebooks (`*.ipynb`). The scripts contain the math for the algorithms, enclosed in functions, and the notebooks serve as full, end-to-end examples of reading in data, training models using the functions within the scripts, and evaluating final performance.
-* **Notebooks**: To run the notebook examples, please install the SystemML Python package with `pip install systemml`, and then startup Jupyter in the following manner from this directory (or for more information, please see [this great blog post](http://spark.tc/0-to-life-changing-application-with-apache-systemml/)):
-
- ```
- PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook" pyspark --master local[*] --driver-memory 3G --driver-class-path SystemML.jar --jars SystemML.jar
- ```
-
- Note that all printed output, such as training statistics, from the SystemML scripts will be sent to the terminal in which Jupyter was started (for now...).
-
-* **Scripts**: To run the scripts from the command line using `spark-submit`, please see the comments located at the top of the `-train` and `-predict` scripts.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/get_mnist_data.sh
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/get_mnist_data.sh b/scripts/staging/SystemML-NN/nn/examples/get_mnist_data.sh
deleted file mode 100755
index deb0c40..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/get_mnist_data.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-DIR="$(cd "$(dirname "$0")" && pwd)"
-mkdir -p $DIR/data/mnist/
-cd $DIR/data/mnist/
-curl -O https://pjreddie.com/media/files/mnist_train.csv
-curl -O https://pjreddie.com/media/files/mnist_test.csv
-
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-predict.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-predict.dml
deleted file mode 100644
index 85a5307..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-predict.dml
+++ /dev/null
@@ -1,91 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# MNIST LeNet - Predict
-#
-# This script computes the class probability predictions of a
-# trained convolutional net using the "LeNet" architecture on
-# images of handwritten digits.
-#
-# Inputs:
-# - X: File containing training images.
-# The format is "pixel_1, pixel_2, ..., pixel_n".
-# - C: Number of color channels in the images.
-# - Hin: Input image height.
-# - Win: Input image width.
-# - model_dir: Directory containing the trained weights and biases
-# of the model.
-# - out_dir: Directory to store class probability predictions for
-# each image.
-# - fmt: [DEFAULT: "csv"] File format of `X` and output predictions.
-# Options include: "csv", "mm", "text", and "binary".
-#
-# Outputs:
-# - probs: File containing class probability predictions for each
-# image.
-#
-# Data:
-# The X file should contain images of handwritten digits,
-# where each example is a 28x28 pixel image of grayscale values in
-# the range [0,255] stretched out as 784 pixels.
-#
-# Sample Invocation (running from outside the `nn` folder):
-# 1. Download images.
-#
-# For example, save images to `nn/examples/data/mnist/images.csv`.
-#
-# 2. Execute using Spark
-# ```
-# spark-submit --master local[*] --driver-memory 5G
-# --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
-# $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_lenet-predict.dml
-# -nvargs X=nn/examples/data/mnist/images.csv C=1 Hin=28 Win=28
-# model_dir=nn/examples/model/mnist_lenet out_dir=nn/examples/data/mnist
-# ```
-#
-source("nn/examples/mnist_lenet.dml") as mnist_lenet
-
-# Read training data
-fmt = ifdef($fmt, "csv")
-X = read($X, format=fmt)
-C = $C
-Hin = $Hin
-Win = $Win
-
-# Scale images to [-1,1]
-X = (X / 255.0) * 2 - 1
-
-# Read model coefficients
-W1 = read($model_dir+"/W1")
-b1 = read($model_dir+"/b1")
-W2 = read($model_dir+"/W2")
-b2 = read($model_dir+"/b2")
-W3 = read($model_dir+"/W3")
-b3 = read($model_dir+"/b3")
-W4 = read($model_dir+"/W4")
-b4 = read($model_dir+"/b4")
-
-# Predict classes
-probs = mnist_lenet::predict(X, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
-
-# Output results
-write(probs, $out_dir+"/probs."+fmt, format=fmt)
-
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-train.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-train.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-train.dml
deleted file mode 100644
index 0fc733e..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet-train.dml
+++ /dev/null
@@ -1,123 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# MNIST LeNet - Train
-#
-# This script trains a convolutional net using the "LeNet" architecture
-# on images of handwritten digits.
-#
-# Inputs:
-# - train: File containing labeled MNIST training images.
-# The format is "label, pixel_1, pixel_2, ..., pixel_n".
-# - test: File containing labeled MNIST test images.
-# The format is "label, pixel_1, pixel_2, ..., pixel_n".
-# - C: Number of color channels in the images.
-# - Hin: Input image height.
-# - Win: Input image width.
-# - epochs: [DEFAULT: 10] Total number of full training loops over
-# the full data set.
-# - out_dir: [DEFAULT: "."] Directory to store weights and bias
-# matrices of trained model, as well as final test accuracy.
-# - fmt: [DEFAULT: "csv"] File format of `train` and `test` data.
-# Options include: "csv", "mm", "text", and "binary".
-#
-# Outputs:
-# - W1, W2, W3, W4: Files containing the trained weights of the model.
-# - b1, b2, b3, b4: Files containing the trained biases of the model.
-# - accuracy: File containing the final accuracy on the test data.
-#
-# Data:
-# The MNIST dataset contains labeled images of handwritten digits,
-# where each example is a 28x28 pixel image of grayscale values in
-# the range [0,255] stretched out as 784 pixels, and each label is
-# one of 10 possible digits in [0,9].
-#
-# Sample Invocation (running from outside the `nn` folder):
-# 1. Download data (60,000 training examples, and 10,000 test examples)
-# ```
-# nn/examples/get_mnist_data.sh
-# ```
-#
-# 2. Execute using Spark
-# ```
-# spark-submit --master local[*] --driver-memory 10G
-# --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
-# $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_lenet-train.dml
-# -nvargs train=nn/examples/data/mnist/mnist_train.csv test=nn/examples/data/mnist/mnist_test.csv
-# C=1 Hin=28 Win=28 epochs=10 out_dir=nn/examples/model/mnist_lenet
-# ```
-#
-source("nn/examples/mnist_lenet.dml") as mnist_lenet
-
-# Read training data & settings
-fmt = ifdef($fmt, "csv")
-train = read($train, format=fmt)
-test = read($test, format=fmt)
-C = $C
-Hin = $Hin
-Win = $Win
-epochs = ifdef($epochs, 10)
-out_dir = ifdef($out_dir, ".")
-
-# Extract images and labels
-images = train[,2:ncol(train)]
-labels = train[,1]
-X_test = test[,2:ncol(test)]
-y_test = test[,1]
-
-# Scale images to [-1,1], and one-hot encode the labels
-n = nrow(train)
-n_test = nrow(test)
-images = (images / 255.0) * 2 - 1
-labels = table(seq(1, n), labels+1, n, 10)
-X_test = (X_test / 255.0) * 2 - 1
-y_test = table(seq(1, n_test), y_test+1, n_test, 10)
-
-# Split into training (55,000 examples) and validation (5,000 examples)
-X = images[5001:nrow(images),]
-X_val = images[1:5000,]
-y = labels[5001:nrow(images),]
-y_val = labels[1:5000,]
-
-# Train
-[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, epochs)
-
-# Write model out
-write(W1, out_dir+"/W1")
-write(b1, out_dir+"/b1")
-write(W2, out_dir+"/W2")
-write(b2, out_dir+"/b2")
-write(W3, out_dir+"/W3")
-write(b3, out_dir+"/b3")
-write(W4, out_dir+"/W4")
-write(b4, out_dir+"/b4")
-
-# Eval on test set
-probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
-[loss, accuracy] = mnist_lenet::eval(probs, y_test)
-
-# Output results
-print("Test Accuracy: " + accuracy)
-write(accuracy, out_dir+"/accuracy")
-
-print("")
-print("")
-
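For reference, the one-hot encoding above relies on `table()` building a contingency matrix: row `i` receives a single 1 at column `label+1`, with the labels shifted from [0,9] to 1-based column indices. A minimal sketch with hypothetical labels:

```
# Hypothetical 3-example label vector with digits 2, 0, and 9
labels = matrix("2 0 9", rows=3, cols=1)
# Contingency matrix over (row index, label+1) pairs: a 3x10 one-hot matrix
onehot = table(seq(1, 3), labels+1, 3, 10)  # ones at columns 3, 1, and 10
```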
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/mnist_lenet.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_lenet.dml
deleted file mode 100644
index e5755c4..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/mnist_lenet.dml
+++ /dev/null
@@ -1,331 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * MNIST LeNet Example
- */
-# Imports
-source("nn/layers/affine.dml") as affine
-source("nn/layers/conv2d_builtin.dml") as conv2d
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/dropout.dml") as dropout
-source("nn/layers/l2_reg.dml") as l2_reg
-source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
-source("nn/layers/relu.dml") as relu
-source("nn/layers/softmax.dml") as softmax
-source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
-
-train = function(matrix[double] X, matrix[double] y,
- matrix[double] X_val, matrix[double] y_val,
- int C, int Hin, int Win, int epochs)
- return (matrix[double] W1, matrix[double] b1,
- matrix[double] W2, matrix[double] b2,
- matrix[double] W3, matrix[double] b3,
- matrix[double] W4, matrix[double] b4) {
- /*
- * Trains a convolutional net using the "LeNet" architecture.
- *
- * The input matrix, X, has N examples, each represented as a 3D
- * volume unrolled into a single vector. The targets, y, have K
- * classes, and are one-hot encoded.
- *
- * Inputs:
- * - X: Input data matrix, of shape (N, C*Hin*Win).
- * - y: Target matrix, of shape (N, K).
- * - X_val: Input validation data matrix, of shape (N, C*Hin*Win).
- * - y_val: Target validation matrix, of shape (N, K).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - epochs: Total number of full training loops over the full data set.
- *
- * Outputs:
- * - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
- * - b1: 1st layer biases vector, of shape (F1, 1).
- * - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
- * - b2: 2nd layer biases vector, of shape (F2, 1).
- * - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
- * - b3: 3rd layer biases vector, of shape (1, N3).
- * - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
- * - b4: 4th layer biases vector, of shape (1, K).
- */
- N = nrow(X)
- K = ncol(y)
-
- # Create network:
- # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
- Hf = 5 # filter height
- Wf = 5 # filter width
- stride = 1
- pad = 2 # For same dimensions, (Hf - stride) / 2
-
- F1 = 32 # num conv filters in conv1
- F2 = 64 # num conv filters in conv2
- N3 = 512 # num nodes in affine3
- # Note: affine4 has K nodes, which is equal to the number of target dimensions (num classes)
-
- [W1, b1] = conv2d::init(F1, C, Hf, Wf) # inputs: (N, C*Hin*Win)
- [W2, b2] = conv2d::init(F2, F1, Hf, Wf) # inputs: (N, F1*(Hin/2)*(Win/2))
- [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3) # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
- [W4, b4] = affine::init(N3, K) # inputs: (N, N3)
-  W4 = W4 / sqrt(2)  # different initialization scale, since this layer feeds into softmax instead of relu
-
- # Initialize SGD w/ Nesterov momentum optimizer
- lr = 0.01 # learning rate
-  mu = 0.9  # momentum
- decay = 0.95 # learning rate decay constant
- vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
- vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
- vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
- vW4 = sgd_nesterov::init(W4); vb4 = sgd_nesterov::init(b4)
-
- # Regularization
- lambda = 5e-04
-
- # Optimize
- print("Starting optimization")
- batch_size = 64
- iters = ceil(N / batch_size)
- for (e in 1:epochs) {
- for(i in 1:iters) {
- # Get next batch
- beg = ((i-1) * batch_size) %% N + 1
- end = min(N, beg + batch_size - 1)
- X_batch = X[beg:end,]
- y_batch = y[beg:end,]
-
- # Compute forward pass
- ## layer 1: conv1 -> relu1 -> pool1
- [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- outr1 = relu::forward(outc1)
- [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
- strideh=2, stridew=2, pad=0, pad=0)
- ## layer 2: conv2 -> relu2 -> pool2
- [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
- stride, stride, pad, pad)
- outr2 = relu::forward(outc2)
- [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
- strideh=2, stridew=2, pad=0, pad=0)
- ## layer 3: affine3 -> relu3 -> dropout
- outa3 = affine::forward(outp2, W3, b3)
- outr3 = relu::forward(outa3)
- [outd3, maskd3] = dropout::forward(outr3, 0.5, -1)
- ## layer 4: affine4 -> softmax
- outa4 = affine::forward(outd3, W4, b4)
- probs = softmax::forward(outa4)
-
- # Compute loss & accuracy for training & validation data every 100 iterations.
- if (i %% 100 == 0) {
- # Compute training loss & accuracy
- loss_data = cross_entropy_loss::forward(probs, y_batch)
- loss_reg_W1 = l2_reg::forward(W1, lambda)
- loss_reg_W2 = l2_reg::forward(W2, lambda)
- loss_reg_W3 = l2_reg::forward(W3, lambda)
- loss_reg_W4 = l2_reg::forward(W4, lambda)
- loss = loss_data + loss_reg_W1 + loss_reg_W2 + loss_reg_W3 + loss_reg_W4
- accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
-
- # Compute validation loss & accuracy
- probs_val = predict(X_val, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
- loss_val = cross_entropy_loss::forward(probs_val, y_val)
- accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
-
- # Output results
- print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
- + accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
- }
-
- # Compute data backward pass
- ## loss:
- dprobs = cross_entropy_loss::backward(probs, y_batch)
- ## layer 4: affine4 -> softmax
- douta4 = softmax::backward(dprobs, outa4)
- [doutd3, dW4, db4] = affine::backward(douta4, outr3, W4, b4)
- ## layer 3: affine3 -> relu3 -> dropout
- doutr3 = dropout::backward(doutd3, outr3, 0.5, maskd3)
- douta3 = relu::backward(doutr3, outa3)
- [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
- ## layer 2: conv2 -> relu2 -> pool2
- doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
- strideh=2, stridew=2, pad=0, pad=0)
- doutc2 = relu::backward(doutr2, outc2)
- [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
- Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
- ## layer 1: conv1 -> relu1 -> pool1
- doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
- strideh=2, stridew=2, pad=0, pad=0)
- doutc1 = relu::backward(doutr1, outc1)
- [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win,
- Hf, Wf, stride, stride, pad, pad)
-
- # Compute regularization backward pass
- dW1_reg = l2_reg::backward(W1, lambda)
- dW2_reg = l2_reg::backward(W2, lambda)
- dW3_reg = l2_reg::backward(W3, lambda)
- dW4_reg = l2_reg::backward(W4, lambda)
- dW1 = dW1 + dW1_reg
- dW2 = dW2 + dW2_reg
- dW3 = dW3 + dW3_reg
- dW4 = dW4 + dW4_reg
-
- # Optimize with SGD w/ Nesterov momentum
- [W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1)
- [b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1)
- [W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2)
- [b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2)
- [W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3)
- [b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3)
- [W4, vW4] = sgd_nesterov::update(W4, dW4, lr, mu, vW4)
- [b4, vb4] = sgd_nesterov::update(b4, db4, lr, mu, vb4)
- }
- # Anneal momentum towards 0.999
- #mu = mu + (0.999 - mu)/(1+epochs-e)
- # Decay learning rate
- lr = lr * decay
- }
-}
-
-predict = function(matrix[double] X, int C, int Hin, int Win,
- matrix[double] W1, matrix[double] b1,
- matrix[double] W2, matrix[double] b2,
- matrix[double] W3, matrix[double] b3,
- matrix[double] W4, matrix[double] b4)
- return (matrix[double] probs) {
- /*
- * Computes the class probability predictions of a convolutional
- * net using the "LeNet" architecture.
- *
- * The input matrix, X, has N examples, each represented as a 3D
- * volume unrolled into a single vector.
- *
- * Inputs:
- * - X: Input data matrix, of shape (N, C*Hin*Win).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
- * - b1: 1st layer biases vector, of shape (F1, 1).
- * - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
- * - b2: 2nd layer biases vector, of shape (F2, 1).
- * - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
- * - b3: 3rd layer biases vector, of shape (1, N3).
- * - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
- * - b4: 4th layer biases vector, of shape (1, K).
- *
- * Outputs:
- * - probs: Class probabilities, of shape (N, K).
- */
- N = nrow(X)
-
- # Network:
- # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
- Hf = 5 # filter height
- Wf = 5 # filter width
- stride = 1
- pad = 2 # For same dimensions, (Hf - stride) / 2
-
- F1 = nrow(W1) # num conv filters in conv1
- F2 = nrow(W2) # num conv filters in conv2
- N3 = ncol(W3) # num nodes in affine3
- K = ncol(W4) # num nodes in affine4, equal to number of target dimensions (num classes)
-
- # Compute predictions over mini-batches
- probs = matrix(0, rows=N, cols=K)
- batch_size = 64
- iters = ceil(N / batch_size)
- for(i in 1:iters) {
- # Get next batch
- beg = ((i-1) * batch_size) %% N + 1
- end = min(N, beg + batch_size - 1)
- X_batch = X[beg:end,]
-
- # Compute forward pass
- ## layer 1: conv1 -> relu1 -> pool1
- [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
- pad, pad)
- outr1 = relu::forward(outc1)
- [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
- strideh=2, stridew=2, pad=0, pad=0)
- ## layer 2: conv2 -> relu2 -> pool2
- [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
- stride, stride, pad, pad)
- outr2 = relu::forward(outc2)
- [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
- strideh=2, stridew=2, pad=0, pad=0)
- ## layer 3: affine3 -> relu3
- outa3 = affine::forward(outp2, W3, b3)
- outr3 = relu::forward(outa3)
- ## layer 4: affine4 -> softmax
- outa4 = affine::forward(outr3, W4, b4)
- probs_batch = softmax::forward(outa4)
-
- # Store predictions
- probs[beg:end,] = probs_batch
- }
-}
-
-eval = function(matrix[double] probs, matrix[double] y)
- return (double loss, double accuracy) {
- /*
- * Evaluates a convolutional net using the "LeNet" architecture.
- *
- * The probs matrix contains the class probability predictions
- * of K classes over N examples. The targets, y, have K classes,
- * and are one-hot encoded.
- *
- * Inputs:
- * - probs: Class probabilities, of shape (N, K).
- * - y: Target matrix, of shape (N, K).
- *
- * Outputs:
- * - loss: Scalar loss, of shape (1).
- * - accuracy: Scalar accuracy, of shape (1).
- */
- # Compute loss & accuracy
- loss = cross_entropy_loss::forward(probs, y)
- correct_pred = rowIndexMax(probs) == rowIndexMax(y)
- accuracy = mean(correct_pred)
-}
-
-generate_dummy_data = function()
- return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
- /*
- * Generate a dummy dataset similar to the MNIST dataset.
- *
- * Outputs:
- * - X: Input data matrix, of shape (N, D).
- * - y: Target matrix, of shape (N, K).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- */
- # Generate dummy input data
- N = 1024 # num examples
- C = 1 # num input channels
- Hin = 28 # input height
- Win = 28 # input width
- K = 10 # num target classes
- X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
- classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
- y = table(seq(1, N), classes) # one-hot encoding
-}
-
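As a quick sanity check of this module's API, the dummy-data generator above can drive a short end-to-end run. A sketch (one epoch on random data, so accuracy will hover near chance):

```
source("nn/examples/mnist_lenet.dml") as mnist_lenet

# Random MNIST-shaped data; the point here is shapes, not accuracy
[X, y, C, Hin, Win] = mnist_lenet::generate_dummy_data()
[X_val, y_val, C, Hin, Win] = mnist_lenet::generate_dummy_data()
[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win, 1)
probs = mnist_lenet::predict(X, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
[loss, accuracy] = mnist_lenet::eval(probs, y)
print("Dummy-data accuracy: " + accuracy)
```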
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-predict.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-predict.dml
deleted file mode 100644
index 4c8c434..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-predict.dml
+++ /dev/null
@@ -1,77 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# MNIST Softmax - Predict
-#
-# This script computes the class probability predictions of a
-# trained softmax classifier on images of handwritten digits.
-#
-# Inputs:
-# - X: File containing training images.
-# The format is "pixel_1, pixel_2, ..., pixel_n".
-# - model_dir: Directory containing the trained weights and biases
-# of the model.
-# - out_dir: Directory to store class probability predictions for
-# each image.
-# - fmt: [DEFAULT: "csv"] File format of `X` and output predictions.
-# Options include: "csv", "mm", "text", and "binary".
-#
-# Outputs:
-# - probs: File containing class probability predictions for each
-# image.
-#
-# Data:
-# The X file should contain images of handwritten digits,
-# where each example is a 28x28 pixel image of grayscale values in
-# the range [0,255] stretched out as 784 pixels.
-#
-# Sample Invocation:
-# 1. Download images.
-#
-# For example, save images to `nn/examples/data/mnist/images.csv`.
-#
-# 2. Execute using Spark
-# ```
-# spark-submit --master local[*] --driver-memory 5G
-# --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
-# $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_softmax-predict.dml
-# -nvargs X=nn/examples/data/mnist/images.csv
-# model_dir=nn/examples/model/mnist_softmax out_dir=nn/examples/data/mnist
-# ```
-#
-source("nn/examples/mnist_softmax.dml") as mnist_softmax
-
-# Read training data
-fmt = ifdef($fmt, "csv")
-X = read($X, format=fmt)
-
-# Scale images to [0,1]
-X = X / 255.0
-
-# Read model coefficients
-W = read($model_dir+"/W")
-b = read($model_dir+"/b")
-
-# Predict classes
-probs = mnist_softmax::predict(X, W, b)
-
-# Output results
-write(probs, $out_dir+"/probs."+fmt, format=fmt)
-
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-train.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-train.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-train.dml
deleted file mode 100644
index 09970f0..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax-train.dml
+++ /dev/null
@@ -1,110 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# MNIST Softmax - Train
-#
-# This script trains a softmax classifier on images of handwritten
-# digits.
-#
-# Inputs:
-# - train: File containing labeled MNIST training images.
-# The format is "label, pixel_1, pixel_2, ..., pixel_n".
-# - test: File containing labeled MNIST test images.
-# The format is "label, pixel_1, pixel_2, ..., pixel_n".
-#  - epochs: [DEFAULT: 1] Total number of full training loops over
-#      the full data set.
-#  - out_dir: [DEFAULT: "."] Directory to store weights and bias
-#      matrices of trained model, as well as final test accuracy.
-# - fmt: [DEFAULT: "csv"] File format of `train` and `test` data.
-# Options include: "csv", "mm", "text", and "binary".
-#
-# Outputs:
-# - W: File containing the trained weights of the model.
-# - b: File containing the trained biases of the model.
-# - accuracy: File containing the final accuracy on the test data.
-#
-# Data:
-# The MNIST dataset contains labeled images of handwritten digits,
-# where each example is a 28x28 pixel image of grayscale values in
-# the range [0,255] stretched out as 784 pixels, and each label is
-# one of 10 possible digits in [0,9].
-#
-# Sample Invocation (running from outside the `nn` folder):
-# 1. Download data (60,000 training examples, and 10,000 test examples)
-# ```
-# nn/examples/get_mnist_data.sh
-# ```
-#
-# 2. Execute using Spark
-# ```
-# spark-submit --master local[*] --driver-memory 10G
-# --conf spark.driver.maxResultSize=0 --conf spark.rpc.message.maxSize=128
-# $SYSTEMML_HOME/target/SystemML.jar -f nn/examples/mnist_softmax-train.dml
-# -nvargs train=nn/examples/data/mnist/mnist_train.csv test=nn/examples/data/mnist/mnist_test.csv
-# epochs=1 out_dir=nn/examples/model/mnist_softmax
-# ```
-#
-source("nn/examples/mnist_softmax.dml") as mnist_softmax
-
-# Read training data
-fmt = ifdef($fmt, "csv")
-train = read($train, format=fmt)
-test = read($test, format=fmt)
-epochs = ifdef($epochs, 1)
-out_dir = ifdef($out_dir, ".")
-
-# Extract images and labels
-images = train[,2:ncol(train)]
-labels = train[,1]
-X_test = test[,2:ncol(test)]
-y_test = test[,1]
-
-# Scale images to [0,1], and one-hot encode the labels
-n = nrow(train)
-n_test = nrow(test)
-classes = 10
-images = images / 255.0
-labels = table(seq(1, n), labels+1, n, classes)
-X_test = X_test / 255.0
-y_test = table(seq(1, n_test), y_test+1, n_test, classes)
-
-# Split into training (55,000 examples) and validation (5,000 examples)
-X = images[5001:nrow(images),]
-X_val = images[1:5000,]
-y = labels[5001:nrow(images),]
-y_val = labels[1:5000,]
-
-# Train
-[W, b] = mnist_softmax::train(X, y, X_val, y_val, epochs)
-
-# Write model out
-write(W, out_dir+"/W")
-write(b, out_dir+"/b")
-
-# Eval on test set
-probs = mnist_softmax::predict(X_test, W, b)
-[loss, accuracy] = mnist_softmax::eval(probs, y_test)
-
-# Output results
-print("Test Accuracy: " + accuracy)
-write(accuracy, out_dir+"/accuracy")
-
-print("")
-print("")
-
[10/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn`
library to `scripts/nn`
Posted by du...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/batch_norm2d.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/batch_norm2d.dml b/scripts/nn/layers/batch_norm2d.dml
new file mode 100644
index 0000000..49c6746
--- /dev/null
+++ b/scripts/nn/layers/batch_norm2d.dml
@@ -0,0 +1,238 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D (Spatial) Batch Normalization layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
+ int C, int Hin, int Win, string mode,
+ matrix[double] ema_mean, matrix[double] ema_var,
+ double mu, double epsilon)
+ return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
+ matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
+ /*
+ * Computes the forward pass for a 2D (spatial) batch normalization
+ * layer. The input data has N examples, each represented as a 3D
+ * volume unrolled into a single vector.
+ *
+ * A spatial batch normalization layer uses the per-channel sample
+ * mean and per-channel uncorrected sample variance during training
+ * to normalize each channel of the input data. Additionally, it
+ * introduces learnable parameters (gamma, beta) to control the
+ * amount of normalization.
+ *
+ * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
+ *
+ * This implementation maintains exponential moving averages of the
+ * mean and variance during training for use during testing.
+ *
+ * Reference:
+ * - Batch Normalization: Accelerating Deep Network Training by
+ * Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
+ * - https://arxiv.org/abs/1502.03167
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - mode: 'train' or 'test' to indicate if the model is currently
+ * being trained or tested. During training, the current batch
+ * mean and variance will be used to normalize the inputs, while
+ * during testing, the exponential average of the mean and
+ * variance over all previous batches will be used.
+ * - ema_mean: Exponential moving average of the mean, of
+ * shape (C, 1).
+ * - ema_var: Exponential moving average of the variance, of
+ * shape (C, 1).
+ * - mu: Momentum value for moving averages.
+ * Typical values are in the range of [0.9, 0.999].
+ * - epsilon: Smoothing term to avoid divide by zero errors.
+ * Typical values are in the range of [1e-5, 1e-3].
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, C*Hin*Win).
+ * - ema_mean_upd: Updated exponential moving average of the mean,
+ * of shape (C, 1).
+ * - ema_var_upd: Updated exponential moving average of the variance,
+ * of shape (C, 1).
+ * - cache_mean: Cache of the batch mean, of shape (C, 1).
+ * Note: This is used for performance during training.
+ * - cache_var: Cache of the batch variance, of shape (C, 1).
+ * Note: This is used for performance during training.
+ * - cache_norm: Cache of the normalized inputs, of
+ * shape (C, N*Hin*Win). Note: This is used for performance
+ * during training.
+ */
+ N = nrow(X)
+
+ if (mode == 'train') {
+ # Compute channel-wise mean and variance
+    # Since we don't have tensors, we will compute the means and variances in a piece-wise
+    # fashion, treating each pixel position within a channel as a subgroup over the batch and
+    # applying the laws of total mean and total variance:
+    #  - the mean of the total group is the mean of the subgroup means
+    #  - the variance of the total group is the mean of the subgroup variances + the variance
+    #    of the subgroup means
+ subgrp_means = matrix(colMeans(X), rows=C, cols=Hin*Win)
+ subgrp_vars = matrix(colVars(X) * ((N-1)/N), rows=C, cols=Hin*Win) # uncorrected variances
+ mean = rowMeans(subgrp_means) # shape (C, 1)
+ var = rowMeans(subgrp_vars) + rowVars(subgrp_means)*(((Hin*Win)-1)/(Hin*Win)) # shape (C, 1)
+ # Update moving averages
+ ema_mean_upd = mu*ema_mean + (1-mu)*mean
+ ema_var_upd = mu*ema_var + (1-mu)*var
+ }
+ else {
+ # Use moving averages of mean and variance during testing
+ mean = ema_mean
+ var = ema_var
+ ema_mean_upd = ema_mean
+ ema_var_upd = ema_var
+ }
+
+ # Normalize, shift, and scale
+ # norm = (X-mean)*(var+epsilon)^(-1/2)
+ # = (X-mean) / sqrt(var+epsilon)
+ centered = bias_add(X, -mean) # shape (N, C*Hin*Win)
+ norm = bias_multiply(centered, 1/sqrt(var+epsilon)) # shape (N, C*Hin*Win)
+ # out = norm*gamma + beta
+ scaled = bias_multiply(norm, gamma) # shape (N, C*Hin*Win)
+ out = bias_add(scaled, beta) # shape (N, C*Hin*Win)
+
+ # Save variable for backward pass
+ cache_mean = mean
+ cache_var = var
+ cache_norm = norm
+}
+
+backward = function(matrix[double] dout, matrix[double] out,
+ matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
+ matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
+ matrix[double] X, matrix[double] gamma, matrix[double] beta,
+ int C, int Hin, int Win, string mode,
+ matrix[double] ema_mean, matrix[double] ema_var,
+ double mu, double epsilon)
+ return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
+ /*
+ * Computes the backward pass for a 2D (spatial) batch normalization
+ * layer.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
+ * - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
+ * - ema_mean_upd: Updated exponential moving average of the mean
+ * from the forward pass, of shape (C, 1).
+ * - ema_var_upd: Updated exponential moving average of the variance
+ * from the forward pass, of shape (C, 1).
+ * - cache_mean: Cache of the batch mean from the forward pass, of
+ * shape (C, 1). Note: This is used for performance during
+ * training.
+ * - cache_var: Cache of the batch variance from the forward pass,
+ * of shape (C, 1). Note: This is used for performance during
+ * training.
+ * - cache_norm: Cache of the normalized inputs from the forward
+ * pass, of shape (C, N*Hin*Win). Note: This is used for
+ * performance during training.
+ * - X: Input data matrix to the forward pass, of
+ * shape (N, C*Hin*Win).
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - mode: 'train' or 'test' to indicate if the model is currently
+ * being trained or tested. During training, the current batch
+ * mean and variance will be used to normalize the inputs, while
+ * during testing, the exponential average of the mean and
+ * variance over all previous batches will be used.
+ * - ema_mean: Exponential moving average of the mean, of
+ * shape (C, 1).
+ * - ema_var: Exponential moving average of the variance, of
+ * shape (C, 1).
+ * - mu: Momentum value for moving averages.
+ * Typical values are in the range of [0.9, 0.999].
+ * - epsilon: Smoothing term to avoid divide by zero errors.
+ * Typical values are in the range of [1e-5, 1e-3].
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dgamma: Gradient wrt `gamma`, of shape (C, 1).
+   *  - dbeta: Gradient wrt `beta`, of shape (C, 1).
+   */
+ N = nrow(X)
+ mean = cache_mean
+ var = cache_var
+ norm = cache_norm
+ centered = bias_add(X, -mean) # shape (N, C*Hin*Win)
+
+ if (mode == 'train') {
+ # Compute gradients during training
+ dgamma = util::channel_sums(dout*norm, C, Hin, Win) # shape (C, 1)
+ dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1)
+ dnorm = bias_multiply(dout, gamma) # shape (N, C*Hin*Win)
+ dvar = util::channel_sums((-1/2) * bias_multiply(centered, (var+epsilon)^(-3/2)) * dnorm,
+ C, Hin, Win) # shape (C, 1)
+ dmean_norm_branch = util::channel_sums(bias_multiply(dnorm, -1/sqrt(var+epsilon)), C, Hin, Win)
+ dmean_var_branch = util::channel_sums((-2/(N*Hin*Win)) * centered, C, Hin, Win)
+ dmean_var_branch = dmean_var_branch * dvar # we can't use a function within an expression yet
+ dmean = dmean_norm_branch + dmean_var_branch # shape (C, 1)
+ dX_norm_branch = bias_multiply(dnorm, 1/sqrt(var+epsilon))
+ dX_mean_branch = (1/(N*Hin*Win)) * bias_add(matrix(0, rows=1, cols=C*Hin*Win), dmean)
+ dX_var_branch = (2/(N*Hin*Win)) * bias_multiply(centered, dvar)
+ dX = dX_norm_branch + dX_mean_branch + dX_var_branch # shape (N, C*Hin*Win)
+ }
+ else {
+ # Compute gradients during testing
+ dgamma = util::channel_sums(dout*norm, C, Hin, Win) # shape (C, 1)
+ dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1)
+ dnorm = bias_multiply(dout, gamma) # shape (N, C*Hin*Win)
+ dX = bias_multiply(dnorm, 1/sqrt(var+epsilon)) # shape (N, C*Hin*Win)
+ }
+}
+
+init = function(int C)
+ return (matrix[double] gamma, matrix[double] beta,
+ matrix[double] ema_mean, matrix[double] ema_var) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - C: Number of input channels (dimensionality of input depth).
+ *
+ * Outputs:
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ * - ema_mean: Exponential moving average of the mean, of
+ * shape (C, 1).
+ * - ema_var: Exponential moving average of the variance, of
+ * shape (C, 1).
+ */
+ gamma = matrix(1, rows=C, cols=1)
+ beta = matrix(0, rows=C, cols=1)
+ ema_mean = matrix(0, rows=C, cols=1)
+ ema_var = matrix(1, rows=C, cols=1)
+}
+
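A minimal usage sketch of this layer's API under hypothetical dimensions (N=8, C=3, 4x4 inputs), showing how the moving averages are threaded through training calls and then reused at test time:

```
source("nn/layers/batch_norm2d.dml") as batch_norm2d

N = 8; C = 3; Hin = 4; Win = 4
X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
[gamma, beta, ema_mean, ema_var] = batch_norm2d::init(C)

# Training: normalize with batch statistics and update the moving averages
[out, ema_mean, ema_var, cache_mean, cache_var, cache_norm] = batch_norm2d::forward(X, gamma, beta, C, Hin, Win, "train", ema_mean, ema_var, 0.9, 1e-5)

# Testing: normalize with the accumulated moving averages instead
[out_test, ema_mean, ema_var, cache_mean, cache_var, cache_norm] = batch_norm2d::forward(X, gamma, beta, C, Hin, Win, "test", ema_mean, ema_var, 0.9, 1e-5)
```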
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/conv2d.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/conv2d.dml b/scripts/nn/layers/conv2d.dml
new file mode 100644
index 0000000..9d03568
--- /dev/null
+++ b/scripts/nn/layers/conv2d.dml
@@ -0,0 +1,194 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Convolutional layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
+ int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] out, int Hout, int Wout) {
+ /*
+ * Computes the forward pass for a 2D spatial convolutional layer with
+ * F filters. The input data has N examples, each represented as a 3D
+ * volume unrolled into a single vector.
+ *
+ * This implementation uses `im2col` internally for each image to
+ * extract local image regions (patches) into columns, and then
+ * performs a matrix multiplication with the filters to compute the
+ * output maps.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * For same output height as input, set `padh = (Hf - 1) / 2`,
+ * assuming `strideh = 1`.
+ * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+ * preserves the spatial dimensions of the input.
+ * - padw: Padding for left and right sides.
+ * For same output width as input, set `padw = (Wf - 1) / 2`,
+ * assuming `stridew = 1`.
+ * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+ * preserves the spatial dimensions of the input.
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, F*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ */
+ N = nrow(X)
+ F = nrow(W)
+ Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+ Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+
+ # Create output volume
+ out = matrix(0, rows=N, cols=F*Hout*Wout)
+
+ # Convolution - im2col implementation
+ parfor (n in 1:N) { # all examples
+ Xn = matrix(X[n,], rows=C, cols=Hin*Win) # reshape
+
+ # Pad image
+ Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0) # shape (C, (Hin+2*padh)*(Win+2*padw))
+
+ # Extract local image patches into columns with im2col, of shape (C*Hf*Wf, Hout*Wout)
+ Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
+
+ # Convolve patches with filters
+ outn = W %*% Xn_padded_cols + b # shape (F, Hout*Wout)
+ out[n,] = matrix(outn, rows=1, cols=F*Hout*Wout) # reshape
+ }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout,
+ matrix[double] X, matrix[double] W, matrix[double] b,
+ int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
+ /*
+ * Computes the backward pass for a 2D spatial convolutional layer
+ * with F filters.
+ *
+ * This implementation uses `im2col` and `col2im` internally.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of
+ * shape (N, F*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * - padw: Padding for left and right sides.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+ * - db: Gradient wrt `b`, of shape (F, 1).
+ */
+ N = nrow(X)
+ F = nrow(W)
+
+ # Create gradient volumes
+ # Note: Create convenience gradient volumes for dW and db that will
+ # allow for one gradient to be stored per example, allowing for
+ # parallel computation at the expense of memory. We will reduce at
+ # the end.
+ dX = matrix(0, rows=N, cols=C*Hin*Win)
+ dWN = matrix(0, rows=N, cols=F*C*Hf*Wf) # dW = matrix(0, rows=F, cols=C*Hf*Wf)
+ dbN = matrix(0, rows=N, cols=F) # db = matrix(0, rows=F, cols=1)
+
+ # Partial derivatives for convolution - im2col implementation
+ parfor (n in 1:N) { # all examples
+ doutn = matrix(dout[n,], rows=F, cols=Hout*Wout)
+
+ # Compute dW
+ Xn = matrix(X[n,], rows=C, cols=Hin*Win) # reshape
+ Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0) # shape (C, (Hin+2*padh)*(Win+2*padw))
+ Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
+ # dW = dW + doutn %*% t(Xn_padded_cols)
+ dWN[n,] = matrix(doutn %*% t(Xn_padded_cols), rows=1, cols=F*C*Hf*Wf)
+
+ # Compute db
+ # db = db + rowSums(doutn)
+ dbN[n,] = matrix(rowSums(doutn), rows=1, cols=F)
+
+ # Compute dX
+ dXn_padded_cols = t(W) %*% doutn # shape (C*Hf*Wf, Hout*Wout)
+ dXn_padded = util::col2im(dXn_padded_cols, C, Hin+2*padh, Win+2*padw, Hf, Wf,
+ strideh, stridew, "add")
+ dXn = util::unpad_image(dXn_padded, Hin, Win, padh, padw)
+ dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win) # reshape
+ }
+
+ # Reduce convenience gradient volumes with one gradient per example
+ # into single gradients for W and b.
+ dW = matrix(colSums(dWN), rows=F, cols=C*Hf*Wf)
+ db = matrix(colSums(dbN), rows=F, cols=1)
+}
+
+init = function(int F, int C, int Hf, int Wf)
+ return (matrix[double] W, matrix[double] b) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * We use the heuristic by He et al., which limits the magnification
+ * of inputs/gradients during forward/backward passes by scaling
+ * unit-Gaussian weights by a factor of sqrt(2/n), under the
+ * assumption of relu neurons.
+ * - http://arxiv.org/abs/1502.01852
+ *
+ * Inputs:
+ * - F: Number of filters.
+ * - C: Number of input channels (dimensionality of depth).
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ *
+ * Outputs:
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
+ */
+ W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
+ b = matrix(0, rows=F, cols=1)
+}
+
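A usage sketch with hypothetical dimensions, illustrating the 'same'-padding rule from the docstring (`padh = (Hf - 1) / 2` with stride 1):

```
source("nn/layers/conv2d.dml") as conv2d

N = 4; C = 1; Hin = 28; Win = 28
F = 8; Hf = 5; Wf = 5
X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
[W, b] = conv2d::init(F, C, Hf, Wf)
# With stride 1 and pad = (5-1)/2 = 2, the spatial dimensions are preserved
[out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 2, 2)
print("Hout: " + Hout + ", Wout: " + Wout)  # 28, 28
```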
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/conv2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/conv2d_builtin.dml b/scripts/nn/layers/conv2d_builtin.dml
new file mode 100644
index 0000000..bda7a9c
--- /dev/null
+++ b/scripts/nn/layers/conv2d_builtin.dml
@@ -0,0 +1,160 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Convolutional layer.
+ *
+ * This implementation uses a built-in operator for higher performance.
+ */
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
+ int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] out, int Hout, int Wout) {
+ /*
+ * Computes the forward pass for a 2D spatial convolutional layer with
+ * F filters. The input data has N examples, each represented as a 3D
+ * volume unrolled into a single vector.
+ *
+ * This implementation uses a built-in operator for higher
+ * performance.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
+ * - C: Number of input channels (dimensionality of depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * For same output height as input, set `padh = (Hf - 1) / 2`,
+ * assuming `strideh = 1`.
+ * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+ * preserves the spatial dimensions of the input.
+ * - padw: Padding for left and right sides.
+ * For same output width as input, set `padw = (Wf - 1) / 2`,
+ * assuming `stridew = 1`.
+ * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+ * preserves the spatial dimensions of the input.
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, F*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ */
+ N = nrow(X)
+ F = nrow(W)
+ Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+ Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+
+ # Convolution - built-in implementation
+ out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
+ stride=[strideh,stridew], padding=[padh,padw])
+
+ # Add bias term to each output filter
+ out = bias_add(out, b)
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout,
+ matrix[double] X, matrix[double] W, matrix[double] b,
+ int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
+ /*
+ * Computes the backward pass for a 2D spatial convolutional layer
+ * with F filters.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of
+ * shape (N, F*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
+ * - C: Number of input channels (dimensionality of depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * For same output height as input, set `padh = (Hf - 1) / 2`,
+ * assuming `strideh = 1`.
+ * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+ * preserves the spatial dimensions of the input.
+ * - padw: Padding for left and right sides.
+ * For same output width as input, set `padw = (Wf - 1) / 2`,
+ * assuming `stridew = 1`.
+ * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+ * preserves the spatial dimensions of the input.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+ * - db: Gradient wrt `b`, of shape (F, 1).
+ */
+ N = nrow(X)
+ F = nrow(W)
+
+ # Partial derivatives for convolution - built-in implementation
+ dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw],
+ input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
+ dX = conv2d_backward_data(W, dout, stride=[strideh, stridew], padding=[padh,padw],
+ input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
+
+ # Partial derivatives for bias vector
+ db = rowSums(matrix(colSums(dout), rows=F, cols=Hout*Wout))
+}
+
+init = function(int F, int C, int Hf, int Wf)
+ return (matrix[double] W, matrix[double] b) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * We use the heuristic by He et al., which limits the magnification
+ * of inputs/gradients during forward/backward passes by scaling
+ * unit-Gaussian weights by a factor of sqrt(2/n), under the
+ * assumption of relu neurons.
+ * - http://arxiv.org/abs/1502.01852
+ *
+ * Inputs:
+ * - F: Number of filters.
+ * - C: Number of input channels (dimensionality of depth).
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ *
+ * Outputs:
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
+ */
+ W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
+ b = matrix(0, rows=F, cols=1)
+}
+
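A forward/backward round trip under hypothetical dimensions; `dout` stands in for the gradient that would arrive from the next layer:

```
source("nn/layers/conv2d_builtin.dml") as conv2d

N = 4; C = 3; Hin = 8; Win = 8
F = 6; Hf = 3; Wf = 3
X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
[W, b] = conv2d::init(F, C, Hf, Wf)
[out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)
dout = rand(rows=N, cols=F*Hout*Wout, pdf="normal")  # stand-in upstream gradient
[dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)
```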
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/cross_entropy_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/cross_entropy_loss.dml b/scripts/nn/layers/cross_entropy_loss.dml
new file mode 100644
index 0000000..63db502
--- /dev/null
+++ b/scripts/nn/layers/cross_entropy_loss.dml
@@ -0,0 +1,78 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Cross-Entropy loss function.
+ */
+
+forward = function(matrix[double] pred, matrix[double] y)
+ return (double loss) {
+ /*
+ * Computes the forward pass for a cross-entropy loss function. The
+ * inputs consist of N examples, each with K dimensions corresponding
+ * to normalized probabilities of K classes.
+ *
+ * ```
+ * L_i = -y_i^T * log(pred_i)
+ * L = (1/N) sum(L_i) for i=1 to N
+ * ```
+ *
+ * In these equations, `L` is the total loss, `L_i` is the loss for
+ * example `i`, `y_i` is the K-dimensional vector of target class
+ * probabilities, `pred_i` is K-dimensional vector of predicted
+ * class probabilities, and `N` is the number of examples.
+ *
+ * This can be interpreted as the negative log-likelihood assuming
+ * a Bernoulli distribution generalized to K dimensions, or a
+ * Multinomial with one observation.
+ *
+ * Inputs:
+ * - pred: Predictions, of shape (N, K).
+ * - y: Targets, of shape (N, K).
+ *
+ * Outputs:
+ * - loss: Average loss.
+ */
+ N = nrow(y)
+ eps = 1e-10 # numerical stability to avoid log(0)
+ losses = rowSums(-y * log(pred+eps))
+ loss = sum(losses) / N
+}
+
+backward = function(matrix[double] pred, matrix[double] y)
+ return (matrix[double] dpred) {
+ /*
+ * Computes the backward pass of a cross-entropy loss function. The
+ * inputs consist of N examples, each with K dimensions corresponding
+ * to normalized probabilities of K classes.
+ *
+ * Inputs:
+ * - pred: Predictions, of shape (N, K).
+ * - y: Targets, of shape (N, K).
+ *
+ * Outputs:
+ * - dpred: Gradient wrt `pred`, of shape (N, K).
+ */
+ N = nrow(y)
+ eps = 1e-10 # numerical stability to avoid divide-by-zero
+ dpred = (1/N) * -y * (1/(pred+eps))
+}
+
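A small worked example of the forward pass, assuming row-normalized predictions over K=3 classes:

```
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss

# Two examples, three classes; each row of pred sums to 1
pred = matrix("0.7 0.2 0.1 0.1 0.8 0.1", rows=2, cols=3)
y = matrix("1 0 0 0 0 1", rows=2, cols=3)
loss = cross_entropy_loss::forward(pred, y)  # (-log(0.7) - log(0.1)) / 2, roughly 1.33
dpred = cross_entropy_loss::backward(pred, y)
```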
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/dropout.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/dropout.dml b/scripts/nn/layers/dropout.dml
new file mode 100644
index 0000000..a36878b
--- /dev/null
+++ b/scripts/nn/layers/dropout.dml
@@ -0,0 +1,76 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Dropout layer.
+ */
+
+forward = function(matrix[double] X, double p, int seed)
+ return (matrix[double] out, matrix[double] mask) {
+ /*
+ * Computes the forward pass for an inverted dropout layer.
+ *
+   * Keeps each input element with probability p (i.e., drops it with
+   * probability 1-p), and divides the result by p so that the expected
+   * value of each output matches that of its input, removing the need
+   * for rescaling at test time.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ * - p: Probability of keeping a neuron output.
+ * - seed: [Optional: -1] Random number generator seed to allow for
+ * deterministic evaluation. Set to -1 for a random seed.
+ *
+ * Outputs:
+ * - out: Outputs, of same shape as `X`.
+ * - mask: Dropout mask used to compute the output.
+ */
+ # Normally, we might use something like
+ # `mask = rand(rows=nrow(X), cols=ncol(X), min=0, max=1, seed=seed) <= p`
+ # to create a dropout mask. Fortunately, SystemML has a `sparsity` parameter on
+  # the `rand` function that allows us to create a mask directly.
+ if (seed == -1) {
+ mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p)
+ } else {
+ mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p, seed=seed)
+ }
+ out = X * mask / p
+}
+
+backward = function(matrix[double] dout, matrix[double] X, double p, matrix[double] mask)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for an inverted dropout layer.
+ *
+ * Applies the mask to the upstream gradient, and divides by p to
+ * maintain the expected values at test time.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out`, of same shape as `X`.
+ * - X: Inputs, of shape (any, any).
+ * - p: Probability of keeping a neuron output.
+ * - mask: Dropout mask used to compute the output.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of same shape as `X`.
+ */
+ dX = mask / p * dout
+}
+
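A usage sketch; a fixed seed makes the mask (and thus the output) reproducible, and the same mask must be passed to the backward call:

```
source("nn/layers/dropout.dml") as dropout

X = rand(rows=4, cols=6, min=0, max=1)
p = 0.5  # probability of keeping each element
[out, mask] = dropout::forward(X, p, 42)  # seed=42 for a reproducible mask
dout = matrix(1, rows=4, cols=6)  # stand-in upstream gradient
dX = dropout::backward(dout, X, p, mask)  # reuses the mask from the forward pass
```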
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/l1_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/l1_loss.dml b/scripts/nn/layers/l1_loss.dml
new file mode 100644
index 0000000..b74566d
--- /dev/null
+++ b/scripts/nn/layers/l1_loss.dml
@@ -0,0 +1,72 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * L1 loss function.
+ */
+
+forward = function(matrix[double] pred, matrix[double] y)
+ return (double loss) {
+ /*
+ * Computes the forward pass for an L1 loss function. The inputs
+ * consist of N examples, each with M dimensions to predict.
+ *
+ * ```
+   *   L_i = sum_j(abs((pred_i)_j - (y_i)_j))
+ * L = (1/N) sum(L_i) for i=1 to N
+ * ```
+ *
+   * In these equations, `L` is the total loss, `L_i` is the loss for
+   * example `i`, `y_i` is the vector of targets for example `i`,
+   * `pred_i` is the vector of predictions for example `i`, and `N` is
+   * the number of examples.
+ *
+ * This can be interpreted as the negative log-likelihood assuming
+ * a Laplace distribution.
+ *
+ * Inputs:
+ * - pred: Predictions, of shape (N, M).
+ * - y: Targets, of shape (N, M).
+ *
+ * Outputs:
+ * - loss: Average loss.
+ */
+ N = nrow(y)
+ losses = rowSums(abs(pred-y))
+ loss = sum(losses) / N
+}
+
+backward = function(matrix[double] pred, matrix[double] y)
+ return (matrix[double] dpred) {
+ /*
+ * Computes the backward pass for an L1 loss function. The inputs
+ * consist of N examples, each with M dimensions to predict.
+ *
+ * Inputs:
+ * - pred: Predictions, of shape (N, M).
+ * - y: Targets, of shape (N, M).
+ *
+ * Outputs:
+ * - dpred: Gradient wrt `pred`, of shape (N, M).
+ */
+ N = nrow(y)
+ dpred = sign(pred-y) / N
+}
+
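A small worked example; with two 2-dimensional examples, the per-example absolute errors are averaged over N=2:

```
source("nn/layers/l1_loss.dml") as l1_loss

pred = matrix("1.0 2.0 3.0 4.0", rows=2, cols=2)
y = matrix("0.5 2.0 2.0 5.0", rows=2, cols=2)
loss = l1_loss::forward(pred, y)    # (0.5+0.0 + 1.0+1.0) / 2 = 1.25
dpred = l1_loss::backward(pred, y)  # sign(pred-y) / 2
```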
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/l1_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/l1_reg.dml b/scripts/nn/layers/l1_reg.dml
new file mode 100644
index 0000000..2b81c0b
--- /dev/null
+++ b/scripts/nn/layers/l1_reg.dml
@@ -0,0 +1,56 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * L1 regularization.
+ */
+
+forward = function(matrix[double] X, double lambda)
+ return (double reg_loss) {
+ /*
+ * Computes the forward pass for an L1 regularization function.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ * - lambda: Regularization strength.
+ * A typical value is 0.01.
+ *
+ * Outputs:
+ * - reg_loss: Total regularization loss.
+ */
+ reg_loss = lambda * sum(abs(X))
+}
+
+backward = function(matrix[double] X, double lambda)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for an L1 regularization function.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ * - lambda: Regularization strength.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of same shape as `X`.
+ */
+ dX = lambda * sign(X)
+}
+
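As a hedged sketch of how this is typically combined with a data loss (the `data_loss` scalar below is a stand-in, not from this commit):

```
source("nn/layers/l1_reg.dml") as l1_reg

lambda = 0.01
W = rand(rows=10, cols=5, min=-1, max=1)   # hypothetical weights
data_loss = 0.7                            # stand-in for a loss-layer output
reg_loss = l1_reg::forward(W, lambda)
total_loss = data_loss + reg_loss
# during backprop, the regularization gradient is added to the data gradient wrt W
dW_reg = l1_reg::backward(W, lambda)       # equals lambda * sign(W)
```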
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/l2_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/l2_loss.dml b/scripts/nn/layers/l2_loss.dml
new file mode 100644
index 0000000..0482f25
--- /dev/null
+++ b/scripts/nn/layers/l2_loss.dml
@@ -0,0 +1,72 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * L2 loss function.
+ */
+
+forward = function(matrix[double] pred, matrix[double] y)
+ return (double loss) {
+ /*
+ * Computes the forward pass for an L2 loss function. The inputs
+ * consist of N examples, each with M dimensions to predict.
+ *
+ * ```
+ * L_i = (1/2) norm(pred_i - y_i)^2
+ * L = (1/N) sum(L_i) for i=1 to N
+ * ```
+ *
+ * In these equations, `L` is the total loss, `L_i` is the loss for
+ * example `i`, `y_i` is the M-dimensional target vector for example
+ * `i`, `pred_i` is the corresponding prediction vector, and `N` is
+ * the number of examples.
+ *
+ * This can be interpreted as the negative log-likelihood assuming
+ * a Gaussian distribution.
+ *
+ * Inputs:
+ * - pred: Predictions, of shape (N, M).
+ * - y: Targets, of shape (N, M).
+ *
+ * Outputs:
+ * - loss: Average loss.
+ */
+ N = nrow(y)
+ losses = 0.5 * rowSums((pred-y)^2)
+ loss = sum(losses) / N
+}
+
+backward = function(matrix[double] pred, matrix[double] y)
+ return (matrix[double] dpred) {
+ /*
+ * Computes the backward pass for an L2 loss function. The inputs
+ * consist of N examples, each with M dimensions to predict.
+ *
+ * Inputs:
+ * - pred: Predictions, of shape (N, M).
+ * - y: Targets, of shape (N, M).
+ *
+ * Outputs:
+ * - dpred: Gradient wrt `pred`, of shape (N, M).
+ */
+ N = nrow(y)
+ dpred = (pred-y) / N
+}
+
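Since `backward` returns an analytical gradient, a quick centered finite-difference check is a natural smoke test. A sketch under assumed toy shapes (not part of the diff):

```
source("nn/layers/l2_loss.dml") as l2_loss

pred = rand(rows=3, cols=2)
y = rand(rows=3, cols=2)
dpred = l2_loss::backward(pred, y)

# numerically check the gradient of cell (1,1)
h = 1e-5
pred_ph = pred
pred_ph[1,1] = pred[1,1] + h
pred_mh = pred
pred_mh[1,1] = pred[1,1] - h
loss_ph = l2_loss::forward(pred_ph, y)
loss_mh = l2_loss::forward(pred_mh, y)
dpred_num = (loss_ph - loss_mh) / (2*h)
print("analytical: " + as.scalar(dpred[1,1]) + " vs numerical: " + dpred_num)
```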
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/l2_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/l2_reg.dml b/scripts/nn/layers/l2_reg.dml
new file mode 100644
index 0000000..7255efe
--- /dev/null
+++ b/scripts/nn/layers/l2_reg.dml
@@ -0,0 +1,56 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * L2 regularization.
+ */
+
+forward = function(matrix[double] X, double lambda)
+ return (double reg_loss) {
+ /*
+ * Computes the forward pass for an L2 regularization function.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ * - lambda: Regularization strength.
+ * A typical value is 0.01.
+ *
+ * Outputs:
+ * - reg_loss: Total regularization loss.
+ */
+ reg_loss = 0.5 * lambda * sum(X^2)
+}
+
+backward = function(matrix[double] X, double lambda)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for an L2 regularization function.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ * - lambda: Regularization strength.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of same shape as `X`.
+ */
+ dX = lambda * X
+}
+
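Note that the backward pass, `dX = lambda * X`, is exactly classic weight decay. A sketch of an SGD step using it (the `dW_data` gradient is a hypothetical stand-in):

```
source("nn/layers/l2_reg.dml") as l2_reg

lambda = 0.01
lr = 0.1
W = rand(rows=10, cols=5, min=-1, max=1)
dW_data = rand(rows=10, cols=5)        # stand-in gradient from the data loss
dW_reg = l2_reg::backward(W, lambda)   # equals lambda * W
W = W - lr * (dW_data + dW_reg)
```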
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/log_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/log_loss.dml b/scripts/nn/layers/log_loss.dml
new file mode 100644
index 0000000..15914f7
--- /dev/null
+++ b/scripts/nn/layers/log_loss.dml
@@ -0,0 +1,76 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Log loss function.
+ */
+
+forward = function(matrix[double] pred, matrix[double] y)
+ return (double loss) {
+ /*
+ * Computes the forward pass for a log loss function.
+ *
+ * ```
+ * L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i)
+ * L = (1/N) sum(L_i) for i=1 to N
+ * ```
+ *
+ * In these equations, `L` is the total loss, `L_i` is the loss for
+ * example `i`, `y_i` is the binary target, `pred_i` is the predicted
+ * probability of the true class (i.e. `y=1`), and `N` is the number
+ * of examples.
+ *
+ * This can be interpreted as the negative log-likelihood assuming
+ * a Bernoulli distribution.
+ *
+ * Inputs:
+ * - pred: Predictions, of shape (N, 1).
+ * Predictions should be probabilities of the true
+ * class (i.e. probability of `y=1`).
+ * - y: Targets, of shape (N, 1).
+ * Targets should be binary in the set {0, 1}.
+ *
+ * Outputs:
+ * - loss: Average loss.
+ */
+ N = nrow(y)
+ losses = -y*log(pred) - (1-y)*log(1-pred)
+ loss = sum(losses) / N
+}
+
+backward = function(matrix[double] pred, matrix[double] y)
+ return (matrix[double] dpred) {
+ /*
+ * Computes the backward pass for a log loss function.
+ *
+ * Inputs:
+ * - pred: Predictions, of shape (N, 1).
+ * Predictions should be probabilities of the true
+ * class (i.e. probability of `y=1`).
+ * - y: Targets, of shape (N, 1).
+ * Targets should be binary in the set {0, 1}.
+ *
+ * Outputs:
+ * - dpred: Gradient wrt `pred`, of shape (N, 1).
+ */
+ N = nrow(y)
+ dpred = (1/N) * (pred-y) / (pred*(1-pred))
+}
+
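A binary-classification sketch pairing this loss with the `sigmoid` layer from the same package (toy values; not part of the diff):

```
source("nn/layers/log_loss.dml") as log_loss
source("nn/layers/sigmoid.dml") as sigmoid

scores = rand(rows=4, cols=1, min=-2, max=2)  # hypothetical unbounded scores
y = round(rand(rows=4, cols=1))               # binary targets in {0, 1}
probs = sigmoid::forward(scores)              # squash into (0, 1)
loss = log_loss::forward(probs, y)
dprobs = log_loss::backward(probs, y)
dscores = sigmoid::backward(dprobs, scores)   # chain rule back through the sigmoid
print("log loss: " + loss)
```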
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/lstm.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/lstm.dml b/scripts/nn/layers/lstm.dml
new file mode 100644
index 0000000..a75add4
--- /dev/null
+++ b/scripts/nn/layers/lstm.dml
@@ -0,0 +1,260 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * LSTM layer.
+ */
+source("nn/layers/sigmoid.dml") as sigmoid
+source("nn/layers/tanh.dml") as tanh
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
+ boolean return_sequences, matrix[double] out0, matrix[double] c0)
+ return (matrix[double] out, matrix[double] c,
+ matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog) {
+ /*
+ * Computes the forward pass for an LSTM layer with M neurons.
+ * The input data has N sequences of T timesteps, each with D features.
+ *
+ * In an LSTM, an internal cell state is maintained, additive
+ * interactions operate over the cell state at each timestep, and
+ * some amount of this cell state is exposed as output at each
+ * timestep. Additionally, the output of the previous timestep is fed
+ * back in as an additional input at the current timestep.
+ *
+ * Reference:
+ * - Long Short-Term Memory, Hochreiter & Schmidhuber, 1997
+ * - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, T*D).
+ * - W: Weights, of shape (D+M, 4M).
+ * - b: Biases, of shape (1, 4M).
+ * - T: Length of example sequences (number of timesteps).
+ * - D: Dimensionality of the input features (number of features).
+ * - return_sequences: Whether to return `out` at all timesteps,
+ * or just for the final timestep.
+ * - out0: Outputs from previous timestep, of shape (N, M).
+ * Note: This is *optional* and could just be an empty matrix.
+ * - c0: Initial cell state, of shape (N, M).
+ * Note: This is *optional* and could just be an empty matrix.
+ *
+ * Outputs:
+ * - out: If `return_sequences` is True, outputs for all timesteps,
+ * of shape (N, T*M). Else, outputs for the final timestep, of
+ * shape (N, M).
+ * - c: Cell state for final timestep, of shape (N, M).
+ * - cache_out: Cache of outputs, of shape (T, N*M).
+ * Note: This is used for performance during training.
+ * - cache_c: Cache of cell state, of shape (T, N*M).
+ * Note: This is used for performance during training.
+ * - cache_ifog: Cache of intermediate values, of shape (T, N*4M).
+ * Note: This is used for performance during training.
+ */
+ N = nrow(X)
+ M = as.integer(ncol(W)/4)
+ out_prev = out0
+ c_prev = c0
+ c = c_prev
+ if (return_sequences) {
+ out = matrix(0, rows=N, cols=T*M)
+ }
+ else {
+ out = matrix(0, rows=N, cols=M)
+ }
+ # caches to be used during the backward pass for performance
+ cache_out = matrix(0, rows=T, cols=N*M)
+ cache_c = matrix(0, rows=T, cols=N*M)
+ cache_ifog = matrix(0, rows=T, cols=N*4*M)
+
+ for (t in 1:T) { # each timestep
+ X_t = X[,(t-1)*D+1:t*D] # shape (N, D)
+ input = cbind(X_t, out_prev) # shape (N, D+M)
+ ifog = input %*% W + b # input, forget, output, and g gates; shape (N, 4M)
+ tmp = sigmoid::forward(ifog[,1:3*M]) # i,f,o gates squashed with sigmoid
+ ifog[,1:3*M] = tmp
+ tmp = tanh::forward(ifog[,3*M+1:4*M]) # g gate squashed with tanh
+ ifog[,3*M+1:4*M] = tmp
+ # c_t = f*prev_c + i*g
+ c = ifog[,M+1:2*M]*c_prev + ifog[,1:M]*ifog[,3*M+1:4*M] # shape (N, M)
+ # out_t = o*tanh(c)
+ tmp = tanh::forward(c)
+ out_t = ifog[,2*M+1:3*M] * tmp # shape (N, M)
+
+ # store
+ if (return_sequences) {
+ out[,(t-1)*M+1:t*M] = out_t
+ }
+ else {
+ out = out_t
+ }
+ out_prev = out_t
+ c_prev = c
+ cache_out[t,] = matrix(out_t, rows=1, cols=N*M) # reshape
+ cache_c[t,] = matrix(c, rows=1, cols=N*M) # reshape
+ cache_ifog[t,] = matrix(ifog, rows=1, cols=N*4*M) # reshape
+ }
+}
+
+backward = function(matrix[double] dout, matrix[double] dc,
+ matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
+ boolean given_sequences, matrix[double] out0, matrix[double] c0,
+ matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog)
+ return (matrix[double] dX, matrix[double] dW, matrix[double] db,
+ matrix[double] dout0, matrix[double] dc0) {
+ /*
+ * Computes the backward pass for an LSTM layer with M neurons.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out`. If `given_sequences` is `True`,
+ * contains gradients on outputs for all timesteps, of
+ * shape (N, T*M). Else, contains the gradient on the output
+ * for the final timestep, of shape (N, M).
+ * - dc: Gradient wrt `c` (from later in time), of shape (N, M).
+ * This would come from later in time if the cell state was used
+ * downstream as the initial cell state for another LSTM layer.
+ * Typically, this would be used when a sequence was cut at
+ * timestep `T` and then continued in the next batch. If `c`
+ * was not used downstream, then `dc` would be an empty matrix.
+ * - X: Inputs, of shape (N, T*D).
+ * - W: Weights, of shape (D+M, 4M).
+ * - b: Biases, of shape (1, 4M).
+ * - T: Length of example sequences (number of timesteps).
+ * - D: Dimensionality of the input features.
+ * - given_sequences: Whether `dout` is for all timesteps,
+ * or just for the final timestep. This is based on whether
+ * `return_sequences` was true in the forward pass.
+ * - out0: Outputs from previous timestep, of shape (N, M).
+ * Note: This is *optional* and could just be an empty matrix.
+ * - c0: Initial cell state, of shape (N, M).
+ * Note: This is *optional* and could just be an empty matrix.
+ * - cache_out: Cache of outputs, of shape (T, N*M).
+ * Note: This is used for performance during training.
+ * - cache_c: Cache of cell state, of shape (T, N*M).
+ * Note: This is used for performance during training.
+ * - cache_ifog: Cache of intermediate values, of shape (T, N*4*M).
+ * Note: This is used for performance during training.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, T*D).
+ * - dW: Gradient wrt `W`, of shape (D+M, 4M).
+ * - db: Gradient wrt `b`, of shape (1, 4M).
+ * - dout0: Gradient wrt `out0`, of shape (N, M).
+ * - dc0: Gradient wrt `c0`, of shape (N, M).
+ */
+ N = nrow(X)
+ M = as.integer(ncol(W)/4)
+ dX = matrix(0, rows=N, cols=T*D)
+ dW = matrix(0, rows=D+M, cols=4*M)
+ db = matrix(0, rows=1, cols=4*M)
+ dout0 = matrix(0, rows=N, cols=M)
+ dc0 = matrix(0, rows=N, cols=M)
+ dct = dc
+ if (!given_sequences) {
+ # only given dout for output at final timestep, so prepend empty douts for all other timesteps
+ dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout) # shape (N, T*M)
+ }
+
+ t = T
+ for (iter in 1:T) { # each timestep in reverse order
+ X_t = X[,(t-1)*D+1:t*D] # shape (N, D)
+ dout_t = dout[,(t-1)*M+1:t*M] # shape (N, M)
+ out_t = matrix(cache_out[t,], rows=N, cols=M) # shape (N, M)
+ ct = matrix(cache_c[t,], rows=N, cols=M) # shape (N, M)
+ if (t == 1) {
+ out_prev = out0 # shape (N, M)
+ c_prev = c0 # shape (N, M)
+ }
+ else {
+ out_prev = matrix(cache_out[t-1,], rows=N, cols=M) # shape (N, M)
+ c_prev = matrix(cache_c[t-1,], rows=N, cols=M) # shape (N, M)
+ }
+ input = cbind(X_t, out_prev) # shape (N, D+M)
+ ifog = matrix(cache_ifog[t,], rows=N, cols=4*M)
+ i = ifog[,1:M] # input gate, shape (N, M)
+ f = ifog[,M+1:2*M] # forget gate, shape (N, M)
+ o = ifog[,2*M+1:3*M] # output gate, shape (N, M)
+ g = ifog[,3*M+1:4*M] # g gate, shape (N, M)
+
+ tmp = tanh::backward(dout_t, ct)
+ dct = dct + o*tmp # shape (N, M)
+ tmp = tanh::forward(ct)
+ do = tmp * dout_t # output gate, shape (N, M)
+ df = c_prev * dct # forget gate, shape (N, M)
+ dc_prev = f * dct # shape (N, M)
+ di = g * dct # input gate, shape (N, M)
+ dg = i * dct # g gate, shape (N, M)
+
+ di_raw = i * (1-i) * di
+ df_raw = f * (1-f) * df
+ do_raw = o * (1-o) * do
+ dg_raw = (1-g^2) * dg
+ difog_raw = cbind(di_raw, cbind(df_raw, cbind(do_raw, dg_raw))) # shape (N, 4M)
+
+ dW = dW + t(input) %*% difog_raw # shape (D+M, 4M)
+ db = db + colSums(difog_raw) # shape (1, 4M)
+ dinput = difog_raw %*% t(W) # shape (N, D+M)
+ dX[,(t-1)*D+1:t*D] = dinput[,1:D]
+ dout_prev = dinput[,D+1:D+M] # shape (N, M)
+ if (t == 1) {
+ dout0 = dout_prev # shape (N, M)
+ dc0 = dc_prev # shape (N, M)
+ }
+ else {
+ dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev # shape (N, M)
+ dct = dc_prev # shape (N, M)
+ }
+ t = t - 1
+ }
+}
+
+init = function(int N, int D, int M)
+ return (matrix[double] W, matrix[double] b, matrix[double] out0, matrix[double] c0) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * We use the Glorot uniform heuristic which limits the magnification
+ * of inputs/gradients during forward/backward passes by scaling
+ * uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
+ * - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
+ *
+ * Inputs:
+ * - N: Number of examples in batch.
+ * - D: Dimensionality of the input features (number of features).
+ * - M: Number of neurons in this layer.
+ *
+ * Outputs:
+ * - W: Weights, of shape (D+M, 4M).
+ * - b: Biases, of shape (1, 4M).
+ * - out0: Empty previous timestep output matrix, of shape (N, M).
+ * - c0: Empty initial cell state matrix, of shape (N, M).
+ */
+ fan_in = D+M
+ fan_out = 4*M
+ scale = sqrt(6/(fan_in+fan_out))
+ W = rand(rows=D+M, cols=4*M, min=-scale, max=scale, pdf="uniform")
+ b = matrix(0, rows=1, cols=4*M)
+ out0 = matrix(0, rows=N, cols=M)
+ c0 = matrix(0, rows=N, cols=M)
+}
+
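A minimal forward-pass sketch with toy dimensions (illustrative only; `init` supplies zero-valued `out0` and `c0`, so the optional inputs are well-formed):

```
source("nn/layers/lstm.dml") as lstm

N = 2
T = 3
D = 4
M = 5
X = rand(rows=N, cols=T*D)
[W, b, out0, c0] = lstm::init(N, D, M)
[out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, FALSE, out0, c0)
print("final-timestep output: " + nrow(out) + " x " + ncol(out))  # (N, M)
```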
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/max_pool2d.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/max_pool2d.dml b/scripts/nn/layers/max_pool2d.dml
new file mode 100644
index 0000000..fba1a4c
--- /dev/null
+++ b/scripts/nn/layers/max_pool2d.dml
@@ -0,0 +1,159 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Max Pooling layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] out, int Hout, int Wout) {
+ /*
+ * Computes the forward pass for a 2D spatial max pooling layer.
+ * The input data has N examples, each represented as a 3D volume
+ * unrolled into a single vector.
+ *
+ * This implementation uses `im2col` internally for each image to
+ * extract local image regions (patches) of each channel slice into
+ * columns, and then performs max pooling over the patches to compute
+ * the output maps.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * A typical value is 0.
+ * - padw: Padding for left and right sides.
+ * A typical value is 0.
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, C*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ */
+ N = nrow(X)
+ Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+ Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+ pad_value = -1/0 # in max pooling we pad with -infinity
+
+ # Create output volume
+ out = matrix(0, rows=N, cols=C*Hout*Wout)
+
+ # Max pooling - im2col implementation
+ parfor (n in 1:N) { # all examples
+ img = matrix(X[n,], rows=C, cols=Hin*Win) # reshape
+
+ if (padh > 0 | padw > 0) {
+ # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
+ img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
+ }
+
+ img_maxes = matrix(0, rows=C, cols=Hout*Wout) # zeros
+ parfor (c in 1:C) { # all channels
+ # Extract local image slice patches into columns with im2col, of shape (Hf*Wf, Hout*Wout)
+ img_slice_cols = util::im2col(img[c,], Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
+
+ # Max pooling on patches
+ img_maxes[c,] = colMaxs(img_slice_cols)
+ }
+
+ out[n,] = matrix(img_maxes, rows=1, cols=C*Hout*Wout)
+ }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
+ int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for a 2D spatial max pooling layer.
+ * The input data has N examples, each represented as a 3D volume
+ * unrolled into a single vector.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of
+ * shape (N, C*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ * - X: Input data matrix, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * A typical value is 0.
+ * - padw: Padding for left and right sides.
+ * A typical value is 0.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ */
+ N = nrow(X)
+ pad_value = -1/0 # in max pooling we pad with -infinity
+
+ # Create gradient volume
+ dX = matrix(0, rows=N, cols=C*Hin*Win)
+
+ # Gradient of max pooling
+ parfor (n in 1:N, check=0) { # all examples
+ img = matrix(X[n,], rows=C, cols=Hin*Win)
+ if (padh > 0 | padw > 0) {
+ # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
+ img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
+ }
+
+ dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
+ parfor (c in 1:C, check=0) { # all channels
+ img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
+ dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
+ for (hout in 1:Hout, check=0) { # all output rows
+ hin = (hout-1)*strideh + 1
+ for (wout in 1:Wout) { # all output columns
+ win = (wout-1)*stridew + 1
+ img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
+ max_val_ind = img_slice_patch == max(img_slice_patch) # max value indicator matrix
+ # gradient passes through only for the max value(s) in this patch
+ dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
+ dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
+ + dimg_slice_patch
+ }
+ }
+ dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
+ }
+
+ if (padh > 0 | padw > 0) {
+ # Unpad image gradient
+ dimg = util::unpad_image(dimg, Hin, Win, padh, padw) # shape (C, Hin*Win)
+ }
+ dX[n,] = matrix(dimg, rows=1, cols=C*Hin*Win)
+ }
+}
+
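To make the output-shape formula concrete: with 2x2 pooling, stride 2, and no padding on 28x28 inputs, Hout = floor((28 + 0 - 2)/2 + 1) = 14. A sketch with toy data (not part of the diff):

```
source("nn/layers/max_pool2d.dml") as max_pool2d

N = 2
C = 1
Hin = 28
Win = 28
X = rand(rows=N, cols=C*Hin*Win)
[out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
print("Hout: " + Hout + ", Wout: " + Wout)  # 14, 14
```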
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/max_pool2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/max_pool2d_builtin.dml b/scripts/nn/layers/max_pool2d_builtin.dml
new file mode 100644
index 0000000..880f818
--- /dev/null
+++ b/scripts/nn/layers/max_pool2d_builtin.dml
@@ -0,0 +1,103 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Max Pooling layer.
+ *
+ * This implementation uses a built-in operator for higher performance.
+ */
+
+forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] out, int Hout, int Wout) {
+ /*
+ * Computes the forward pass for a 2D spatial max pooling layer.
+ * The input data has N examples, each represented as a 3D volume
+ * unrolled into a single vector.
+ *
+ * This implementation uses a built-in operator for higher
+ * performance.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * A typical value is 0.
+ * - padw: Padding for left and right sides.
+ * A typical value is 0.
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, C*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ */
+ N = nrow(X)
+ Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+ Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+
+ # Max pooling - built-in implementation
+ out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
+ stride=[strideh,stridew], padding=[padh,padw])
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
+ int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for a 2D spatial max pooling layer.
+ * The input data has N examples, each represented as a 3D volume
+ * unrolled into a single vector.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of
+ * shape (N, C*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * A typical value is 0.
+ * - padw: Padding for left and right sides.
+ * A typical value is 0.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ */
+ N = nrow(X)
+
+ # Gradient of max pooling
+ dX = max_pool_backward(X, dout, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
+ stride=[strideh,stridew], padding=[padh,padw])
+}
+
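Since this file and `max_pool2d.dml` implement the same forward semantics, a quick equivalence check is possible. A sketch (toy shapes; note the two backward passes may differ on tied maxima within a patch):

```
source("nn/layers/max_pool2d.dml") as max_pool2d
source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin

N = 2
C = 3
Hin = 8
Win = 8
X = rand(rows=N, cols=C*Hin*Win)
[out1, Hout1, Wout1] = max_pool2d::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
[out2, Hout2, Wout2] = max_pool2d_builtin::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
print("max abs difference: " + max(abs(out1 - out2)))  # expected 0
```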
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/relu.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/relu.dml b/scripts/nn/layers/relu.dml
new file mode 100644
index 0000000..93a6e90
--- /dev/null
+++ b/scripts/nn/layers/relu.dml
@@ -0,0 +1,59 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Rectified Linear Unit (ReLU) nonlinearity layer.
+ */
+
+forward = function(matrix[double] X)
+ return (matrix[double] out) {
+ /*
+ * Computes the forward pass for a ReLU nonlinearity layer.
+ *
+ * Performs an element-wise evaluation of `f(input) = max(0, input)`.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ *
+ * Outputs:
+ * - out: Outputs, of same shape as `X`.
+ */
+ out = max(X, 0)
+}
+
+backward = function(matrix[double] dout, matrix[double] X)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for a ReLU nonlinearity layer.
+ *
+ * Essentially performs a pass-through of the upstream gradient
+ * for cells > 0.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+ * - X: Previous input data matrix, of shape (any, any).
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of same shape as `X`.
+ */
+ dX = (X > 0) * dout
+}
+
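A tiny worked example of the pass-through behavior (illustrative only):

```
source("nn/layers/relu.dml") as relu

X = matrix("-1 2 -3 4", rows=2, cols=2)
out = relu::forward(X)           # [[0, 2], [0, 4]]
dout = matrix(1, rows=2, cols=2)
dX = relu::backward(dout, X)     # gradient survives only where X > 0
print(toString(dX))              # [[0, 1], [0, 1]]
```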
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/rnn.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/rnn.dml b/scripts/nn/layers/rnn.dml
new file mode 100644
index 0000000..3c6faae
--- /dev/null
+++ b/scripts/nn/layers/rnn.dml
@@ -0,0 +1,183 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Simple (Vanilla) RNN layer.
+ */
+source("nn/layers/tanh.dml") as tanh
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
+ boolean return_sequences, matrix[double] out0)
+ return (matrix[double] out, matrix[double] cache_out) {
+ /*
+ * Computes the forward pass for a simple RNN layer with M neurons.
+ * The input data has N sequences of T timesteps, each with D features.
+ *
+ * In a simple RNN, the output of the previous timestep is fed back
+ * in as an additional input at the current timestep.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, T*D).
+ * - W: Weights, of shape (D+M, M).
+ * - b: Biases, of shape (1, M).
+ * - T: Length of example sequences (number of timesteps).
+ * - D: Dimensionality of the input features (number of features).
+ * - return_sequences: Whether to return `out` at all timesteps,
+ * or just for the final timestep.
+ * - out0: Output matrix from previous timestep, of shape (N, M).
+ * Note: This is *optional* and could just be an empty matrix.
+ *
+ * Outputs:
+ * - out: If `return_sequences` is True, outputs for all timesteps,
+ * of shape (N, T*M). Else, outputs for the final timestep, of
+ * shape (N, M).
+ * - cache_out: Cache of outputs, of shape (T, N*M).
+ * Note: This is used for performance during training.
+ */
+ N = nrow(X)
+ M = ncol(W)
+ out_prev = out0
+ if (return_sequences) {
+ out = matrix(0, rows=N, cols=T*M)
+ }
+ else {
+ out = matrix(0, rows=N, cols=M)
+ }
+ # caches to be used during the backward pass for performance
+ cache_out = matrix(0, rows=T, cols=N*M)
+
+ for (t in 1:T) { # each timestep
+ X_t = X[,(t-1)*D+1:t*D] # shape (N, D)
+ input = cbind(X_t, out_prev) # shape (N, D+M)
+ out_t = tanh::forward(input %*% W + b) # shape (N, M)
+ # store
+ if (return_sequences) {
+ out[,(t-1)*M+1:t*M] = out_t
+ }
+ else {
+ out = out_t
+ }
+ out_prev = out_t
+ cache_out[t,] = matrix(out_t, rows=1, cols=N*M) # reshape
+ }
+}
+
+backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, matrix[double] b,
+ int T, int D, boolean given_sequences, matrix[double] out0,
+ matrix[double] cache_out)
+ return (matrix[double] dX, matrix[double] dW, matrix[double] db, matrix[double] dout0) {
+ /*
+ * Computes the backward pass for a simple RNN layer with M neurons.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream. If `given_sequences`
+ * is True, contains gradients on outputs for all timesteps,
+ * of shape (N, T*M). Else, contains gradient on output for
+ * the final timestep, of shape (N, M).
+ * - X: Inputs, of shape (N, T*D).
+ * - W: Weights, of shape (D+M, M).
+ * - b: Biases, of shape (1, M).
+ * - T: Length of example sequences (number of timesteps).
+ * - D: Dimensionality of the input features (number of features).
+ * - given_sequences: Whether `dout` is for all timesteps,
+ * or just for the final timestep. This is based on whether
+ * `return_sequences` was true in the forward pass.
+ * - out0: Output matrix from previous timestep, of shape (N, M).
+ * Note: This is *optional* and could just be an empty matrix.
+ * - cache_out: Cache of outputs, of shape (T, N*M).
+ * Note: This is used for performance during training.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, T*D).
+ * - dW: Gradient wrt `W`, of shape (D+M, M).
+ * - db: Gradient wrt `b`, of shape (1, M).
+ * - dout0: Gradient wrt `out0`, of shape (N, M).
+ */
+ N = nrow(X)
+ M = ncol(W)
+ dX = matrix(0, rows=N, cols=T*D)
+ dW = matrix(0, rows=D+M, cols=M)
+ db = matrix(0, rows=1, cols=M)
+ dout0 = matrix(0, rows=N, cols=M)
+ if (!given_sequences) {
+ # only given dout for output at final timestep, so prepend empty douts for all other timesteps
+ dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout) # shape (N, T*M)
+ }
+
+ t = T
+ for (iter in 1:T) { # each timestep in reverse order
+ X_t = X[,(t-1)*D+1:t*D] # shape (N, D)
+ dout_t = dout[,(t-1)*M+1:t*M] # shape (N, M)
+ out_t = matrix(cache_out[t,], rows=N, cols=M) # shape (N, M)
+ if (t == 1) {
+ out_prev = out0 # shape (N, M)
+ }
+ else {
+ out_prev = matrix(cache_out[t-1,], rows=N, cols=M) # shape (N, M)
+ }
+ input = cbind(X_t, out_prev) # shape (N, D+M)
+ dout_t_raw = (1-out_t^2) * dout_t # into tanh, shape (N, M)
+ dW = dW + t(input) %*% dout_t_raw # shape (D+M, M)
+ db = db + colSums(dout_t_raw) # shape (1, M)
+ dinput = dout_t_raw %*% t(W) # shape (N, D+M)
+ dX[,(t-1)*D+1:t*D] = dinput[,1:D]
+ dout_prev = dinput[,D+1:D+M] # shape (N, M)
+ if (t == 1) {
+ dout0 = dout_prev # shape (N, M)
+ }
+ else {
+ dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev # shape (N, M)
+ }
+ t = t - 1
+ }
+}
+
+init = function(int N, int D, int M)
+ return (matrix[double] W, matrix[double] b, matrix[double] out0) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * We use the Glorot uniform heuristic which limits the magnification
+ * of inputs/gradients during forward/backward passes by scaling
+ * uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
+ * - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
+ *
+ * Inputs:
+ * - N: Number of examples in batch.
+ * - D: Dimensionality of the input features (number of features).
+ * - M: Number of neurons in this layer.
+ *
+ * Outputs:
+ * - W: Weights, of shape (D+M, M).
+ * - b: Biases, of shape (1, M).
+ * - out0: Empty previous timestep output matrix, of shape (N, M).
+ */
+ fan_in = D+M
+ fan_out = M
+ scale = sqrt(6/(fan_in+fan_out))
+ W = rand(rows=D+M, cols=M, min=-scale, max=scale, pdf="uniform")
+ b = matrix(0, rows=1, cols=M)
+ out0 = matrix(0, rows=N, cols=M)
+}
+
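A minimal forward-pass sketch with toy dimensions, returning outputs for every timestep (illustrative; not part of the diff):

```
source("nn/layers/rnn.dml") as rnn

N = 2
T = 3
D = 4
M = 5
X = rand(rows=N, cols=T*D)
[W, b, out0] = rnn::init(N, D, M)
[out, cache_out] = rnn::forward(X, W, b, T, D, TRUE, out0)
print("all-timestep outputs: " + nrow(out) + " x " + ncol(out))  # (N, T*M)
```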
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/scale_shift1d.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/scale_shift1d.dml b/scripts/nn/layers/scale_shift1d.dml
new file mode 100644
index 0000000..7e162a3
--- /dev/null
+++ b/scripts/nn/layers/scale_shift1d.dml
@@ -0,0 +1,95 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 1D Scale & Shift layer.
+ */
+
+forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta)
+ return (matrix[double] out) {
+ /*
+ * Computes the forward pass for a 1D scale & shift layer. The input
+ * data has N examples, each with D features.
+ *
+ * A 1D scale & shift layer introduces learnable parameters
+ * (gamma, beta) to scale and shift the input on a per-feature basis.
+ *
+ * `y = x*gamma + beta`
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, D).
+ * - gamma: Scale parameters, of shape (1, D).
+ * - beta: Shift parameters, of shape (1, D).
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, D).
+ */
+ # Scale and shift
+ out = X*gamma + beta # shape (N, D)
+}
+
+backward = function(matrix[double] dout, matrix[double] out,
+ matrix[double] X, matrix[double] gamma, matrix[double] beta)
+ return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
+ /*
+ * Computes the backward pass for a 1D scale & shift layer.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of shape (N, D).
+ * - out: Outputs from the forward pass, of shape (N, D).
+ * - X: Inputs, of shape (N, D).
+ * - gamma: Scale parameters, of shape (1, D).
+ * - beta: Shift parameters, of shape (1, D).
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, D).
+ * - dgamma: Gradient wrt `gamma`, of shape (1, D).
+ * - dbeta: Gradient wrt `beta`, of shape (1, D).
+ */
+ # Compute gradients during training
+ dgamma = colSums(dout*X) # shape (1, D)
+ dbeta = colSums(dout) # shape (1, D)
+ dX = dout * gamma # shape (N, D)
+}
+
+init = function(int D)
+ return (matrix[double] gamma, matrix[double] beta) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * By default, we initialize to an identity function, with a scale
+ * filler of `1`, and a shift filler of `0`.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - D: Dimensionality of the input features (number of features).
+ *
+ * Outputs:
+ * - gamma: Scale parameters, of shape (1, D).
+ * - beta: Shift parameters, of shape (1, D).
+ */
+ gamma = matrix(1, rows=1, cols=D)
+ beta = matrix(0, rows=1, cols=D)
+}
+
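Because `init` sets gamma to 1 and beta to 0, the layer starts as an identity function. A sketch with toy shapes (not part of the diff):

```
source("nn/layers/scale_shift1d.dml") as scale_shift1d

N = 4
D = 3
X = rand(rows=N, cols=D)
[gamma, beta] = scale_shift1d::init(D)
out = scale_shift1d::forward(X, gamma, beta)  # equals X at initialization
dout = rand(rows=N, cols=D)
[dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta)
```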
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/scale_shift2d.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/scale_shift2d.dml b/scripts/nn/layers/scale_shift2d.dml
new file mode 100644
index 0000000..79c884a
--- /dev/null
+++ b/scripts/nn/layers/scale_shift2d.dml
@@ -0,0 +1,107 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Scale & Shift layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
+ int C, int Hin, int Win)
+ return (matrix[double] out) {
+ /*
+ * Computes the forward pass for a 2D scale & shift layer. The input
+ * data has N examples, each represented as a 3D volume unrolled into
+ * a single vector.
+ *
+ * A 2D scale & shift layer introduces learnable parameters
+ * (gamma, beta) to scale and shift the input on a per-channel basis.
+ *
+ * `y = x*gamma + beta`
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, C*Hin*Win).
+ */
+ # Scale and shift
+ scaled = bias_multiply(X, gamma) # shape (N, C*Hin*Win)
+ out = bias_add(scaled, beta) # shape (N, C*Hin*Win)
+}
+
+backward = function(matrix[double] dout, matrix[double] out,
+ matrix[double] X, matrix[double] gamma, matrix[double] beta,
+ int C, int Hin, int Win)
+ return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
+ /*
+ * Computes the backward pass for a 2D scale & shift layer.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
+ * - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
+ * - X: Input data matrix to the forward pass, of
+ * shape (N, C*Hin*Win).
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ * - dgamma: Gradient wrt `gamma`, of shape (C, 1).
+ * - dbeta: Gradient wrt `beta`, of shape (C, 1).
+ */
+ # Compute gradients during training
+ dgamma = util::channel_sums(dout*X, C, Hin, Win) # shape (C, 1)
+ dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1)
+ dX = bias_multiply(dout, gamma) # shape (N, C*Hin*Win)
+}
+
+init = function(int C)
+ return (matrix[double] gamma, matrix[double] beta) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * By default, we initialize to an identity function, with a scale
+ * filler of `1`, and a shift filler of `0`.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - C: Number of input channels (dimensionality of input depth).
+ *
+ * Outputs:
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ */
+ gamma = matrix(1, rows=C, cols=1)
+ beta = matrix(0, rows=C, cols=1)
+}
+
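The 2D variant applies one (gamma, beta) pair per channel rather than per feature. A sketch under assumed toy dimensions:

```
source("nn/layers/scale_shift2d.dml") as scale_shift2d

N = 2
C = 3
Hin = 4
Win = 4
X = rand(rows=N, cols=C*Hin*Win)
[gamma, beta] = scale_shift2d::init(C)
out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)  # identity at init
```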
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/sigmoid.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/sigmoid.dml b/scripts/nn/layers/sigmoid.dml
new file mode 100644
index 0000000..2d85adc
--- /dev/null
+++ b/scripts/nn/layers/sigmoid.dml
@@ -0,0 +1,62 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Sigmoid nonlinearity layer.
+ */
+
+forward = function(matrix[double] X)
+ return (matrix[double] out) {
+ /*
+ * Computes the forward pass for a sigmoid nonlinearity layer.
+ *
+ * `sigmoid(x) = 1 / (1 + e^-x)`
+ *
+ * If `X` contains a single feature column, the output of a sigmoid
+ * layer can be interpreted as the predicted probability of the true
+ * class when paired with a log loss function in a binary
+ * classification problem.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ *
+ * Outputs:
+ * - out: Outputs, of same shape as `X`.
+ */
+ out = 1 / (1+exp(-X))
+}
+
+backward = function(matrix[double] dout, matrix[double] X)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for a sigmoid nonlinearity layer.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+ * - X: Inputs, of shape (any, any).
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of same shape as `X`.
+ */
+ out = 1 / (1+exp(-X))
+ dX = out * (1-out) * dout
+}
+
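A short sketch of the gradient identity used in `backward` (dX = out*(1-out)*dout, which peaks at 0.25 where out = 0.5); toy values only:

```
source("nn/layers/sigmoid.dml") as sigmoid

X = rand(rows=2, cols=3, min=-3, max=3)
out = sigmoid::forward(X)
dout = matrix(1, rows=2, cols=3)
dX = sigmoid::backward(dout, X)                # elementwise out*(1-out)
print("max elementwise gradient: " + max(dX))  # at most 0.25
```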