Posted to commits@systemml.apache.org by du...@apache.org on 2017/04/01 01:42:35 UTC
[2/7] incubator-systemml git commit: [SYSTEMML-1452] General code cleanup of SystemML-NN
[SYSTEMML-1452] General code cleanup of SystemML-NN
This commit performs a general code & documentation cleanup across the
library.
Closes #447.
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/16b1cbd7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/16b1cbd7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/16b1cbd7
Branch: refs/heads/master
Commit: 16b1cbd72601afbed0b19c1d4125a898fd324b1c
Parents: 2e48d95
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Fri Mar 31 18:38:15 2017 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Fri Mar 31 18:38:16 2017 -0700
----------------------------------------------------------------------
projects/breast_cancer/hyperparam_tuning.dml | 8 +-
projects/breast_cancer/softmax_clf.dml | 16 +--
.../staging/SystemML-NN/nn/layers/affine.dml | 36 ++++---
.../SystemML-NN/nn/layers/batch_norm.dml | 17 +--
scripts/staging/SystemML-NN/nn/layers/conv.dml | 50 ++++-----
.../SystemML-NN/nn/layers/conv_builtin.dml | 63 ++++++-----
.../nn/layers/cross_entropy_loss.dml | 29 +++--
.../staging/SystemML-NN/nn/layers/dropout.dml | 23 ++--
.../staging/SystemML-NN/nn/layers/l1_loss.dml | 29 +++--
.../staging/SystemML-NN/nn/layers/l1_reg.dml | 15 +--
.../staging/SystemML-NN/nn/layers/l2_loss.dml | 29 +++--
.../staging/SystemML-NN/nn/layers/l2_reg.dml | 15 +--
.../staging/SystemML-NN/nn/layers/log_loss.dml | 40 ++++---
scripts/staging/SystemML-NN/nn/layers/lstm.dml | 65 ++++++------
.../staging/SystemML-NN/nn/layers/max_pool.dml | 15 +--
.../SystemML-NN/nn/layers/max_pool_builtin.dml | 14 +--
scripts/staging/SystemML-NN/nn/layers/relu.dml | 22 ++--
scripts/staging/SystemML-NN/nn/layers/rnn.dml | 43 ++++----
.../staging/SystemML-NN/nn/layers/sigmoid.dml | 30 ++++--
.../staging/SystemML-NN/nn/layers/softmax.dml | 29 ++---
.../nn/layers/spatial_batch_norm.dml | 12 +--
scripts/staging/SystemML-NN/nn/layers/tanh.dml | 28 ++---
.../staging/SystemML-NN/nn/optim/adagrad.dml | 22 ++--
scripts/staging/SystemML-NN/nn/optim/adam.dml | 38 +++----
.../staging/SystemML-NN/nn/optim/rmsprop.dml | 24 +++--
scripts/staging/SystemML-NN/nn/optim/sgd.dml | 12 ++-
.../SystemML-NN/nn/optim/sgd_momentum.dml | 24 +++--
.../SystemML-NN/nn/optim/sgd_nesterov.dml | 23 ++--
.../staging/SystemML-NN/nn/test/conv_simple.dml | 51 ++++-----
.../staging/SystemML-NN/nn/test/grad_check.dml | 106 ++++++++++---------
.../SystemML-NN/nn/test/max_pool_simple.dml | 18 ++--
scripts/staging/SystemML-NN/nn/util.dml | 46 ++++----
32 files changed, 549 insertions(+), 443 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/projects/breast_cancer/hyperparam_tuning.dml
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/hyperparam_tuning.dml b/projects/breast_cancer/hyperparam_tuning.dml
index 464c659..4f054c3 100644
--- a/projects/breast_cancer/hyperparam_tuning.dml
+++ b/projects/breast_cancer/hyperparam_tuning.dml
@@ -7,9 +7,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -66,7 +66,9 @@ parfor(j in 1:10000) {
log_interval = 10
# Train
- [Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2] = clf::train(X, Y, X_val, Y_val, C, Hin, Win, lr, mu, decay, lambda, batch_size, epochs, log_interval, dir)
+ [Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2] =
+ clf::train(X, Y, X_val, Y_val, C, Hin, Win, lr, mu, decay, lambda, batch_size, epochs,
+ log_interval, dir)
# Eval
#probs = clf::predict(X, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/projects/breast_cancer/softmax_clf.dml
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/softmax_clf.dml b/projects/breast_cancer/softmax_clf.dml
index e106a36..35fd545 100644
--- a/projects/breast_cancer/softmax_clf.dml
+++ b/projects/breast_cancer/softmax_clf.dml
@@ -7,9 +7,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -79,7 +79,7 @@ train = function(matrix[double] X, matrix[double] Y,
accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
# Output results
print("Start: Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
-
+
# Optimize
print("Starting optimization")
iters = ceil(N / batch_size)
@@ -152,7 +152,7 @@ predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
*/
N = nrow(X) # num examples
K = ncol(W) # num classes
-
+
# Compute forward pass
## affine & softmax:
out = affine::forward(X, W, b)
@@ -185,7 +185,7 @@ eval = function(matrix[double] probs, matrix[double] Y)
generate_dummy_data = function()
return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) {
/*
- * Generate a dummy dataset similar to the MNIST dataset.
+ * Generate a dummy dataset similar to the breast cancer dataset.
*
* Outputs:
* - X: Input data matrix, of shape (N, D).
@@ -196,9 +196,9 @@ generate_dummy_data = function()
*/
# Generate dummy input data
N = 1024 # num examples
- C = 1 # num input channels
- Hin = 28 # input height
- Win = 28 # input width
+ C = 3 # num input channels
+ Hin = 256 # input height
+ Win = 256 # input width
T = 10 # num targets
X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/affine.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/affine.dml b/scripts/staging/SystemML-NN/nn/layers/affine.dml
index 6a4c210..f9f8559 100644
--- a/scripts/staging/SystemML-NN/nn/layers/affine.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/affine.dml
@@ -22,6 +22,7 @@
/*
* Fully-connected (affine) layer.
*/
+
forward = function(matrix[double] X, matrix[double] W, matrix[double] b)
return (matrix[double] out) {
/*
@@ -29,9 +30,9 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b)
* M neurons. The input data has N examples, each with D features.
*
* Inputs:
- * - X: Input data matrix, of shape (N, D).
- * - W: Weights (parameters) matrix, of shape (D, M).
- * - b: Biases vector, of shape (1, M).
+ * - X: Inputs, of shape (N, D).
+ * - W: Weights, of shape (D, M).
+ * - b: Biases, of shape (1, M).
*
* Outputs:
* - out: Outputs, of shape (N, M).
@@ -47,15 +48,15 @@ backward = function(matrix[double] dout, matrix[double] X,
* with M neurons.
*
* Inputs:
- * - dout: Derivatives from upstream, of shape (N, M).
- * - X: Previous input data matrix, of shape (N, D).
- * - W: Weights (parameters) matrix, of shape (D, M).
- * - b: Biases vector, of shape (1, M).
+ * - dout: Gradient wrt `out` from upstream, of shape (N, M).
+ * - X: Inputs, of shape (N, D).
+ * - W: Weights, of shape (D, M).
+ * - b: Biases, of shape (1, M).
*
* Outputs:
- * - dX: Gradient wrt X, of shape (N, D).
- * - dW: Gradient wrt W, of shape (D, M).
- * - db: Gradient wrt b, of shape (1, M).
+ * - dX: Gradient wrt `X`, of shape (N, D).
+ * - dW: Gradient wrt `W`, of shape (D, M).
+ * - db: Gradient wrt `b`, of shape (1, M).
*/
dX = dout %*% t(W)
dW = t(X) %*% dout
@@ -70,18 +71,19 @@ init = function(int D, int M)
* Note: This is just a convenience function, and parameters
* may be initialized manually if needed.
*
- * We use the heuristic by He et al. [http://arxiv.org/abs/1502.01852],
- * which limits the magnification of inputs/gradients during
- * forward/backward passes by scaling unit-Gaussian weights by a
- * factor of sqrt(2/n), under the assumption of relu neurons.
+ * We use the heuristic by He et al., which limits the magnification
+ * of inputs/gradients during forward/backward passes by scaling
+ * unit-Gaussian weights by a factor of sqrt(2/n), under the
+ * assumption of relu neurons.
+ * - http://arxiv.org/abs/1502.01852
*
* Inputs:
- * - D: Dimensionality of the input features.
+ * - D: Dimensionality of the input features (number of features).
* - M: Number of neurons in this layer.
*
* Outputs:
- * - W: Weight matrix, of shape (D, M).
- * - b: Biases vector, of shape (1, M).
+ * - W: Weights, of shape (D, M).
+ * - b: Biases, of shape (1, M).
*/
W = rand(rows=D, cols=M, pdf="normal") * sqrt(2.0/D)
b = matrix(0, rows=1, cols=M)
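
As a quick orientation for the cleaned-up docstrings above, a minimal end-to-end sketch of the affine API in DML (a sketch only: the `source(...) as` import path assumes the script runs from the SystemML-NN root, and `dout` is a stand-in for a real upstream gradient):

```
source("nn/layers/affine.dml") as affine

N = 4  # examples
D = 8  # input features
M = 3  # neurons
X = rand(rows=N, cols=D, pdf="normal")
[W, b] = affine::init(D, M)                # He-scaled W: (D, M), zero b: (1, M)
out = affine::forward(X, W, b)             # (N, M)
dout = rand(rows=N, cols=M, pdf="normal")  # stand-in gradient wrt `out`
[dX, dW, db] = affine::backward(dout, X, W, b)
```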
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
index d332e8c..82240f7 100644
--- a/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
@@ -22,6 +22,7 @@
/*
* Batch normalization layer.
*/
+
forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
string mode, matrix[double] ema_mean, matrix[double] ema_var,
double mu, double epsilon)
@@ -36,7 +37,7 @@ forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
* introduces learnable parameters (gamma, beta) to control the
* amount of normalization.
*
- * y = ((x-mean) / sqrt(var+eps)) * gamma + beta
+ * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
*
* This implementation maintains exponential moving averages of the
* mean and variance during training for use during testing.
@@ -47,7 +48,7 @@ forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
* - https://arxiv.org/abs/1502.03167
*
* Inputs:
- * - X: Input data matrix, of shape (N, D).
+ * - X: Inputs, of shape (N, D).
* - gamma: Scale parameters, of shape (1, D).
* - beta: Shift parameters, of shape (1, D).
* - mode: 'train' or 'test' to indicate if the model is currently
@@ -118,7 +119,7 @@ backward = function(matrix[double] dout, matrix[double] out,
* Computes the backward pass for a batch normalization layer.
*
* Inputs:
- * - dout: Derivatives from upstream, of shape (N, D).
+ * - dout: Gradient wrt `out` from upstream, of shape (N, D).
* - out: Outputs from the forward pass, of shape (N, D).
* - ema_mean_upd: Updated exponential moving average of the mean
* from the forward pass, of shape (1, D).
@@ -133,7 +134,7 @@ backward = function(matrix[double] dout, matrix[double] out,
* - cache_norm: Cache of the normalized inputs from the forward
* pass, of shape (N, D). Note: This is used for performance
* during training.
- * - X: Input data matrix to the forward pass, of shape (N, D).
+ * - X: Inputs, of shape (N, D).
* - gamma: Scale parameters, of shape (1, D).
* - beta: Shift parameters, of shape (1, D).
* - mode: 'train' or 'test' to indicate if the model is currently
@@ -151,9 +152,9 @@ backward = function(matrix[double] dout, matrix[double] out,
* Typical values are in the range of [1e-5, 1e-3].
*
* Outputs:
- * - dX: Gradient wrt X, of shape (N, D).
- * - dgamma: Gradient wrt W, of shape (1, D).
- * - dbeta: Gradient wrt b, of shape (1, D).
+ * - dX: Gradient wrt `X`, of shape (N, D).
+ * - dgamma: Gradient wrt `W`, of shape (1, D).
+ * - dbeta: Gradient wrt `b`, of shape (1, D).
*
*/
N = nrow(X)
@@ -190,7 +191,7 @@ init = function(int D)
* may be initialized manually if needed.
*
* Inputs:
- * - D: Dimensionality of the input features.
+ * - D: Dimensionality of the input features (number of features).
*
* Outputs:
* - gamma: Scale parameters, of shape (1, D).
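
To make the `y = ((x-mean) / sqrt(var+eps)) * gamma + beta` docstring concrete, here is the training-mode transform written out directly in DML for one mini-batch (illustrative only; the layer itself additionally maintains the exponential moving averages described above):

```
N = 16
D = 4
X = rand(rows=N, cols=D, pdf="normal")
gamma = matrix(1, rows=1, cols=D)      # init: no scaling
beta = matrix(0, rows=1, cols=D)       # init: no shift
epsilon = 1e-5
mean = colMeans(X)                     # (1, D)
var = colSums((X-mean)^2) / N          # (1, D), biased batch variance
norm = (X-mean) / sqrt(var+epsilon)    # (N, D), ~zero mean, unit variance
y = norm * gamma + beta                # learnable scale & shift
```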
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/conv.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv.dml b/scripts/staging/SystemML-NN/nn/layers/conv.dml
index cc60a46..435b3cf 100644
--- a/scripts/staging/SystemML-NN/nn/layers/conv.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/conv.dml
@@ -39,9 +39,9 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
* output maps.
*
* Inputs:
- * - X: Input data matrix, of shape (N, C*Hin*Win).
- * - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
- * - b: Biases vector, of shape (F, 1).
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
* - C: Number of input channels (dimensionality of input depth).
* - Hin: Input height.
* - Win: Input width.
@@ -50,14 +50,14 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
* - strideh: Stride over height.
* - stridew: Stride over width.
* - padh: Padding for top and bottom sides.
- * For same output height as input, set padh = (Hf - 1) / 2,
- * assuming strideh = 1.
- * More generally, padh = (Hin*(strideh-1) + Hf - strideh) / 2
+ * For same output height as input, set `padh = (Hf - 1) / 2`,
+ * assuming `strideh = 1`.
+ * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
* preserves the spatial dimensions of the input.
* - padw: Padding for left and right sides.
- * For same output width as input, set padw = (Wf - 1) / 2,
- * assuming stridew = 1.
- * More generally, padw = (Win*(stridew-1) + Wf - stridew) / 2
+ * For same output width as input, set `padw = (Wf - 1) / 2`,
+ * assuming `stridew = 1`.
+ * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
* preserves the spatial dimensions of the input.
*
* Outputs:
@@ -67,8 +67,8 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
*/
N = nrow(X)
F = nrow(W)
- Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1)
- Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1)
+ Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+ Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
# Create output volume
out = matrix(0, rows=N, cols=F*Hout*Wout)
@@ -101,12 +101,13 @@ backward = function(matrix[double] dout, int Hout, int Wout,
* This implementation uses `im2col` and `col2im` internally.
*
* Inputs:
- * - dout: Derivatives from upstream, of shape (N, F*Hout*Wout).
+ * - dout: Gradient wrt `out` from upstream, of
+ * shape (N, F*Hout*Wout).
* - Hout: Output height.
* - Wout: Output width.
- * - X: Previous input data matrix, of shape (N, C*Hin*Win).
- * - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
- * - b: Biases vector, of shape (F, 1).
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
* - C: Number of input channels (dimensionality of input depth).
* - Hin: Input height.
* - Win: Input width.
@@ -118,9 +119,9 @@ backward = function(matrix[double] dout, int Hout, int Wout,
* - padw: Padding for left and right sides.
*
* Outputs:
- * - dX: Gradient wrt X, of shape (N, C*Hin*Win).
- * - dW: Gradient wrt W, of shape (F, C*Hf*Wf).
- * - db: Gradient wrt b, of shape (F, 1).
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+ * - db: Gradient wrt `b`, of shape (F, 1).
*/
N = nrow(X)
F = nrow(W)
@@ -171,10 +172,11 @@ init = function(int F, int C, int Hf, int Wf)
* Note: This is just a convenience function, and parameters
* may be initialized manually if needed.
*
- * We use the heuristic by He et al. [http://arxiv.org/abs/1502.01852],
- * which limits the magnification of inputs/gradients during
- * forward/backward passes by scaling unit-Gaussian weights by a
- * factor of sqrt(2/n), under the assumption of relu neurons.
+ * We use the heuristic by He et al., which limits the magnification
+ * of inputs/gradients during forward/backward passes by scaling
+ * unit-Gaussian weights by a factor of sqrt(2/n), under the
+ * assumption of relu neurons.
+ * - http://arxiv.org/abs/1502.01852
*
* Inputs:
* - F: Number of filters.
@@ -183,8 +185,8 @@ init = function(int F, int C, int Hf, int Wf)
* - Wf: Filter width.
*
* Outputs:
- * - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
- * - b: Biases vector, of shape (F, 1).
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
*/
W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
b = matrix(0, rows=F, cols=1)
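
A quick numeric check of the padding formulas in the docstring above: for a 3x3 filter at stride 1, `padh = (Hf - 1) / 2 = 1` preserves the spatial dimensions.

```
Hin = 28
Win = 28
Hf = 3
Wf = 3
strideh = 1
stridew = 1
padh = (Hf-1) / 2   # = 1
padw = (Wf-1) / 2   # = 1
Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
print("Hout = " + Hout + ", Wout = " + Wout)   # 28, 28: dims preserved
```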
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
index 44df74a..c2b809e 100644
--- a/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
@@ -22,6 +22,7 @@
/*
* 2D Convolutional layer.
*/
+
forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
int C, int Hin, int Win, int Hf, int Wf,
int strideh, int stridew, int padh, int padw)
@@ -32,10 +33,10 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
* volume unrolled into a single vector.
*
* Inputs:
- * - X: Input data matrix, of shape (N, C*Hin*Win).
- * - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
- * - b: Biases vector, of shape (F, 1).
- * - C: Number of input channels (dimensionality of input depth).
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
+ * - C: Number of input channels (dimensionality of depth).
* - Hin: Input height.
* - Win: Input width.
* - Hf: Filter height.
@@ -43,14 +44,14 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
* - strideh: Stride over height.
* - stridew: Stride over width.
* - padh: Padding for top and bottom sides.
- * For same output height as input, set padh = (Hf - 1) / 2,
- * assuming strideh = 1.
- * More generally, padh = (Hin*(strideh-1) + Hf - strideh) / 2
+ * For same output height as input, set `padh = (Hf - 1) / 2`,
+ * assuming `strideh = 1`.
+ * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
* preserves the spatial dimensions of the input.
* - padw: Padding for left and right sides.
- * For same output width as input, set padw = (Wf - 1) / 2,
- * assuming stridew = 1.
- * More generally, padw = (Win*(stridew-1) + Wf - stridew) / 2
+ * For same output width as input, set `padw = (Wf - 1) / 2`,
+ * assuming `stridew = 1`.
+ * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
* preserves the spatial dimensions of the input.
*
* Outputs:
@@ -60,8 +61,8 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
*/
N = nrow(X)
F = nrow(W)
- Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1)
- Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1)
+ Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+ Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
# Convolution - built-in implementation
out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
@@ -81,13 +82,14 @@ backward = function(matrix[double] dout, int Hout, int Wout,
* with F filters.
*
* Inputs:
- * - dout: Derivatives from upstream, of shape (N, F*Hout*Wout).
+ * - dout: Gradient wrt `out` from upstream, of
+ * shape (N, F*Hout*Wout).
* - Hout: Output height.
* - Wout: Output width.
- * - X: Previous input data matrix, of shape (N, C*Hin*Win).
- * - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
- * - b: Biases vector, of shape (F, 1).
- * - C: Number of input channels (dimensionality of input depth).
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
+ * - C: Number of input channels (dimensionality of depth).
* - Hin: Input height.
* - Win: Input width.
* - Hf: Filter height.
@@ -95,12 +97,20 @@ backward = function(matrix[double] dout, int Hout, int Wout,
* - strideh: Stride over height.
* - stridew: Stride over width.
* - padh: Padding for top and bottom sides.
+ * For same output height as input, set `padh = (Hf - 1) / 2`,
+ * assuming `strideh = 1`.
+ * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+ * preserves the spatial dimensions of the input.
* - padw: Padding for left and right sides.
+ * For same output width as input, set `padw = (Wf - 1) / 2`,
+ * assuming `stridew = 1`.
+ * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+ * preserves the spatial dimensions of the input.
*
* Outputs:
- * - dX: Gradient wrt X, of shape (N, C*Hin*Win).
- * - dW: Gradient wrt W, of shape (F, C*Hf*Wf).
- * - db: Gradient wrt b, of shape (F, 1).
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+ * - db: Gradient wrt `b`, of shape (F, 1).
*/
N = nrow(X)
F = nrow(W)
@@ -123,10 +133,11 @@ init = function(int F, int C, int Hf, int Wf)
* Note: This is just a convenience function, and parameters
* may be initialized manually if needed.
*
- * We use the heuristic by He et al. [http://arxiv.org/abs/1502.01852],
- * which limits the magnification of inputs/gradients during
- * forward/backward passes by scaling unit-Gaussian weights by a
- * factor of sqrt(2/n), under the assumption of relu neurons.
+ * We use the heuristic by He et al., which limits the magnification
+ * of inputs/gradients during forward/backward passes by scaling
+ * unit-Gaussian weights by a factor of sqrt(2/n), under the
+ * assumption of relu neurons.
+ * - http://arxiv.org/abs/1502.01852
*
* Inputs:
* - F: Number of filters.
@@ -135,8 +146,8 @@ init = function(int F, int C, int Hf, int Wf)
* - Wf: Filter width.
*
* Outputs:
- * - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
- * - b: Biases vector, of shape (F, 1).
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
*/
W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
b = matrix(0, rows=F, cols=1)
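
A minimal call sketch for this layer (two assumptions here: the import path is relative to the SystemML-NN root, and `forward` returns the output volume plus its spatial dims, matching the `out`, `Hout`, `Wout` convention visible in the pooling layers):

```
source("nn/layers/conv_builtin.dml") as conv

N = 2
C = 3
Hin = 8
Win = 8
F = 4   # filters
X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
[W, b] = conv::init(F, C, 3, 3)   # 3x3 filters
[out, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, 3, 3,
                                  1, 1, 1, 1)  # stride 1, pad 1 => 8x8 maps
```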
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml b/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
index f9cd507..55552e1 100644
--- a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
@@ -21,11 +21,8 @@
/*
* Cross-entropy loss function.
- *
- * L_i = -y_i^T * log(pred_i), where y_i and pred_i are K-dimensional
- * vectors of class probs.
- * L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples.
*/
+
forward = function(matrix[double] pred, matrix[double] y)
return (double loss) {
/*
@@ -33,16 +30,26 @@ forward = function(matrix[double] pred, matrix[double] y)
* inputs consist of N examples, each with K dimensions corresponding
* to normalized probabilities of K classes.
*
+ * ```
+ * L_i = -y_i^T * log(pred_i)
+ * L = (1/N) sum(L_i) for i=1 to N
+ * ```
+ *
+ * In these equations, `L` is the total loss, `L_i` is the loss for
+ * example `i`, `y_i` is the K-dimensional vector of target class
+ * probabilities, `pred_i` is the K-dimensional vector of predicted
+ * class probabilities, and `N` is the number of examples.
+ *
* This can be interpreted as the negative log-likelihood assuming
* a Bernoulli distribution generalized to K dimensions, or a
- * Multinomial with 1 observation.
+ * Multinomial with one observation.
*
* Inputs:
- * - pred: Prediction matrix, of shape (N, K).
- * - y: Target matrix, of shape (N, K).
+ * - pred: Predictions, of shape (N, K).
+ * - y: Targets, of shape (N, K).
*
* Outputs:
- * - loss: Scalar loss, of shape (1).
+ * - loss: Average loss.
*/
N = nrow(y)
eps = 1e-10 # numerical stability to avoid log(0)
@@ -58,11 +65,11 @@ backward = function(matrix[double] pred, matrix[double] y)
* to normalized probabilities of K classes.
*
* Inputs:
- * - pred: Prediction matrix, of shape (N, K).
- * - y: Target matrix, of shape (N, K).
+ * - pred: Predictions, of shape (N, K).
+ * - y: Targets, of shape (N, K).
*
* Outputs:
- * - dpred: Gradient wrt pred, of shape (N, K).
+ * - dpred: Gradient wrt `pred`, of shape (N, K).
*/
N = nrow(y)
eps = 1e-10 # numerical stability to avoid divide-by-zero
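
Since the docstring requires normalized class probabilities, this loss is typically paired with the softmax layer. A small sketch (the one-hot `table(...)` construction is just one way to build targets from a `classes` vector; it is not taken verbatim from this repo):

```
source("nn/layers/softmax.dml") as softmax
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss

N = 4
K = 3
scores = rand(rows=N, cols=K, pdf="normal")
probs = softmax::forward(scores)   # each row sums to 1
classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
y = table(seq(1, N), classes, N, K)   # one-hot targets, shape (N, K)
loss = cross_entropy_loss::forward(probs, y)
dprobs = cross_entropy_loss::backward(probs, y)
```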
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/dropout.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/dropout.dml b/scripts/staging/SystemML-NN/nn/layers/dropout.dml
index 2b1bd1d..b348642 100644
--- a/scripts/staging/SystemML-NN/nn/layers/dropout.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/dropout.dml
@@ -22,6 +22,7 @@
/*
* Dropout layer.
*/
+
forward = function(matrix[double] X, double p, int seed)
return (matrix[double] out, matrix[double] mask) {
/*
@@ -32,14 +33,13 @@ forward = function(matrix[double] X, double p, int seed)
* the outputs of neurons) at test time.
*
* Inputs:
- * - X: Input data matrix, of shape (any, any).
+ * - X: Inputs, of shape (any, any).
* - p: Probability of keeping a neuron output.
- * - seed: [Optional: -1] Random number generator seed. Setting this
- * allows for deterministic evaluation. Set to -1 for a random
- * seed.
+ * - seed: [Optional: -1] Random number generator seed to allow for
+ * deterministic evaluation. Set to -1 for a random seed.
*
* Outputs:
- * - out: Ouptuts, of same shape as X.
+ * - out: Outputs, of same shape as `X`.
* - mask: Dropout mask used to compute the output.
*/
# Normally, we might use something like
@@ -48,8 +48,7 @@ forward = function(matrix[double] X, double p, int seed)
# the `rand` function that allows us to create a mask directly.
if (seed == -1) {
mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p)
- }
- else {
+ } else {
mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p, seed=seed)
}
out = X * mask / p
@@ -64,13 +63,13 @@ backward = function(matrix[double] dout, matrix[double] X, double p, matrix[doub
* maintain the expected values at test time.
*
* Inputs:
- * - dout: Derivatives from upstream, of same shape as X.
- * - X: Previous input data matrix, of shape (any, any).
- * - p: Previous probability of keeping a neuron output.
- * - mask: Previous dropout mask used to compute the output.
+ * - dout: Gradient wrt `out`, of same shape as `X`.
+ * - X: Inputs, of shape (any, any).
+ * - p: Probability of keeping a neuron output.
+ * - mask: Dropout mask used to compute the output.
*
* Outputs:
- * - dX: Gradient wrt X, of same shape as X.
+ * - dX: Gradient wrt `X`, of same shape as `X`.
*/
dX = mask / p * dout
}
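
A small sketch of the seed behavior documented above, plus the inverted-dropout scaling (`/ p`) that keeps expected activations unchanged at test time; the import path is assumed relative to the SystemML-NN root:

```
source("nn/layers/dropout.dml") as dropout

X = rand(rows=4, cols=6, pdf="normal")
p = 0.5                                     # probability of keeping a unit
[out, mask] = dropout::forward(X, p, -1)    # -1: random seed
[out2, mask2] = dropout::forward(X, p, 42)  # fixed seed: deterministic mask
dout = rand(rows=4, cols=6, pdf="normal")   # stand-in upstream gradient
dX = dropout::backward(dout, X, p, mask)    # gradient flows through kept units
```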
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
index 7d6c821..24b15e2 100644
--- a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
@@ -21,28 +21,35 @@
/*
* L1 loss function.
- *
- * L_i = sum_j(abs((pred_i)_j - (y_i)_j)) for all j.
- * L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples.
*/
+
forward = function(matrix[double] pred, matrix[double] y)
return (double loss) {
/*
* Computes the forward pass for an L1 loss function. The inputs
* consist of N examples, each with M dimensions to predict.
*
+ * ```
+ * L_i = sum_j(abs((pred_i)_j - (y_i)_j))
+ * L = (1/N) sum(L_i) for i=1 to N
+ * ```
+ *
+ * In these equations, `L` is the total loss, `L_i` is the loss for
+ * example `i`, `y_i` is the vector of targets, `pred_i` is the
+ * vector of predictions, and `N` is the number of examples.
+ *
* This can be interpreted as the negative log-likelihood assuming
* a Laplace distribution.
*
* Inputs:
- * - pred: Prediction matrix, of shape (N, M).
- * - y: Target matrix, of shape (N, M).
+ * - pred: Predictions, of shape (N, M).
+ * - y: Targets, of shape (N, M).
*
* Outputs:
- * - loss: Scalar loss, of shape (1).
+ * - loss: Average loss.
*/
N = nrow(y)
- losses = rowSums(abs(pred - y))
+ losses = rowSums(abs(pred-y))
loss = sum(losses) / N
}
@@ -53,13 +60,13 @@ backward = function(matrix[double] pred, matrix[double] y)
* consist of N examples, each with M dimensions to predict.
*
* Inputs:
- * - pred: Prediction matrix, of shape (N, M).
- * - y: Target matrix, of shape (N, M).
+ * - pred: Predictions, of shape (N, M).
+ * - y: Targets, of shape (N, M).
*
* Outputs:
- * - dpred: Gradient wrt pred, of shape (N, M).
+ * - dpred: Gradient wrt `pred`, of shape (N, M).
*/
N = nrow(y)
- dpred = sign(pred - y) / N
+ dpred = sign(pred-y) / N
}
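
A tiny worked example of the equations above (two examples, two dimensions each):

```
source("nn/layers/l1_loss.dml") as l1_loss

pred = matrix("1 2 3 4", rows=2, cols=2)
y = matrix("1 1 4 4", rows=2, cols=2)
loss = l1_loss::forward(pred, y)    # per-example losses (1, 1) => average 1.0
dpred = l1_loss::backward(pred, y)  # sign(pred-y)/N, here (0 .5; -.5 0)
print("L1 loss: " + loss)
```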
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
index b2175ab..f643274 100644
--- a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
@@ -22,31 +22,34 @@
/*
 * L1 regularization.
*/
-forward = function(matrix[double] X, double lambda) return (double reg_loss) {
+
+forward = function(matrix[double] X, double lambda)
+ return (double reg_loss) {
/*
* Computes the forward pass for an L1 regularization function.
*
* Inputs:
- * - X: Parameters, of shape (any, any).
+ * - X: Inputs, of shape (any, any).
* - lambda: Regularization strength.
* A typical value is 0.01.
*
* Outputs:
- * - reg_loss: Scalar L1 regularization loss, of shape (1).
+ * - reg_loss: Total regularization loss.
*/
reg_loss = lambda * sum(abs(X))
}
-backward = function(matrix[double] X, double lambda) return (matrix[double] dX) {
+backward = function(matrix[double] X, double lambda)
+ return (matrix[double] dX) {
/*
* Computes the backward pass for an L1 regularization function.
*
* Inputs:
- * - X: Parameters, of shape (any, any).
+ * - X: Inputs, of shape (any, any).
* - lambda: Regularization strength.
*
* Outputs:
- * - dX: Gradient wrt X, of same shape as X.
+ * - dX: Gradient wrt `X`, of same shape as `X`.
*/
dX = lambda * sign(X)
}
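
Regularizers in this library return a loss and a gradient contribution that the caller folds into the data-loss gradient; a sketch (the name `dW_data` is hypothetical, standing in for a real data-loss gradient):

```
source("nn/layers/l1_reg.dml") as l1_reg

W = rand(rows=8, cols=3, pdf="normal")
dW_data = matrix(0, rows=8, cols=3)         # stand-in data-loss gradient
lambda = 0.01
reg_loss = l1_reg::forward(W, lambda)       # lambda * sum(abs(W))
dW = dW_data + l1_reg::backward(W, lambda)  # add lambda * sign(W)
```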
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
index 9f27cc2..df8bc1c 100644
--- a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
@@ -21,28 +21,35 @@
/*
* L2 loss function.
- *
- * L_i = (1/2) 2norm(pred_i - y_i)^2
- * L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples.
*/
+
forward = function(matrix[double] pred, matrix[double] y)
return (double loss) {
/*
* Computes the forward pass for an L2 loss function. The inputs
* consist of N examples, each with M dimensions to predict.
*
+ * ```
+ * L_i = (1/2) norm(pred_i - y_i)^2
+ * L = (1/N) sum(L_i) for i=1 to N
+ * ```
+ *
+ * In these equations, `L` is the total loss, `L_i` is the loss for
+ * example `i`, `y_i` is the vector of targets, `pred_i` is the
+ * vector of predictions, and `N` is the number of examples.
+ *
* This can be interpreted as the negative log-likelihood assuming
* a Gaussian distribution.
*
* Inputs:
- * - pred: Prediction matrix, of shape (N, M).
- * - y: Target matrix, of shape (N, M).
+ * - pred: Predictions, of shape (N, M).
+ * - y: Targets, of shape (N, M).
*
* Outputs:
- * - loss: Scalar loss, of shape (1).
+ * - loss: Average loss.
*/
N = nrow(y)
- losses = 0.5 * rowSums((pred - y)^2)
+ losses = 0.5 * rowSums((pred-y)^2)
loss = sum(losses) / N
}
@@ -53,13 +60,13 @@ backward = function(matrix[double] pred, matrix[double] y)
* consist of N examples, each with M dimensions to predict.
*
* Inputs:
- * - pred: Prediction matrix, of shape (N, M).
- * - y: Target matrix, of shape (N, M).
+ * - pred: Predictions, of shape (N, M).
+ * - y: Targets, of shape (N, M).
*
* Outputs:
- * - dpred: Gradient wrt pred, of shape (N, M).
+ * - dpred: Gradient wrt `pred`, of shape (N, M).
*/
N = nrow(y)
- dpred = (pred - y) / N
+ dpred = (pred-y) / N
}
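
Given the simple `dpred = (pred-y) / N` gradient, a centered finite-difference spot check in the spirit of `nn/test/grad_check.dml` (a generic sketch, not the test file's exact code):

```
source("nn/layers/l2_loss.dml") as l2_loss

pred = rand(rows=3, cols=2, pdf="normal")
y = rand(rows=3, cols=2, pdf="normal")
dpred = l2_loss::backward(pred, y)

h = 1e-5
old = as.scalar(pred[1,1])
pred[1,1] = old + h
loss_ph = l2_loss::forward(pred, y)
pred[1,1] = old - h
loss_mh = l2_loss::forward(pred, y)
pred[1,1] = old
num_grad = (loss_ph-loss_mh) / (2*h)
print("analytical: " + as.scalar(dpred[1,1]) + ", numerical: " + num_grad)
```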
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
index 44f2a54..5074c06 100644
--- a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
@@ -22,31 +22,34 @@
/*
 * L2 regularization.
*/
-forward = function(matrix[double] X, double lambda) return (double reg_loss) {
+
+forward = function(matrix[double] X, double lambda)
+ return (double reg_loss) {
/*
* Computes the forward pass for an L2 regularization function.
*
* Inputs:
- * - X: Parameters, of shape (any, any).
+ * - X: Inputs, of shape (any, any).
* - lambda: Regularization strength.
* A typical value is 0.01.
*
* Outputs:
- * - reg_loss: Scalar l2 regularization loss, of shape (1).
+ * - reg_loss: Total regularization loss.
*/
reg_loss = 0.5 * lambda * sum(X^2)
}
-backward = function(matrix[double] X, double lambda) return (matrix[double] dX) {
+backward = function(matrix[double] X, double lambda)
+ return (matrix[double] dX) {
/*
* Computes the backward pass for an L2 regularization function.
*
* Inputs:
- * - X: Parameters, of shape (any, any).
+ * - X: Inputs, of shape (any, any).
* - lambda: Regularization strength.
*
* Outputs:
- * - dX: Gradient wrt X, of same shape as X.
+ * - dX: Gradient wrt `X`, of same shape as `X`.
*/
dX = lambda * X
}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml b/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
index ad5e561..7dd85d3 100644
--- a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
@@ -21,30 +21,37 @@
/*
* Log loss function.
- *
- * L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i), where y_i is a
- * binary target, and pred_i is a probability of y=1.
- * L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples.
*/
+
forward = function(matrix[double] pred, matrix[double] y)
return (double loss) {
/*
* Computes the forward pass for a log loss function.
*
+ * ```
+ * L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i)
+ * L = (1/N) sum(L_i) for i=1 to N
+ * ```
+ *
+ * In these equations, `L` is the total loss, `L_i` is the loss for
+ * example `i`, `y_i` is the binary target, `pred_i` is the
+ * probability of the true class (i.e. `y=1`), and `N` is the
+ * number of examples.
+ *
* This can be interpreted as the negative log-likelihood assuming
* a Bernoulli distribution.
*
* Inputs:
- * - pred: Prediction matrix, of shape (N, 1). Predictions should
- * be probabilities that y=1.
- * - y: Target matrix, of shape (N, 1). Targets should be binary
- * in the set {0,1}.
+ * - pred: Predictions, of shape (N, 1).
+ * Predictions should be probabilities of the true
+ * class (i.e. probability of `y=1`).
+ * - y: Targets, of shape (N, 1).
+ * Targets should be binary in the set {0, 1}.
*
* Outputs:
- * - loss: Scalar loss, of shape (1).
+ * - loss: Average loss.
*/
N = nrow(y)
- losses = -y * log(pred) - (1-y) * log(1-pred)
+ losses = -y*log(pred) - (1-y)*log(1-pred)
loss = sum(losses) / N
}
@@ -54,15 +61,16 @@ backward = function(matrix[double] pred, matrix[double] y)
* Computes the backward pass for a log loss function.
*
* Inputs:
- * - pred: Prediction matrix, of shape (N, 1). Predictions should
- * be probabilities that y=1.
- * - y: Target matrix, of shape (N, 1). Targets should be binary
- * in the set {0,1}.
+ * - pred: Predictions, of shape (N, 1).
+ * Predictions should be probabilities of the true
+ * class (i.e. probability of `y=1`).
+ * - y: Targets, of shape (N, 1).
+ * Targets should be binary in the set {0, 1}.
*
* Outputs:
- * - dpred: Gradient wrt pred, of shape (N, 1).
+ * - dpred: Gradient wrt `pred`, of shape (N, 1).
*/
N = nrow(y)
- dpred = (1/N) * (pred-y) / (pred * (1-pred))
+ dpred = (1/N) * (pred-y) / (pred*(1-pred))
}
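
As the docstring notes, `pred` must contain probabilities of the true class, so this loss naturally pairs with the sigmoid layer on a single output column; a sketch:

```
source("nn/layers/sigmoid.dml") as sigmoid
source("nn/layers/log_loss.dml") as log_loss

N = 8
scores = rand(rows=N, cols=1, pdf="normal")
y = round(rand(rows=N, cols=1, min=0, max=1))  # binary targets in {0, 1}
pred = sigmoid::forward(scores)                # probabilities of y=1
loss = log_loss::forward(pred, y)
dpred = log_loss::backward(pred, y)
```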
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/lstm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/lstm.dml b/scripts/staging/SystemML-NN/nn/layers/lstm.dml
index 0dd9f4c..44f2ef2 100644
--- a/scripts/staging/SystemML-NN/nn/layers/lstm.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/lstm.dml
@@ -44,16 +44,16 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T,
* - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
*
* Inputs:
- * - X: Input data matrix, of shape (N, T*D).
- * - W: Weights (parameters) matrix, of shape (D+M, 4M).
- * - b: Biases vector, of shape (1, 4M).
+ * - X: Inputs, of shape (N, T*D).
+ * - W: Weights, of shape (D+M, 4M).
+ * - b: Biases, of shape (1, 4M).
* - T: Length of example sequences (number of timesteps).
- * - D: Dimensionality of the input features.
+ * - D: Dimensionality of the input features (number of features).
* - return_sequences: Whether to return `out` at all timesteps,
* or just for the final timestep.
- * - out0: Output matrix at previous timestep, of shape (N, M).
+ * - out0: Outputs from previous timestep, of shape (N, M).
* Note: This is *optional* and could just be an empty matrix.
- * - c0: Initial cell state matrix, of shape (N, M).
+ * - c0: Initial cell state, of shape (N, M).
* Note: This is *optional* and could just be an empty matrix.
*
* Outputs:
@@ -123,23 +123,27 @@ backward = function(matrix[double] dout, matrix[double] dc,
* Computes the backward pass for an LSTM layer with M neurons.
*
* Inputs:
- * - dout: Gradient on output from upstream. If `given_sequences`
- * is True, contains gradients on outputs for all timesteps,
- * of shape (N, T*M). Else, contains gradient on output for
- * the final timestep, of shape (N, M).
- * - dc: Gradient on final (current) cell state from later in time,
- * of shape (N, M).
- * - X: Input data matrix, of shape (N, T*D).
- * - W: Weights (parameters) matrix, of shape (D+M, 4M).
- * - b: Biases vector, of shape (1, 4M).
+ * - dout: Gradient wrt `out`. If `given_sequences` is `True`,
+ * contains gradients on outputs for all timesteps, of
+ * shape (N, T*M). Else, contains the gradient on the output
+ * for the final timestep, of shape (N, M).
+ * - dc: Gradient wrt `c` (from later in time), of shape (N, M).
+ * This would come from later in time if the cell state was used
+ * downstream as the initial cell state for another LSTM layer.
+ * Typically, this would be used when a sequence was cut at
+ * timestep `T` and then continued in the next batch. If `c`
+ * was not used downstream, then `dc` would be an empty matrix.
+ * - X: Inputs, of shape (N, T*D).
+ * - W: Weights, of shape (D+M, 4M).
+ * - b: Biases, of shape (1, 4M).
* - T: Length of example sequences (number of timesteps).
* - D: Dimensionality of the input features.
* - given_sequences: Whether `dout` is for all timesteps,
* or just for the final timestep. This is based on whether
* `return_sequences` was true in the forward pass.
- * - out0: Output matrix at previous timestep, of shape (N, M).
+ * - out0: Outputs from previous timestep, of shape (N, M).
* Note: This is *optional* and could just be an empty matrix.
- * - c0: Initial cell state matrix, of shape (N, M).
+ * - c0: Initial cell state, of shape (N, M).
* Note: This is *optional* and could just be an empty matrix.
* - cache_out: Cache of outputs, of shape (T, N*M).
* Note: This is used for performance during training.
@@ -149,11 +153,11 @@ backward = function(matrix[double] dout, matrix[double] dc,
* Note: This is used for performance during training.
*
* Outputs:
- * - dX: Gradient wrt X, of shape (N, T*D).
- * - dW: Gradient wrt W, of shape (D+M, 4M).
- * - db: Gradient wrt b, of shape (1, 4M).
- * - dout0: Gradient wrt out0, of shape (N, M).
- * - dc0: Gradient wrt c0, of shape (N, M).
+ * - dX: Gradient wrt `X`, of shape (N, T*D).
+ * - dW: Gradient wrt `W`, of shape (D+M, 4M).
+ * - db: Gradient wrt `b`, of shape (1, 4M).
+ * - dout0: Gradient wrt `out0`, of shape (N, M).
+ * - dc0: Gradient wrt `c0`, of shape (N, M).
*/
N = nrow(X)
M = as.integer(ncol(W)/4)
@@ -190,7 +194,7 @@ backward = function(matrix[double] dout, matrix[double] dc,
g = ifog[,3*M+1:4*M] # g gate, shape (N, M)
tmp = tanh::backward(dout_t, ct)
- dct = dct + o * tmp # shape (N, M)
+ dct = dct + o*tmp # shape (N, M)
tmp = tanh::forward(ct)
do = tmp * dout_t # output gate, shape (N, M)
df = c_prev * dct # forget gate, shape (N, M)
@@ -201,7 +205,7 @@ backward = function(matrix[double] dout, matrix[double] dc,
di_raw = i * (1-i) * di
df_raw = f * (1-f) * df
do_raw = o * (1-o) * do
- dg_raw = (1 - g^2) * dg
+ dg_raw = (1-g^2) * dg
difog_raw = cbind(di_raw, cbind(df_raw, cbind(do_raw, dg_raw))) # shape (N, 4M)
dW = dW + t(input) %*% difog_raw # shape (D+M, 4M)
@@ -217,7 +221,7 @@ backward = function(matrix[double] dout, matrix[double] dc,
dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev # shape (N, M)
dct = dc_prev # shape (N, M)
}
- t = t-1
+ t = t - 1
}
}
@@ -232,17 +236,18 @@ init = function(int N, int D, int M)
* We use the Glorot uniform heuristic which limits the magnification
* of inputs/gradients during forward/backward passes by scaling
* uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
+ * - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
*
* Inputs:
* - N: Number of examples in batch.
- * - D: Dimensionality of the input features.
+ * - D: Dimensionality of the input features (number of features).
* - M: Number of neurons in this layer.
*
* Outputs:
- * - W: Weights (parameters) matrix, of shape (D+M, 4M).
- * - b: Biases vector, of shape (1, 4M).
- * - out0: Dummy output matrix at previous timestep, of shape (N, M).
- * - c0: Initial empty cell state matrix, of shape (N, M).
+ * - W: Weights, of shape (D+M, 4M).
+ * - b: Biases, of shape (1, 4M).
+ * - out0: Empty previous timestep output matrix, of shape (N, M).
+ * - c0: Empty initial cell state matrix, of shape (N, M).
*/
fan_in = D+M
fan_out = 4*M
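
A shape-oriented sketch of the LSTM entry points. Only `init` is called here; its return tuple matches the docstring above, while the `forward` call is left as a comment because its full output list is not shown in this excerpt:

```
source("nn/layers/lstm.dml") as lstm

N = 8    # examples
T = 5    # timesteps
D = 10   # features per timestep
M = 16   # neurons
X = rand(rows=N, cols=T*D, pdf="normal")
[W, b, out0, c0] = lstm::init(N, D, M)   # W: (D+M, 4M), b: (1, 4M)
# forward(X, W, b, T, D, return_sequences, out0, c0) then produces `out`
# of shape (N, T*M) if return_sequences, else (N, M), plus the caches
# consumed by `backward`.
```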
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
index 22e1747..a12877f 100644
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
@@ -38,7 +38,7 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
* the output maps.
*
* Inputs:
- * - X: Input data matrix, of shape (N, C*Hin*Win).
+ * - X: Inputs, of shape (N, C*Hin*Win).
* - C: Number of input channels (dimensionality of input depth).
* - Hin: Input height.
* - Win: Input width.
@@ -57,8 +57,8 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
* - Wout: Output width.
*/
N = nrow(X)
- Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1)
- Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1)
+ Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+ Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
pad_value = -1/0 # in max pooling we pad with -infinity
# Create output volume
@@ -96,7 +96,8 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
* unrolled into a single vector.
*
* Inputs:
- * - dout: Derivatives from upstream, of shape (N, C*Hout*Wout).
+ * - dout: Gradient wrt `out` from upstream, of
+ * shape (N, C*Hout*Wout).
* - Hout: Output height.
* - Wout: Output width.
* - X: Input data matrix, of shape (N, C*Hin*Win).
@@ -113,7 +114,7 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
* A typical value is 0.
*
* Outputs:
- * - dX: Gradient wrt X, of shape (N, C*Hin*Win).
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
*/
N = nrow(X)
pad_value = -1/0 # in max pooling we pad with -infinity
@@ -134,9 +135,9 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
for (hout in 1:Hout, check=0) { # all output rows
- hin = (hout-1) * strideh + 1
+ hin = (hout-1)*strideh + 1
for (wout in 1:Wout) { # all output columns
- win = (wout-1) * stridew + 1
+ win = (wout-1)*stridew + 1
img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
max_val_ind = img_slice_patch == max(img_slice_patch) # max value indicator matrix
# gradient passes through only for the max value(s) in this patch
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
index ae2b4a1..f1cb863 100644
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
@@ -22,6 +22,7 @@
/*
* Max pooling layer.
*/
+
forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
int strideh, int stridew, int padh, int padw)
return (matrix[double] out, int Hout, int Wout) {
@@ -36,7 +37,7 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
* the output maps.
*
* Inputs:
- * - X: Input data matrix, of shape (N, C*Hin*Win).
+ * - X: Inputs, of shape (N, C*Hin*Win).
* - C: Number of input channels (dimensionality of input depth).
* - Hin: Input height.
* - Win: Input width.
@@ -55,8 +56,8 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
* - Wout: Output width.
*/
N = nrow(X)
- Hout = as.integer((Hin - Hf) / strideh + 1)
- Wout = as.integer((Win - Wf) / stridew + 1)
+ Hout = as.integer((Hin-Hf)/strideh + 1)
+ Wout = as.integer((Win-Wf)/stridew + 1)
# Max pooling - built-in implementation
out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
@@ -73,10 +74,11 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
* unrolled into a single vector.
*
* Inputs:
- * - dout: Derivatives from upstream, of shape (N, C*Hout*Wout).
+ * - dout: Gradient wrt `out` from upstream, of
+ * shape (N, C*Hout*Wout).
* - Hout: Output height.
* - Wout: Output width.
- * - X: Input data matrix, of shape (N, C*Hin*Win).
+ * - X: Inputs, of shape (N, C*Hin*Win).
* - C: Number of input channels (dimensionality of input depth).
* - Hin: Input height.
* - Win: Input width.
@@ -90,7 +92,7 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
* A typical value is 0.
*
* Outputs:
- * - dX: Gradient wrt X, of shape (N, C*Hin*Win).
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
*/
N = nrow(X)
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/relu.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/relu.dml b/scripts/staging/SystemML-NN/nn/layers/relu.dml
index a5c5230..6a4c15c 100644
--- a/scripts/staging/SystemML-NN/nn/layers/relu.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/relu.dml
@@ -22,33 +22,37 @@
/*
* Rectified Linear Unit (ReLU) nonlinearity layer.
*/
-forward = function(matrix[double] X) return (matrix[double] out) {
+
+forward = function(matrix[double] X)
+ return (matrix[double] out) {
/*
* Computes the forward pass for a ReLU nonlinearity layer.
*
- * Performs an element-wise evaluation of f(input) = max(0, input).
+ * Performs an element-wise evaluation of `f(input) = max(0, input)`.
*
* Inputs:
- * - X: Input data matrix, of shape (any, any).
+ * - X: Inputs, of shape (any, any).
*
* Outputs:
- * - out: Ouptuts, of same shape as X.
+ * - out: Outputs, of same shape as `X`.
*/
- out = max(0.0, X)
+ out = max(X, 0)
}
-backward = function(matrix[double] dout, matrix[double] X) return (matrix[double] dX) {
+backward = function(matrix[double] dout, matrix[double] X)
+ return (matrix[double] dX) {
/*
* Computes the backward pass for a ReLU nonlinearity layer.
*
- * Essentially performs a pass-through of the upstream gradient for cells > 0.
+ * Essentially performs a pass-through of the upstream gradient
+ * for cells > 0.
*
* Inputs:
- * - dout: Derivatives from upstream, of same shape as X.
+ * - dout: Gradient wrt `out` from upstream, of same shape as `X`.
* - X: Previous input data matrix, of shape (any, any).
*
* Outputs:
- * - dX: Gradient wrt X, of same shape as X.
+ * - dX: Gradient wrt `X`, of same shape as `X`.
*/
dX = (X > 0) * dout
}
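
A small illustration of the behavior described above: negatives are clamped to zero in the forward pass, and the upstream gradient passes through only where `X > 0`:

```
source("nn/layers/relu.dml") as relu

X = matrix("-2 -1 0 1 2 3", rows=2, cols=3)
out = relu::forward(X)              # (0 0 0; 1 2 3)
dout = matrix(1, rows=2, cols=3)    # stand-in upstream gradient
dX = relu::backward(dout, X)        # (0 0 0; 1 1 1)
```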
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/rnn.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/rnn.dml b/scripts/staging/SystemML-NN/nn/layers/rnn.dml
index cd3eefe..cdceab8 100644
--- a/scripts/staging/SystemML-NN/nn/layers/rnn.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/rnn.dml
@@ -35,14 +35,14 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T,
* in as an additional input at the current timestep.
*
* Inputs:
- * - X: Input data matrix, of shape (N, T*D).
- * - W: Weights (parameters) matrix, of shape (D+M, M).
- * - b: Biases vector, of shape (1, M).
+ * - X: Inputs, of shape (N, T*D).
+ * - W: Weights, of shape (D+M, M).
+ * - b: Biases, of shape (1, M).
* - T: Length of example sequences (number of timesteps).
- * - D: Dimensionality of the input features.
+ * - D: Dimensionality of the input features (number of features).
* - return_sequences: Whether to return `out` at all timesteps,
* or just for the final timestep.
- * - out0: Output matrix at previous timestep, of shape (N, M).
+ * - out0: Output matrix from previous timestep, of shape (N, M).
* Note: This is *optional* and could just be an empty matrix.
*
* Outputs:
@@ -88,28 +88,28 @@ backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, mat
* Computes the backward pass for a simple RNN layer with M neurons.
*
* Inputs:
- * - dout: Gradient on output from upstream. If `given_sequences`
+ * - dout: Gradient wrt `out` from upstream. If `given_sequences`
* is True, contains gradients on outputs for all timesteps,
* of shape (N, T*M). Else, contains gradient on output for
* the final timestep, of shape (N, M).
- * - X: Input data matrix, of shape (N, T*D).
- * - W: Weights (parameters) matrix, of shape (D+M, M).
- * - b: Biases vector, of shape (1, M).
+ * - X: Inputs, of shape (N, T*D).
+ * - W: Weights, of shape (D+M, M).
+ * - b: Biases, of shape (1, M).
* - T: Length of example sequences (number of timesteps).
- * - D: Dimensionality of the input features.
+ * - D: Dimensionality of the input features (number of features).
* - given_sequences: Whether `dout` is for all timesteps,
* or just for the final timestep. This is based on whether
* `return_sequences` was true in the forward pass.
- * - out0: Output matrix at previous timestep, of shape (N, M).
+ * - out0: Output matrix from previous timestep, of shape (N, M).
* Note: This is *optional* and could just be an empty matrix.
* - cache_out: Cache of outputs, of shape (T, N*M).
* Note: This is used for performance during training.
*
* Outputs:
- * - dX: Gradient wrt X, of shape (N, T*D).
- * - dW: Gradient wrt W, of shape (D+M, 4M).
- * - db: Gradient wrt b, of shape (1, 4M).
- * - dout0: Gradient wrt out0, of shape (N, M).
+ * - dX: Gradient wrt `X`, of shape (N, T*D).
+ * - dW: Gradient wrt `W`, of shape (D+M, M).
+ * - db: Gradient wrt `b`, of shape (1, M).
+ * - dout0: Gradient wrt `out0`, of shape (N, M).
*/
N = nrow(X)
M = ncol(W)
@@ -134,7 +134,7 @@ backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, mat
out_prev = matrix(cache_out[t-1,], rows=N, cols=M) # shape (N, M)
}
input = cbind(X_t, out_prev) # shape (N, D+M)
- dout_t_raw = (1 - out_t^2) * dout_t # into tanh, shape (N, M)
+ dout_t_raw = (1-out_t^2) * dout_t # into tanh, shape (N, M)
dW = dW + t(input) %*% dout_t_raw # shape (D+M, M)
db = db + colSums(dout_t_raw) # shape (1, M)
dinput = dout_t_raw %*% t(W) # shape (N, D+M)
@@ -146,7 +146,7 @@ backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, mat
else {
dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev # shape (N, M)
}
- t = t-1
+ t = t - 1
}
}
@@ -161,16 +161,17 @@ init = function(int N, int D, int M)
* We use the Glorot uniform heuristic which limits the magnification
* of inputs/gradients during forward/backward passes by scaling
* uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
+ * - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
*
* Inputs:
* - N: Number of examples in batch.
- * - D: Dimensionality of the input features.
+ * - D: Dimensionality of the input features (number of features).
* - M: Number of neurons in this layer.
*
* Outputs:
- * - W: Weights (parameters) matrix, of shape (D+M, M).
- * - b: Biases vector, of shape (1, M).
- * - out0: Dummy output matrix at previous timestep, of shape (N, M).
+ * - W: Weights, of shape (D+M, M).
+ * - b: Biases, of shape (1, M).
+ * - out0: Empty previous timestep output matrix, of shape (N, M).
*/
fan_in = D+M
fan_out = M
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml b/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
index a7066f2..185befb 100644
--- a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
@@ -22,33 +22,41 @@
/*
* Sigmoid nonlinearity layer.
*/
-forward = function(matrix[double] X) return (matrix[double] out) {
+
+forward = function(matrix[double] X)
+ return (matrix[double] out) {
/*
* Computes the forward pass for a sigmoid nonlinearity layer.
*
- * sigmoid(x) = 1 / (1 + e^-x)
+ * `sigmoid(x) = 1 / (1 + e^-x)`
+ *
+ * If `X` contains a single feature column, the output of a sigmoid
+ * layer can be interpreted as a predicted probability of a true
+ * class when paired with a log loss function in a binary
+ * classification problem.
*
* Inputs:
- * - X: Input data matrix, of shape (any, any).
+ * - X: Inputs, of shape (any, any).
*
* Outputs:
- * - out: Ouptuts, of same shape as X.
+ * - out: Outputs, of same shape as `X`.
*/
- out = 1 / (1 + exp(-X))
+ out = 1 / (1+exp(-X))
}
-backward = function(matrix[double] dout, matrix[double] X) return (matrix[double] dX) {
+backward = function(matrix[double] dout, matrix[double] X)
+ return (matrix[double] dX) {
/*
* Computes the backward pass for a sigmoid nonlinearity layer.
*
* Inputs:
- * - dout: Derivatives from upstream, of same shape as X.
- * - X: Previous input data matrix, of shape (any, any).
+ * - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+ * - X: Inputs, of shape (any, any).
*
* Outputs:
- * - dX: Gradient wrt X, of same shape as X.
+ * - dX: Gradient wrt `X`, of same shape as `X`.
*/
- out = 1 / (1 + exp(-X))
- dX = out * (1 - out) * dout
+ out = 1 / (1+exp(-X))
+ dX = out * (1-out) * dout
}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/softmax.dml b/scripts/staging/SystemML-NN/nn/layers/softmax.dml
index 854e8a8..1751838 100644
--- a/scripts/staging/SystemML-NN/nn/layers/softmax.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/softmax.dml
@@ -22,7 +22,9 @@
/*
* Softmax classifier layer.
*/
-forward = function(matrix[double] scores) return (matrix[double] probs) {
+
+forward = function(matrix[double] scores)
+ return (matrix[double] probs) {
/*
* Computes the forward pass for a softmax classifier. The inputs
* are interpreted as unnormalized, log-probabilities for each of
@@ -32,10 +34,10 @@ forward = function(matrix[double] scores) return (matrix[double] probs) {
* This can be interpreted as a generalization of the sigmoid
* function to multiple classes.
*
- * probs_ij = e^scores_ij / sum(e^scores_i)
+ * `probs_ij = e^scores_ij / sum(e^scores_i)`
*
* Inputs:
- * - scores: Input data matrix, of shape (N, D).
+ * - scores: Inputs, of shape (N, D).
*
* Outputs:
* - probs: Outputs, of shape (N, D).
@@ -56,20 +58,23 @@ backward = function(matrix[double] dprobs, matrix[double] scores)
/*
* Computes the backward pass for a softmax classifier.
*
- * Note that dscores_ij has multiple sources:
+ * Note that dscores_ij has multiple source branches:
*
- * dprobs_ij/dscores_ij = probs_ij * (1 - probs_ij)
- * dprobs_ik/dscores_ij = -probs_ik * probs_ij, for all k != j
+ * ```
+ * dprobs_ij/dscores_ij = probs_ij * (1 - probs_ij)
+ * dprobs_ik/dscores_ij = -probs_ik * probs_ij, for all k != j
*
- * dloss/dscores_ij = dloss/dprobs_ij * dprobs_ij/dscores_ij +
- * sum_{k!=j}(dloss/dprobs_ik * dprobs_ik/dscores_ij)
+ * dloss/dscores_ij =
+ * (dloss/dprobs_ij * dprobs_ij/dscores_ij)
+ * + sum_{k!=j}(dloss/dprobs_ik * dprobs_ik/dscores_ij)
+ * ```
*
* Inputs:
- * - dprobs: Derivatives from upstream, of shape (N, D).
- * - scores: Previous input data matrix, of shape (N, D).
+ * - dprobs: Gradient wrt `probs` from upstream, of shape (N, D).
+ * - scores: Inputs, of shape (N, D).
*
* Outputs:
- * - dscores: Gradient wrt scores, of shape (N, D).
+ * - dscores: Gradient wrt `scores`, of shape (N, D).
*/
scores = scores - rowMaxs(scores) # numerical stability
unnorm_probs = exp(scores) # unnormalized probabilities
@@ -77,6 +82,6 @@ backward = function(matrix[double] dprobs, matrix[double] scores)
# After some cancellation:
# dscores = dprobs*probs - probs*rowSums(dprobs*probs)
dtemp = dprobs * probs
- dscores = dtemp - probs * rowSums(dtemp)
+ dscores = dtemp - probs*rowSums(dtemp)
}
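
The `rowMaxs` shift in `forward` (and repeated in `backward`) is worth a worked
example. Softmax is invariant to adding a constant per row, since
`e^(s-c) / sum(e^(s-c)) = e^s / sum(e^s)`, so subtracting the row max avoids
overflow without changing the result. A sketch with deliberately large logits:

```
source("nn/layers/softmax.dml") as softmax

scores = matrix("1000 1001 1002", rows=1, cols=3)  # naive e^1000 would overflow
probs = softmax::forward(scores)                   # ~ (0.090, 0.245, 0.665)
print(toString(probs))
```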
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml b/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
index 53ca989..0185a2c 100644
--- a/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
@@ -39,7 +39,7 @@ forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
* introduces learnable parameters (gamma, beta) to control the
* amount of normalization.
*
- * y = ((x-mean) / sqrt(var+eps)) * gamma + beta
+ * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
*
* This implementation maintains exponential moving averages of the
* mean and variance during training for use during testing.
@@ -50,7 +50,7 @@ forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
* - https://arxiv.org/abs/1502.03167
*
* Inputs:
- * - X: Input data matrix, of shape (N, C*Hin*Win).
+ * - X: Inputs, of shape (N, C*Hin*Win).
* - gamma: Scale parameters, of shape (C, 1).
* - beta: Shift parameters, of shape (C, 1).
* - C: Number of input channels (dimensionality of input depth).
@@ -134,7 +134,7 @@ backward = function(matrix[double] dout, matrix[double] out,
* Computes the backward pass for a spatial batch normalization layer.
*
* Inputs:
- * - dout: Derivatives from upstream, of shape (N, C*Hin*Win).
+ * - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
* - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
* - ema_mean_upd: Updated exponential moving average of the mean
* from the forward pass, of shape (C, 1).
@@ -171,9 +171,9 @@ backward = function(matrix[double] dout, matrix[double] out,
* Typical values are in the range of [1e-5, 1e-3].
*
* Outputs:
- * - dX: Gradient wrt X, of shape (N, C*Hin*Win).
- * - dgamma: Gradient wrt W, of shape (C, 1).
- * - dbeta: Gradient wrt b, of shape (C, 1).
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ * - dgamma: Gradient wrt `gamma`, of shape (C, 1).
+ * - dbeta: Gradient wrt `beta`, of shape (C, 1).
*
*/
N = nrow(X)
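
To make the per-channel normalization concrete, here is a standalone sketch of
the statistics this layer computes, without calling the layer itself (sizes and
inputs are illustrative): each channel `c` is normalized by the mean and
(biased) variance taken over all `N*Hin*Win` of its values.

```
N = 4
C = 2
Hin = 3
Win = 3
X = rand(rows=N, cols=C*Hin*Win)
gamma = matrix(1, rows=C, cols=1)  # scale
beta = matrix(0, rows=C, cols=1)   # shift
eps = 1e-5
out = matrix(0, rows=N, cols=C*Hin*Win)
for (c in 1:C) {
  Xc = X[,(c-1)*Hin*Win+1:c*Hin*Win]        # all pixels of channel c
  mu_c = mean(Xc)                           # mean over N*Hin*Win values
  var_c = sum((Xc-mu_c)^2) / (N*Hin*Win)    # biased variance
  norm = (Xc-mu_c) / sqrt(var_c+eps)
  out[,(c-1)*Hin*Win+1:c*Hin*Win] = norm*as.scalar(gamma[c,1]) + as.scalar(beta[c,1])
}
```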
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/tanh.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/tanh.dml b/scripts/staging/SystemML-NN/nn/layers/tanh.dml
index 9308a7c..589a574 100644
--- a/scripts/staging/SystemML-NN/nn/layers/tanh.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/tanh.dml
@@ -24,38 +24,42 @@
*/
source("nn/layers/sigmoid.dml") as sigmoid
-forward = function(matrix[double] X) return (matrix[double] out) {
+forward = function(matrix[double] X)
+ return (matrix[double] out) {
/*
* Computes the forward pass for a tanh nonlinearity layer.
*
- * tanh(x) = (e^x - e^-x) / (e^x + e^-x)
- * = 2 * sigmoid(2x) - 1
+ * ```
+ * tanh(x) = (e^x - e^-x) / (e^x + e^-x)
+ * = 2 * sigmoid(2x) - 1
+ * ```
*
* Inputs:
- * - X: Input data matrix, of shape (any, any).
+ * - X: Inputs, of shape (any, any).
*
* Outputs:
- * - out: Ouptuts, of same shape as X.
+ * - out: Outputs, of same shape as `X`.
*/
# out = (exp(X) - exp(-X)) / (exp(X) + exp(-X))
# Simplification of the above formulation to use the sigmoid function:
sigma2X = sigmoid::forward(2*X)
- out = 2 * sigma2X - 1
+ out = 2*sigma2X - 1
}
-backward = function(matrix[double] dout, matrix[double] X) return (matrix[double] dX) {
+backward = function(matrix[double] dout, matrix[double] X)
+ return (matrix[double] dX) {
/*
* Computes the backward pass for a tanh nonlinearity layer.
*
* Inputs:
- * - dout: Derivatives from upstream, of same shape as X.
- * - X: Previous input data matrix, of shape (any, any).
+ * - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+ * - X: Inputs, of shape (any, any).
*
* Outputs:
- * - dX: Gradient wrt X, of same shape as X.
+ * - dX: Gradient wrt `X`, of same shape as `X`.
*/
sigma2X = sigmoid::forward(2*X)
- out = 2 * sigma2X - 1
- dX = (1 - out^2) * dout
+ out = 2*sigma2X - 1
+ dX = (1-out^2) * dout
}
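
For reference, the sigmoid identity used in both passes above can be verified
in a few steps:

```
2*sigmoid(2x) - 1 = 2/(1 + e^-2x) - 1
                  = (1 - e^-2x) / (1 + e^-2x)
                  = (e^x - e^-x) / (e^x + e^-x)   # multiply through by e^x
                  = tanh(x)
```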
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml b/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
index 688109b..20b26c4 100644
--- a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
@@ -22,6 +22,7 @@
/*
* Adagrad optimizer.
*/
+
update = function(matrix[double] X, matrix[double] dX, double lr, double epsilon,
matrix[double] cache)
return (matrix[double] X, matrix[double] cache) {
@@ -39,24 +40,25 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double epsilon
*
* Inputs:
* - X: Parameters to update, of shape (any, any).
- * - dX: Gradient of X wrt to a loss function being optimized, of
- * same shape as X.
+ * - dX: Gradient wrt `X` of a loss function being optimized, of
+ * same shape as `X`.
* - lr: Learning rate.
* - epsilon: Smoothing term to avoid divide by zero errors.
* Typical values are in the range of [1e-8, 1e-4].
* - cache: State that maintains per-parameter sum of squared
- * gradients, of same shape as X.
+ * gradients, of same shape as `X`.
*
* Outputs:
- * - X: Updated parameters X, of same shape as input X.
- * - v: Updated velocity of the parameters X, of same shape as
- * input v.
+ * - X: Updated parameters `X`, of same shape as input `X`.
+ * - cache: State that maintains per-parameter sum of squared
+ * gradients, of same shape as `X`.
*/
cache = cache + dX^2
- X = X - lr * dX / (sqrt(cache) + epsilon)
+ X = X - (lr * dX / (sqrt(cache)+epsilon))
}
-init = function(matrix[double] X) return (matrix[double] cache) {
+init = function(matrix[double] X)
+ return (matrix[double] cache) {
/*
* Initialize the state for this optimizer.
*
@@ -65,10 +67,10 @@ init = function(matrix[double] X) return (matrix[double] cache) {
*
* Inputs:
* - X: Parameters to update, of shape (any, any).
- *
+ *
* Outputs:
* - cache: State that maintains per-parameter sum of squared
- * gradients, of same shape as X.
+ * gradients, of same shape as `X`.
*/
cache = matrix(0, rows=nrow(X), cols=ncol(X))
}
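
A usage sketch for this optimizer (the gradient here is a placeholder standing
in for a real backward pass):

```
source("nn/optim/adagrad.dml") as adagrad

W = rand(rows=100, cols=10)    # parameters
cache = adagrad::init(W)       # per-parameter sum of squared gradients
lr = 0.01
epsilon = 1e-8
dW = rand(rows=100, cols=10)   # placeholder gradient of the loss wrt W
[W, cache] = adagrad::update(W, dW, lr, epsilon, cache)
```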
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/adam.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/adam.dml b/scripts/staging/SystemML-NN/nn/optim/adam.dml
index a25f74d..0607fa5 100644
--- a/scripts/staging/SystemML-NN/nn/optim/adam.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/adam.dml
@@ -22,6 +22,7 @@
/*
* Adam optimizer.
*/
+
update = function(matrix[double] X, matrix[double] dX, double lr, double beta1, double beta2,
double epsilon, int t, matrix[double] m, matrix[double] v)
return (matrix[double] X, matrix[double] m, matrix[double] v) {
@@ -34,8 +35,8 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double beta1,
*
* Inputs:
* - X: Parameters to update, of shape (any, any).
- * - dX: Gradient of X wrt to a loss function being optimized, of
- * same shape as X.
+ * - dX: Gradient wrt `X` of a loss function being optimized, of
+ * same shape as `X`.
* - lr: Learning rate. Recommended value is 0.001.
* - beta1: Exponential decay rate for the 1st moment estimates.
* Recommended value is 0.9.
@@ -46,32 +47,33 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double beta1,
* - t: Timestep, starting at 0.
* - m: State containing the 1st moment (mean) estimate by
* maintaining exponential moving averages of the gradients, of
- * same shape as X.
+ * same shape as `X`.
* - v: State containing the 2nd raw moment (uncentered variance)
* estimate by maintaining exponential moving averages of the
- * squared gradients, of same shape as X.
+ * squared gradients, of same shape as `X`.
*
* Outputs:
- * - X: Updated parameters X, of same shape as input X.
+ * - X: Updated parameters `X`, of same shape as input `X`.
* - m: Updated state containing the 1st moment (mean) estimate by
* maintaining exponential moving averages of the gradients, of
- * same shape as X.
+ * same shape as `X`.
* - v: Updated state containing the 2nd raw moment (uncentered
* variance) estimate by maintaining exponential moving averages
- * of the squared gradients, of same shape as X.
+ * of the squared gradients, of same shape as `X`.
*/
t = t + 1
- m = beta1 * m + (1 - beta1) * dX # update biased 1st moment estimate
- v = beta2 * v + (1 - beta2) * dX^2 # update biased 2nd raw moment estimate
- #m = m / (1 - beta1^t) # compute bias-corrected 1st moment estimate
- #v = v / (1 - beta2^t) # compute bias-corrected 2nd raw moment estimate
- #X = X - lr * m / (sqrt(v) + epsilon) # param update
+ m = beta1*m + (1-beta1)*dX # update biased 1st moment estimate
+ v = beta2*v + (1-beta2)*dX^2 # update biased 2nd raw moment estimate
+ # m = m / (1-beta1^t) # compute bias-corrected 1st moment estimate
+ # v = v / (1-beta2^t) # compute bias-corrected 2nd raw moment estimate
+ # X = X - (lr * m / (sqrt(v)+epsilon)) # param update
# Simplified for computational efficiency:
- lr = lr * sqrt(1 - beta2^t) / (1 - beta1^t)
- X = X - lr * m / (sqrt(v) + epsilon)
+ lr = lr * sqrt(1-beta2^t) / (1-beta1^t)
+ X = X - (lr * m / (sqrt(v)+epsilon))
}
-init = function(matrix[double] X) return (matrix[double] m, matrix[double] v) {
+init = function(matrix[double] X)
+ return (matrix[double] m, matrix[double] v) {
/*
* Initialize the state for this optimizer.
*
@@ -80,14 +82,14 @@ init = function(matrix[double] X) return (matrix[double] m, matrix[double] v) {
*
* Inputs:
* - X: Parameters to update, of shape (any, any).
- *
+ *
* Outputs:
* - m: Initial state containing the 1st moment (mean) estimate by
* maintaining exponential moving averages of the gradients, of
- * same shape as X.
+ * same shape as `X`.
* - v: Initial state containing the 2nd raw moment (uncentered
* variance) estimate by maintaining exponential moving averages
- * of the squared gradients, of same shape as X.
+ * of the squared gradients, of same shape as `X`.
*/
m = matrix(0, rows=nrow(X), cols=ncol(X))
v = matrix(0, rows=nrow(X), cols=ncol(X))
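
The "simplified for computational efficiency" step above folds the two bias
corrections into the learning rate. The algebra, which is exact except for
where `epsilon` sits relative to the rescaling (a standard approximation from
the Adam paper):

```
m_hat = m / (1-beta1^t)                      # bias-corrected 1st moment
v_hat = v / (1-beta2^t)                      # bias-corrected 2nd raw moment
lr * m_hat / sqrt(v_hat)
  = lr * (m/(1-beta1^t)) / (sqrt(v)/sqrt(1-beta2^t))
  = (lr * sqrt(1-beta2^t) / (1-beta1^t)) * m / sqrt(v)
```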
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml b/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
index e256000..80c75a0 100644
--- a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
@@ -22,6 +22,7 @@
/*
* RMSprop optimizer.
*/
+
update = function(matrix[double] X, matrix[double] dX, double lr, double decay_rate,
double epsilon, matrix[double] cache)
return (matrix[double] X, matrix[double] cache) {
@@ -39,26 +40,27 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double decay_r
*
* Inputs:
* - X: Parameters to update, of shape (any, any).
- * - dX: Gradient of X wrt to a loss function being optimized, of
- * same shape as X.
+ * - dX: Gradient wrt `X` of a loss function being optimized, of
+ * same shape as `X`.
* - lr: Learning rate.
* - decay_rate: Term controlling the rate of the moving average.
* Typical values are in the range of [0.9, 0.999].
* - epsilon: Smoothing term to avoid divide by zero errors.
* Typical values are in the range of [1e-8, 1e-4].
* - cache: State that maintains the moving average of the squared
- * gradients, of same shape as X.
+ * gradients, of same shape as `X`.
*
* Outputs:
- * - X: Updated parameters X, of same shape as input X.
- * - v: Updated velocity of the parameters X, of same shape as
- * input v.
+ * - X: Updated parameters `X`, of same shape as input `X`.
+ * - cache: Updated state that maintains the moving average of the
+ * squared gradients, of same shape as `X`.
*/
- cache = decay_rate * cache + (1 - decay_rate) * dX^2
- X = X - lr * dX / (sqrt(cache) + epsilon)
+ cache = decay_rate*cache + (1-decay_rate)*dX^2
+ X = X - (lr * dX / (sqrt(cache)+epsilon))
}
-init = function(matrix[double] X) return (matrix[double] cache) {
+init = function(matrix[double] X)
+ return (matrix[double] cache) {
/*
* Initialize the state for this optimizer.
*
@@ -67,10 +69,10 @@ init = function(matrix[double] X) return (matrix[double] cache) {
*
* Inputs:
* - X: Parameters to update, of shape (any, any).
- *
+ *
* Outputs:
* - cache: State that maintains the moving average of the squared
- * gradients, of same shape as X.
+ * gradients, of same shape as `X`.
*/
cache = matrix(0, rows=nrow(X), cols=ncol(X))
}
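
Side by side, the only substantive difference from Adagrad is the cache update;
RMSprop's leaky moving average forgets old gradients, so effective step sizes
do not shrink towards zero over long runs the way Adagrad's do:

```
cache = cache + dX^2                            # Adagrad: monotone sum
cache = decay_rate*cache + (1-decay_rate)*dX^2  # RMSprop: leaky moving average
```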
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/sgd.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd.dml b/scripts/staging/SystemML-NN/nn/optim/sgd.dml
index 554569a..a3fc744 100644
--- a/scripts/staging/SystemML-NN/nn/optim/sgd.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/sgd.dml
@@ -22,19 +22,21 @@
/*
* Stochastic Gradient Descent (SGD) optimizer.
*/
-update = function(matrix[double] X, matrix[double] dX, double lr) return (matrix[double] X) {
+
+update = function(matrix[double] X, matrix[double] dX, double lr)
+ return (matrix[double] X) {
/*
* Performs a vanilla SGD update.
*
* Inputs:
* - X: Parameters to update, of shape (any, any).
- * - dX: Gradient of X wrt to a loss function being optimized, of
- * same shape as X.
+ * - dX: Gradient wrt `X` of a loss function being optimized, of
+ * same shape as `X`.
* - lr: Learning rate.
*
* Outputs:
- * - X: Updated parameters X, of same shape as input X.
+ * - X: Updated parameters `X`, of same shape as input `X`.
*/
- X = X - lr * dX
+ X = X - lr*dX
}
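
Minimal usage, again with a placeholder gradient standing in for a real
backward pass:

```
source("nn/optim/sgd.dml") as sgd

W = rand(rows=100, cols=10)
dW = rand(rows=100, cols=10)  # placeholder gradient of the loss wrt W
W = sgd::update(W, dW, 0.01)  # W <- W - lr*dW
```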
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
index c2a441b..2cb9890 100644
--- a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
@@ -22,6 +22,7 @@
/*
* Stochastic Gradient Descent with momentum (SGD-momentum) optimizer.
*/
+
update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v)
return (matrix[double] X, matrix[double] v) {
/*
@@ -33,25 +34,26 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double mu, mat
*
* Inputs:
* - X: Parameters to update, of shape (any, any).
- * - dX: Gradient of X wrt to a loss function being optimized, of
- * same shape as X.
+ * - dX: Gradient wrt `X` of a loss function being optimized, of
+ * same shape as `X`.
* - lr: Learning rate.
* - mu: Momentum value.
* Typical values are in the range of [0.5, 0.99], usually
* started at the lower end and annealed towards the higher end.
- * - v: State maintaining the velocity of the parameters X, of same
- * shape as X.
+ * - v: State maintaining the velocity of the parameters `X`, of same
+ * shape as `X`.
*
* Outputs:
- * - X: Updated parameters X, of same shape as input X.
- * - v: Updated velocity of the parameters X, of same shape as
- * input v.
+ * - X: Updated parameters `X`, of same shape as input `X`.
+ * - v: Updated velocity of the parameters `X`, of same shape as
+ * input `X`.
*/
- v = mu * v - lr * dX # update velocity
+ v = mu*v - lr*dX # update velocity
X = X + v # update position
}
-init = function(matrix[double] X) return (matrix[double] v) {
+init = function(matrix[double] X)
+ return (matrix[double] v) {
/*
* Initialize the state for this optimizer.
*
@@ -60,9 +62,9 @@ init = function(matrix[double] X) return (matrix[double] v) {
*
* Inputs:
* - X: Parameters to update, of shape (any, any).
- *
+ *
* Outputs:
- * - v: Initial velocity of the parameters X.
+ * - v: Initial velocity of the parameters `X`.
*/
v = matrix(0, rows=nrow(X), cols=ncol(X))
}
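
A usage sketch over a few iterations (placeholder gradients; in practice `mu`
would be annealed from ~0.5 towards ~0.99 over training, as the docs suggest):

```
source("nn/optim/sgd_momentum.dml") as sgd_momentum

W = rand(rows=100, cols=10)
v = sgd_momentum::init(W)       # zero initial velocity
lr = 0.01
mu = 0.5
for (i in 1:3) {
  dW = rand(rows=100, cols=10)  # placeholder gradient
  [W, v] = sgd_momentum::update(W, dW, lr, mu, v)
}
```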
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
index 56c6ab0..fee6585 100644
--- a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
@@ -22,6 +22,7 @@
/*
* Stochastic Gradient Descent with Nesterov momentum (SGD-Nesterov) optimizer.
*/
+
update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v)
return (matrix[double] X, matrix[double] v) {
/*
@@ -36,19 +37,20 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double mu, mat
* store the parameters in their position after momentum.
*
* Reference:
- * - Advances in optimizing Recurrent Networks, Bengio et al., section 3.5.
+ * - Advances in optimizing Recurrent Networks, Bengio et al.,
+ * section 3.5.
* - http://arxiv.org/abs/1212.0901
*
* Inputs:
* - X: Parameters to update, of shape (any, any).
- * - dX: Gradient of X wrt to a loss function being optimized, of
- * same shape as X.
+ * - dX: Gradient wrt `X` of a loss function being optimized, of
+ * same shape as `X`.
* - lr: Learning rate.
* - mu: Momentum value.
* Typical values are in the range of [0.5, 0.99], usually
* started at the lower end and annealed towards the higher end.
- * - v: State maintaining the velocity of the parameters X, of same
- * shape as X.
+ * - v: State maintaining the velocity of the parameters `X`, of same
+ * shape as `X`.
*
* Outputs:
* - X: Updated parameters X, of same shape as input X.
@@ -56,11 +58,12 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double mu, mat
* input v.
*/
v_prev = v
- v = mu * v - lr * dX # update velocity
- X = X - mu * v_prev + (1 + mu) * v # update position, including momentum
+ v = mu*v - lr*dX # update velocity
+ X = X - mu*v_prev + (1+mu)*v # update position, including momentum
}
-init = function(matrix[double] X) return (matrix[double] v) {
+init = function(matrix[double] X)
+ return (matrix[double] v) {
/*
* Initialize the state for this optimizer.
*
@@ -69,9 +72,9 @@ init = function(matrix[double] X) return (matrix[double] v) {
*
* Inputs:
* - X: Parameters to update, of shape (any, any).
- *
+ *
* Outputs:
- * - v: Initial velocity of the parameters X.
+ * - v: Initial velocity of the parameters `X`.
*/
v = matrix(0, rows=nrow(X), cols=ncol(X))
}
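
The slightly opaque position update above follows from the change of variables
the docs mention, i.e. storing the parameters at their post-momentum
("lookahead") position. Writing `X~ = X + mu*v` for the stored position, a
short derivation:

```
# Standard Nesterov step, gradient evaluated at the lookahead point:
#   v_new = mu*v - lr*dX           (dX is the gradient at X + mu*v, i.e. at X~)
#   X_new = X + v_new
# The new stored position is then:
#   X~_new = X_new + mu*v_new
#          = (X~ - mu*v) + (1+mu)*v_new
# which is exactly `X = X - mu*v_prev + (1+mu)*v` in the code.
```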