Posted to commits@systemml.apache.org by du...@apache.org on 2017/04/01 01:42:35 UTC

[2/7] incubator-systemml git commit: [SYSTEMML-1452] General code cleanup of SystemML-NN

[SYSTEMML-1452] General code cleanup of SystemML-NN

This commit performs a general code & documentation cleanup across the
library.

Closes #447.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/16b1cbd7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/16b1cbd7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/16b1cbd7

Branch: refs/heads/master
Commit: 16b1cbd72601afbed0b19c1d4125a898fd324b1c
Parents: 2e48d95
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Fri Mar 31 18:38:15 2017 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Fri Mar 31 18:38:16 2017 -0700

----------------------------------------------------------------------
 projects/breast_cancer/hyperparam_tuning.dml    |   8 +-
 projects/breast_cancer/softmax_clf.dml          |  16 +--
 .../staging/SystemML-NN/nn/layers/affine.dml    |  36 ++++---
 .../SystemML-NN/nn/layers/batch_norm.dml        |  17 +--
 scripts/staging/SystemML-NN/nn/layers/conv.dml  |  50 ++++-----
 .../SystemML-NN/nn/layers/conv_builtin.dml      |  63 ++++++-----
 .../nn/layers/cross_entropy_loss.dml            |  29 +++--
 .../staging/SystemML-NN/nn/layers/dropout.dml   |  23 ++--
 .../staging/SystemML-NN/nn/layers/l1_loss.dml   |  29 +++--
 .../staging/SystemML-NN/nn/layers/l1_reg.dml    |  15 +--
 .../staging/SystemML-NN/nn/layers/l2_loss.dml   |  29 +++--
 .../staging/SystemML-NN/nn/layers/l2_reg.dml    |  15 +--
 .../staging/SystemML-NN/nn/layers/log_loss.dml  |  40 ++++---
 scripts/staging/SystemML-NN/nn/layers/lstm.dml  |  65 ++++++------
 .../staging/SystemML-NN/nn/layers/max_pool.dml  |  15 +--
 .../SystemML-NN/nn/layers/max_pool_builtin.dml  |  14 +--
 scripts/staging/SystemML-NN/nn/layers/relu.dml  |  22 ++--
 scripts/staging/SystemML-NN/nn/layers/rnn.dml   |  43 ++++----
 .../staging/SystemML-NN/nn/layers/sigmoid.dml   |  30 ++++--
 .../staging/SystemML-NN/nn/layers/softmax.dml   |  29 ++---
 .../nn/layers/spatial_batch_norm.dml            |  12 +--
 scripts/staging/SystemML-NN/nn/layers/tanh.dml  |  28 ++---
 .../staging/SystemML-NN/nn/optim/adagrad.dml    |  22 ++--
 scripts/staging/SystemML-NN/nn/optim/adam.dml   |  38 +++----
 .../staging/SystemML-NN/nn/optim/rmsprop.dml    |  24 +++--
 scripts/staging/SystemML-NN/nn/optim/sgd.dml    |  12 ++-
 .../SystemML-NN/nn/optim/sgd_momentum.dml       |  24 +++--
 .../SystemML-NN/nn/optim/sgd_nesterov.dml       |  23 ++--
 .../staging/SystemML-NN/nn/test/conv_simple.dml |  51 ++++-----
 .../staging/SystemML-NN/nn/test/grad_check.dml  | 106 ++++++++++---------
 .../SystemML-NN/nn/test/max_pool_simple.dml     |  18 ++--
 scripts/staging/SystemML-NN/nn/util.dml         |  46 ++++----
 32 files changed, 549 insertions(+), 443 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/projects/breast_cancer/hyperparam_tuning.dml
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/hyperparam_tuning.dml b/projects/breast_cancer/hyperparam_tuning.dml
index 464c659..4f054c3 100644
--- a/projects/breast_cancer/hyperparam_tuning.dml
+++ b/projects/breast_cancer/hyperparam_tuning.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -66,7 +66,9 @@ parfor(j in 1:10000) {
   log_interval = 10
 
   # Train
-  [Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2] = clf::train(X, Y, X_val, Y_val, C, Hin, Win, lr, mu, decay, lambda, batch_size, epochs, log_interval, dir)
+  [Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2] =
+      clf::train(X, Y, X_val, Y_val, C, Hin, Win, lr, mu, decay, lambda, batch_size, epochs,
+                 log_interval, dir)
 
   # Eval
   #probs = clf::predict(X, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/projects/breast_cancer/softmax_clf.dml
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/softmax_clf.dml b/projects/breast_cancer/softmax_clf.dml
index e106a36..35fd545 100644
--- a/projects/breast_cancer/softmax_clf.dml
+++ b/projects/breast_cancer/softmax_clf.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -79,7 +79,7 @@ train = function(matrix[double] X, matrix[double] Y,
   accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
   # Output results
   print("Start: Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
-  
+
   # Optimize
   print("Starting optimization")
   iters = ceil(N / batch_size)
@@ -152,7 +152,7 @@ predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
    */
   N = nrow(X)  # num examples
   K = ncol(W)  # num classes
-  
+
   # Compute forward pass
   ## affine & softmax:
   out = affine::forward(X, W, b)
@@ -185,7 +185,7 @@ eval = function(matrix[double] probs, matrix[double] Y)
 generate_dummy_data = function()
     return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) {
   /*
-   * Generate a dummy dataset similar to the MNIST dataset.
+   * Generate a dummy dataset similar to the breast cancer dataset.
    *
    * Outputs:
    *  - X: Input data matrix, of shape (N, D).
@@ -196,9 +196,9 @@ generate_dummy_data = function()
    */
   # Generate dummy input data
   N = 1024  # num examples
-  C = 1  # num input channels
-  Hin = 28  # input height
-  Win = 28  # input width
+  C = 3  # num input channels
+  Hin = 256  # input height
+  Win = 256  # input width
   T = 10  # num targets
   X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
   classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/affine.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/affine.dml b/scripts/staging/SystemML-NN/nn/layers/affine.dml
index 6a4c210..f9f8559 100644
--- a/scripts/staging/SystemML-NN/nn/layers/affine.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/affine.dml
@@ -22,6 +22,7 @@
 /*
  * Fully-connected (affine) layer.
  */
+
 forward = function(matrix[double] X, matrix[double] W, matrix[double] b)
     return (matrix[double] out) {
   /*
@@ -29,9 +30,9 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b)
    * M neurons.  The input data has N examples, each with D features.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, D).
-   *  - W: Weights (parameters) matrix, of shape (D, M).
-   *  - b: Biases vector, of shape (1, M).
+   *  - X: Inputs, of shape (N, D).
+   *  - W: Weights, of shape (D, M).
+   *  - b: Biases, of shape (1, M).
    *
    * Outputs:
    *  - out: Outputs, of shape (N, M).
@@ -47,15 +48,15 @@ backward = function(matrix[double] dout, matrix[double] X,
    * with M neurons.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of shape (N, M).
-   *  - X: Previous input data matrix, of shape (N, D).
-   *  - W: Weights (parameters) matrix, of shape (D, M).
-   *  - b: Biases vector, of shape (1, M).
+   *  - dout: Gradient wrt `out` from upstream, of shape (N, M).
+   *  - X: Inputs, of shape (N, D).
+   *  - W: Weights, of shape (D, M).
+   *  - b: Biases, of shape (1, M).
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, D).
-   *  - dW: Gradient wrt W, of shape (D, M).
-   *  - db: Gradient wrt b, of shape (1, M).
+   *  - dX: Gradient wrt `X`, of shape (N, D).
+   *  - dW: Gradient wrt `W`, of shape (D, M).
+   *  - db: Gradient wrt `b`, of shape (1, M).
    */
   dX = dout %*% t(W)
   dW = t(X) %*% dout
@@ -70,18 +71,19 @@ init = function(int D, int M)
    * Note: This is just a convenience function, and parameters
    * may be initialized manually if needed.
    *
-   * We use the heuristic by He et al. [http://arxiv.org/abs/1502.01852],
-   * which limits the magnification of inputs/gradients during
-   * forward/backward passes by scaling unit-Gaussian weights by a
-   * factor of sqrt(2/n), under the assumption of relu neurons.
+   * We use the heuristic by He et al., which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * unit-Gaussian weights by a factor of sqrt(2/n), under the
+   * assumption of relu neurons.
+   *  - http://arxiv.org/abs/1502.01852
    *
    * Inputs:
-   *  - D: Dimensionality of the input features.
+   *  - D: Dimensionality of the input features (number of features).
    *  - M: Number of neurons in this layer.
    *
    * Outputs:
-   *  - W: Weight matrix, of shape (D, M).
-   *  - b: Biases vector, of shape (1, M).
+   *  - W: Weights, of shape (D, M).
+   *  - b: Biases, of shape (1, M).
    */
   W = rand(rows=D, cols=M, pdf="normal") * sqrt(2.0/D)
   b = matrix(0, rows=1, cols=M)

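For orientation, here is a minimal usage sketch of the cleaned-up affine API. The sizes are illustrative, and the `source` path assumes execution from the SystemML-NN root directory.

  source("nn/layers/affine.dml") as affine

  N = 4  # illustrative batch size
  D = 8  # illustrative number of features
  M = 3  # illustrative number of neurons
  X = rand(rows=N, cols=D, pdf="normal")
  [W, b] = affine::init(D, M)  # He-scaled: unit-Gaussian weights * sqrt(2/D)
  out = affine::forward(X, W, b)  # shape (N, M)
  dout = rand(rows=N, cols=M, pdf="normal")  # stand-in upstream gradient
  [dX, dW, db] = affine::backward(dout, X, W, b)
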
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
index d332e8c..82240f7 100644
--- a/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
@@ -22,6 +22,7 @@
 /*
  * Batch normalization layer.
  */
+
 forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
                    string mode, matrix[double] ema_mean, matrix[double] ema_var,
                    double mu, double epsilon)
@@ -36,7 +37,7 @@ forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
    * introduces learnable parameters (gamma, beta) to control the
    * amount of normalization.
    *
-   *    y = ((x-mean) / sqrt(var+eps)) * gamma + beta
+   *   `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
    *
    * This implementation maintains exponential moving averages of the
    * mean and variance during training for use during testing.
@@ -47,7 +48,7 @@ forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
    *    - https://arxiv.org/abs/1502.03167
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, D).
+   *  - X: Inputs, of shape (N, D).
    *  - gamma: Scale parameters, of shape (1, D).
    *  - beta: Shift parameters, of shape (1, D).
    *  - mode: 'train' or 'test' to indicate if the model is currently
@@ -118,7 +119,7 @@ backward = function(matrix[double] dout, matrix[double] out,
    * Computes the backward pass for a batch normalization layer.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of shape (N, D).
+   *  - dout: Gradient wrt `out` from upstream, of shape (N, D).
    *  - out: Outputs from the forward pass, of shape (N, D).
    *  - ema_mean_upd: Updated exponential moving average of the mean
    *      from the forward pass, of shape (1, D).
@@ -133,7 +134,7 @@ backward = function(matrix[double] dout, matrix[double] out,
    *  - cache_norm: Cache of the normalized inputs from the forward
    *      pass, of shape (N, D).  Note: This is used for performance
    *      during training.
-   *  - X: Input data matrix to the forward pass, of shape (N, D).
+   *  - X: Inputs, of shape (N, D).
    *  - gamma: Scale parameters, of shape (1, D).
    *  - beta: Shift parameters, of shape (1, D).
    *  - mode: 'train' or 'test' to indicate if the model is currently
@@ -151,9 +152,9 @@ backward = function(matrix[double] dout, matrix[double] out,
    *      Typical values are in the range of [1e-5, 1e-3].
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, D).
-   *  - dgamma: Gradient wrt W, of shape (1, D).
-   *  - dbeta: Gradient wrt b, of shape (1, D).
+   *  - dX: Gradient wrt `X`, of shape (N, D).
+   *  - dgamma: Gradient wrt `W`, of shape (1, D).
+   *  - dbeta: Gradient wrt `b`, of shape (1, D).
    *
    */
   N = nrow(X)
@@ -190,7 +191,7 @@ init = function(int D)
    * may be initialized manually if needed.
    *
    * Inputs:
-   *  - D: Dimensionality of the input features.
+   *  - D: Dimensionality of the input features (number of features).
    *
    * Outputs:
    *  - gamma: Scale parameters, of shape (1, D).

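To make the normalization equation concrete, the following sketch computes the train-time statistics directly. This is a standalone illustration of the equation, not the layer's internals verbatim; it assumes standard DML builtins (`colMeans`, `colSums`) and row-vector broadcasting.

  N = 8
  D = 4
  X = rand(rows=N, cols=D, pdf="normal")
  gamma = matrix(1, rows=1, cols=D)  # initial scale
  beta = matrix(0, rows=1, cols=D)   # initial shift
  eps = 1e-5
  mean = colMeans(X)                 # shape (1, D)
  var = colSums((X-mean)^2) / N      # biased variance, shape (1, D)
  norm = (X-mean) / sqrt(var+eps)    # zero mean, unit variance per feature
  Y = norm*gamma + beta              # y = ((x-mean) / sqrt(var+eps)) * gamma + beta
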
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/conv.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv.dml b/scripts/staging/SystemML-NN/nn/layers/conv.dml
index cc60a46..435b3cf 100644
--- a/scripts/staging/SystemML-NN/nn/layers/conv.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/conv.dml
@@ -39,9 +39,9 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
    * output maps.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
-   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
-   *  - b: Biases vector, of shape (F, 1).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
@@ -50,14 +50,14 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
    *  - strideh: Stride over height.
    *  - stridew: Stride over width.
    *  - padh: Padding for top and bottom sides.
-   *      For same output height as input, set padh = (Hf - 1) / 2,
-   *      assuming strideh = 1.
-   *      More generally, padh = (Hin*(strideh-1) + Hf - strideh) / 2
+   *      For same output height as input, set `padh = (Hf - 1) / 2`,
+   *      assuming `strideh = 1`.
+   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
    *      preserves the spatial dimensions of the input.
    *  - padw: Padding for left and right sides.
-   *      For same output width as input, set padw = (Wf - 1) / 2,
-   *      assuming stridew = 1.
-   *      More generally, padw = (Win*(stridew-1) + Wf - stridew) / 2
+   *      For same output width as input, set `padw = (Wf - 1) / 2`,
+   *      assuming `stridew = 1`.
+   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
    *      preserves the spatial dimensions of the input.
    *
    * Outputs:
@@ -67,8 +67,8 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
    */
   N = nrow(X)
   F = nrow(W)
-  Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1)
-  Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
 
   # Create output volume
   out = matrix(0, rows=N, cols=F*Hout*Wout)
@@ -101,12 +101,13 @@ backward = function(matrix[double] dout, int Hout, int Wout,
    * This implementation uses `im2col` and `col2im` internally.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of shape (N, F*Hout*Wout).
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, F*Hout*Wout).
    *  - Hout: Output height.
    *  - Wout: Output width.
-   *  - X: Previous input data matrix, of shape (N, C*Hin*Win).
-   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
-   *  - b: Biases vector, of shape (F, 1).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
@@ -118,9 +119,9 @@ backward = function(matrix[double] dout, int Hout, int Wout,
    *  - padw: Padding for left and right sides.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, C*Hin*Win).
-   *  - dW: Gradient wrt W, of shape (F, C*Hf*Wf).
-   *  - db: Gradient wrt b, of shape (F, 1).
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+   *  - db: Gradient wrt `b`, of shape (F, 1).
    */
   N = nrow(X)
   F = nrow(W)
@@ -171,10 +172,11 @@ init = function(int F, int C, int Hf, int Wf)
    * Note: This is just a convenience function, and parameters
    * may be initialized manually if needed.
    *
-   * We use the heuristic by He et al. [http://arxiv.org/abs/1502.01852],
-   * which limits the magnification of inputs/gradients during
-   * forward/backward passes by scaling unit-Gaussian weights by a
-   * factor of sqrt(2/n), under the assumption of relu neurons.
+   * We use the heuristic by He et al., which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * unit-Gaussian weights by a factor of sqrt(2/n), under the
+   * assumption of relu neurons.
+   *  - http://arxiv.org/abs/1502.01852
    *
    * Inputs:
    *  - F: Number of filters.
@@ -183,8 +185,8 @@ init = function(int F, int C, int Hf, int Wf)
    *  - Wf: Filter width.
    *
    * Outputs:
-   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
-   *  - b: Biases vector, of shape (F, 1).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
    */
   W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
   b = matrix(0, rows=F, cols=1)

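As a concrete check of the padding formula above: with `Hin = 8`, `Hf = 3`, and `strideh = 1`, 'same' padding gives `padh = (3-1)/2 = 1`, so `Hout = (8 + 2*1 - 3)/1 + 1 = 8`. A hypothetical forward call with illustrative sizes:

  source("nn/layers/conv.dml") as conv

  N = 2
  C = 3
  F = 4
  Hin = 8
  Win = 8
  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
  [W, b] = conv::init(F, C, 3, 3)  # 3x3 filters, He-scaled
  # stride 1, pad 1 ('same'): output spatial dims match the input
  [out, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, 3, 3, 1, 1, 1, 1)
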
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
index 44df74a..c2b809e 100644
--- a/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
@@ -22,6 +22,7 @@
 /*
  * 2D Convolutional layer.
  */
+
 forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
                    int C, int Hin, int Win, int Hf, int Wf,
                    int strideh, int stridew, int padh, int padw)
@@ -32,10 +33,10 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
    * volume unrolled into a single vector.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
-   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
-   *  - b: Biases vector, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of depth).
    *  - Hin: Input height.
    *  - Win: Input width.
    *  - Hf: Filter height.
@@ -43,14 +44,14 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
    *  - strideh: Stride over height.
    *  - stridew: Stride over width.
    *  - padh: Padding for top and bottom sides.
-   *      For same output height as input, set padh = (Hf - 1) / 2,
-   *      assuming strideh = 1.
-   *      More generally, padh = (Hin*(strideh-1) + Hf - strideh) / 2
+   *      For same output height as input, set `padh = (Hf - 1) / 2`,
+   *      assuming `strideh = 1`.
+   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
    *      preserves the spatial dimensions of the input.
    *  - padw: Padding for left and right sides.
-   *      For same output width as input, set padw = (Wf - 1) / 2,
-   *      assuming stridew = 1.
-   *      More generally, padw = (Win*(stridew-1) + Wf - stridew) / 2
+   *      For same output width as input, set `padw = (Wf - 1) / 2`,
+   *      assuming `stridew = 1`.
+   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
    *      preserves the spatial dimensions of the input.
    *
    * Outputs:
@@ -60,8 +61,8 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
    */
   N = nrow(X)
   F = nrow(W)
-  Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1)
-  Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
 
   # Convolution - built-in implementation
   out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
@@ -81,13 +82,14 @@ backward = function(matrix[double] dout, int Hout, int Wout,
    * with F filters.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of shape (N, F*Hout*Wout).
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, F*Hout*Wout).
    *  - Hout: Output height.
    *  - Wout: Output width.
-   *  - X: Previous input data matrix, of shape (N, C*Hin*Win).
-   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
-   *  - b: Biases vector, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of depth).
    *  - Hin: Input height.
    *  - Win: Input width.
    *  - Hf: Filter height.
@@ -95,12 +97,20 @@ backward = function(matrix[double] dout, int Hout, int Wout,
    *  - strideh: Stride over height.
    *  - stridew: Stride over width.
    *  - padh: Padding for top and bottom sides.
+   *      For same output height as input, set `padh = (Hf - 1) / 2`,
+   *      assuming `strideh = 1`.
+   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+   *      preserves the spatial dimensions of the input.
    *  - padw: Padding for left and right sides.
+   *      For same output width as input, set `padw = (Wf - 1) / 2`,
+   *      assuming `stridew = 1`.
+   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+   *      preserves the spatial dimensions of the input.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, C*Hin*Win).
-   *  - dW: Gradient wrt W, of shape (F, C*Hf*Wf).
-   *  - db: Gradient wrt b, of shape (F, 1).
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+   *  - db: Gradient wrt `b`, of shape (F, 1).
    */
   N = nrow(X)
   F = nrow(W)
@@ -123,10 +133,11 @@ init = function(int F, int C, int Hf, int Wf)
    * Note: This is just a convenience function, and parameters
    * may be initialized manually if needed.
    *
-   * We use the heuristic by He et al. [http://arxiv.org/abs/1502.01852],
-   * which limits the magnification of inputs/gradients during
-   * forward/backward passes by scaling unit-Gaussian weights by a
-   * factor of sqrt(2/n), under the assumption of relu neurons.
+   * We use the heuristic by He et al., which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * unit-Gaussian weights by a factor of sqrt(2/n), under the
+   * assumption of relu neurons.
+   *  - http://arxiv.org/abs/1502.01852
    *
    * Inputs:
    *  - F: Number of filters.
@@ -135,8 +146,8 @@ init = function(int F, int C, int Hf, int Wf)
    *  - Wf: Filter width.
    *
    * Outputs:
-   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
-   *  - b: Biases vector, of shape (F, 1).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
    */
   W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
   b = matrix(0, rows=F, cols=1)

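The builtin variant delegates to SystemML's native `conv2d` op shown in the hunk above; a minimal direct call with illustrative shapes (the layer itself additionally adds the bias `b` to each output map):

  N = 2
  C = 3
  F = 4
  Hin = 8
  Win = 8
  Hf = 3
  Wf = 3
  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))  # He init
  out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
               stride=[1,1], padding=[1,1])  # 'same' padding: shape (N, F*Hin*Win)
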
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml b/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
index f9cd507..55552e1 100644
--- a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
@@ -21,11 +21,8 @@
 
 /*
  * Cross-entropy loss function.
- *
- * L_i = -y_i^T * log(pred_i), where y_i and pred_i are K-dimensional
- *  vectors of class probs.
- * L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples.
  */
+
 forward = function(matrix[double] pred, matrix[double] y)
     return (double loss) {
   /*
@@ -33,16 +30,26 @@ forward = function(matrix[double] pred, matrix[double] y)
    * inputs consist of N examples, each with K dimensions corresponding
    * to normalized probabilities of K classes.
    *
+   *   ```
+   *   L_i = -y_i^T * log(pred_i)
+   *   L = (1/N) sum(L_i) for i=1 to N
+   *   ```
+   *
+   * In these equations, `L` is the total loss, `L_i` is the loss for
+   * example `i`, `y_i` is the K-dimensional vector of target class
+   * probabilities, `pred_i` is the K-dimensional vector of predicted
+   * class probabilities, and `N` is the number of examples.
+   *
    * This can be interpreted as the negative log-likelihood assuming
    * a Bernoulli distribution generalized to K dimensions, or a
-   * Multinomial with 1 observation.
+   * Multinomial with one observation.
    *
    * Inputs:
-   *  - pred: Prediction matrix, of shape (N, K).
-   *  - y: Target matrix, of shape (N, K).
+   *  - pred: Predictions, of shape (N, K).
+   *  - y: Targets, of shape (N, K).
    *
    * Outputs:
-   *  - loss: Scalar loss, of shape (1).
+   *  - loss: Average loss.
    */
   N = nrow(y)
   eps = 1e-10  # numerical stability to avoid log(0)
@@ -58,11 +65,11 @@ backward = function(matrix[double] pred, matrix[double] y)
    * to normalized probabilities of K classes.
    *
    * Inputs:
-   *  - pred: Prediction matrix, of shape (N, K).
-   *  - y: Target matrix, of shape (N, K).
+   *  - pred: Predictions, of shape (N, K).
+   *  - y: Targets, of shape (N, K).
    *
    * Outputs:
-   *  - dpred: Gradient wrt pred, of shape (N, K).
+   *  - dpred: Gradient wrt `pred`, of shape (N, K).
    */
   N = nrow(y)
   eps = 1e-10  # numerical stability to avoid divide-by-zero

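A small worked instance of the equations above (values illustrative):

  source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss

  pred = matrix("0.7 0.2 0.1  0.1 0.8 0.1", rows=2, cols=3)
  y    = matrix("1 0 0  0 1 0", rows=2, cols=3)  # one-hot targets
  loss = cross_entropy_loss::forward(pred, y)    # (1/2)*(-log(0.7) - log(0.8)) ~= 0.29
  dpred = cross_entropy_loss::backward(pred, y)  # shape (2, 3)
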
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/dropout.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/dropout.dml b/scripts/staging/SystemML-NN/nn/layers/dropout.dml
index 2b1bd1d..b348642 100644
--- a/scripts/staging/SystemML-NN/nn/layers/dropout.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/dropout.dml
@@ -22,6 +22,7 @@
 /*
  * Dropout layer.
  */
+
 forward = function(matrix[double] X, double p, int seed)
     return (matrix[double] out, matrix[double] mask) {
   /*
@@ -32,14 +33,13 @@ forward = function(matrix[double] X, double p, int seed)
    * the outputs of neurons) at test time.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (any, any).
+   *  - X: Inputs, of shape (any, any).
    *  - p: Probability of keeping a neuron output.
-   *  - seed: [Optional: -1] Random number generator seed.  Setting this
-   *      allows for deterministic evaluation.  Set to -1 for a random
-   *      seed.
+   *  - seed: [Optional: -1] Random number generator seed to allow for
+   *      deterministic evaluation.  Set to -1 for a random seed.
    *
    * Outputs:
-   *  - out: Ouptuts, of same shape as X.
+   *  - out: Outputs, of same shape as `X`.
    *  - mask: Dropout mask used to compute the output.
    */
   # Normally, we might use something like
@@ -48,8 +48,7 @@ forward = function(matrix[double] X, double p, int seed)
   # the `rand` function that allows us to create a mask directly.
   if (seed == -1) {
     mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p)
-  }
-  else {
+  } else {
     mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p, seed=seed)
   }
   out = X * mask / p
@@ -64,13 +63,13 @@ backward = function(matrix[double] dout, matrix[double] X, double p, matrix[doub
    * maintain the expected values at test time.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of same shape as X.
-   *  - X: Previous input data matrix, of shape (any, any).
-   *  - p: Previous probability of keeping a neuron output.
-   *  - mask: Previous dropout mask used to compute the output.
+   *  - dout: Gradient wrt `out`, of same shape as `X`.
+   *  - X: Inputs, of shape (any, any).
+   *  - p: Probability of keeping a neuron output.
+   *  - mask: Dropout mask used to compute the output.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of same shape as X.
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
    */
   dX = mask / p * dout
 }

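A short sketch of the inverted-dropout scaling described above; kept activations are scaled by `1/p` so expected values already match test time (sizes illustrative, seed `-1` requests a random mask):

  source("nn/layers/dropout.dml") as dropout

  X = rand(rows=4, cols=6, pdf="normal")
  p = 0.5  # probability of keeping a neuron output
  [out, mask] = dropout::forward(X, p, -1)   # kept cells scaled by 1/p
  dout = rand(rows=4, cols=6, pdf="normal")  # stand-in upstream gradient
  dX = dropout::backward(dout, X, p, mask)   # gradient only through kept cells
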
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
index 7d6c821..24b15e2 100644
--- a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
@@ -21,28 +21,35 @@
 
 /*
  * L1 loss function.
- *
- * L_i = sum_j(abs((pred_i)_j - (y_i)_j)) for all j.
- * L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples.
  */
+
 forward = function(matrix[double] pred, matrix[double] y)
     return (double loss) {
   /*
    * Computes the forward pass for an L1 loss function.  The inputs
    * consist of N examples, each with M dimensions to predict.
    *
+   *   ```
+   *   L_i = sum_j(abs((pred_i)_j - (y_i)_j))
+   *   L = (1/N) sum(L_i) for i=1 to N
+   *   ```
+   *
+   * In these equations, `L` is the total loss, `L_i` is the loss for
+   * example `i`, `y_i` is the vector of targets, `pred_i` is the
+   * vector of predictions, and `N` is the number of examples.
+   *
    * This can be interpreted as the negative log-likelihood assuming
    * a Laplace distribution.
    *
    * Inputs:
-   *  - pred: Prediction matrix, of shape (N, M).
-   *  - y: Target matrix, of shape (N, M).
+   *  - pred: Predictions, of shape (N, M).
+   *  - y: Targets, of shape (N, M).
    *
    * Outputs:
-   *  - loss: Scalar loss, of shape (1).
+   *  - loss: Average loss.
    */
   N = nrow(y)
-  losses = rowSums(abs(pred - y))
+  losses = rowSums(abs(pred-y))
   loss = sum(losses) / N
 }
 
@@ -53,13 +60,13 @@ backward = function(matrix[double] pred, matrix[double] y)
    * consist of N examples, each with M dimensions to predict.
    *
    * Inputs:
-   *  - pred: Prediction matrix, of shape (N, M).
-   *  - y: Target matrix, of shape (N, M).
+   *  - pred: Predictions, of shape (N, M).
+   *  - y: Targets, of shape (N, M).
    *
    * Outputs:
-   *  - dpred: Gradient wrt pred, of shape (N, M).
+   *  - dpred: Gradient wrt `pred`, of shape (N, M).
    */
   N = nrow(y)
-  dpred = sign(pred - y) / N
+  dpred = sign(pred-y) / N
 }
 

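A worked instance of these equations (values illustrative):

  source("nn/layers/l1_loss.dml") as l1_loss

  pred = matrix("1.0 2.0  3.0 4.0", rows=2, cols=2)
  y    = matrix("1.5 2.0  2.0 4.0", rows=2, cols=2)
  loss = l1_loss::forward(pred, y)    # (1/2) * ((0.5+0.0) + (1.0+0.0)) = 0.75
  dpred = l1_loss::backward(pred, y)  # sign(pred-y)/N, elementwise
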
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
index b2175ab..f643274 100644
--- a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
@@ -22,31 +22,34 @@
 /*
 * L1 regularization.
  */
-forward = function(matrix[double] X, double lambda) return (double reg_loss) {
+
+forward = function(matrix[double] X, double lambda)
+    return (double reg_loss) {
   /*
    * Computes the forward pass for an L1 regularization function.
    *
    * Inputs:
-   *  - X: Parameters, of shape (any, any).
+   *  - X: Inputs, of shape (any, any).
    *  - lambda: Regularization strength.
    *      A typical value is 0.01.
    *
    * Outputs:
-   *  - reg_loss: Scalar L1 regularization loss, of shape (1).
+   *  - reg_loss: Total regularization loss.
    */
   reg_loss = lambda * sum(abs(X))
 }
 
-backward = function(matrix[double] X, double lambda) return (matrix[double] dX) {
+backward = function(matrix[double] X, double lambda)
+    return (matrix[double] dX) {
   /*
    * Computes the backward pass for an L1 regularization function.
    *
    * Inputs:
-   *  - X: Parameters, of shape (any, any).
+   *  - X: Inputs, of shape (any, any).
    *  - lambda: Regularization strength.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of same shape as X.
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
    */
   dX = lambda * sign(X)
 }

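In use, the regularizer composes additively with a data loss; a hypothetical sketch with stand-in values:

  source("nn/layers/l1_reg.dml") as l1_reg

  W = rand(rows=8, cols=3, pdf="normal")  # some layer's weights
  lambda = 0.01
  data_loss = 1.2                         # stand-in value from a loss layer
  dW = matrix(0.1, rows=8, cols=3)        # stand-in data-loss gradient
  total_loss = data_loss + l1_reg::forward(W, lambda)
  dW = dW + l1_reg::backward(W, lambda)   # adds lambda * sign(W)
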
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
index 9f27cc2..df8bc1c 100644
--- a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
@@ -21,28 +21,35 @@
 
 /*
  * L2 loss function.
- *
- * L_i = (1/2) 2norm(pred_i - y_i)^2
- * L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples.
  */
+
 forward = function(matrix[double] pred, matrix[double] y)
     return (double loss) {
   /*
    * Computes the forward pass for an L2 loss function.  The inputs
    * consist of N examples, each with M dimensions to predict.
    *
+   *   ```
+   *   L_i = (1/2) norm(pred_i - y_i)^2
+   *   L = (1/N) sum(L_i) for i=1 to N
+   *   ```
+   *
+   * In these equations, `L` is the total loss, `L_i` is the loss for
+   * example `i`, `y_i` is the vector of targets, `pred_i` is the
+   * vector of predictions, and `N` is the number of examples.
+   *
    * This can be interpreted as the negative log-likelihood assuming
    * a Gaussian distribution.
    *
    * Inputs:
-   *  - pred: Prediction matrix, of shape (N, M).
-   *  - y: Target matrix, of shape (N, M).
+   *  - pred: Predictions, of shape (N, M).
+   *  - y: Targets, of shape (N, M).
    *
    * Outputs:
-   *  - loss: Scalar loss, of shape (1).
+   *  - loss: Average loss.
    */
   N = nrow(y)
-  losses = 0.5 * rowSums((pred - y)^2)
+  losses = 0.5 * rowSums((pred-y)^2)
   loss = sum(losses) / N
 }
 
@@ -53,13 +60,13 @@ backward = function(matrix[double] pred, matrix[double] y)
    * consist of N examples, each with M dimensions to predict.
    *
    * Inputs:
-   *  - pred: Prediction matrix, of shape (N, M).
-   *  - y: Target matrix, of shape (N, M).
+   *  - pred: Predictions, of shape (N, M).
+   *  - y: Targets, of shape (N, M).
    *
    * Outputs:
-   *  - dpred: Gradient wrt pred, of shape (N, M).
+   *  - dpred: Gradient wrt `pred`, of shape (N, M).
    */
   N = nrow(y)
-  dpred = (pred - y) / N
+  dpred = (pred-y) / N
 }
 

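Again, a worked instance (values illustrative):

  source("nn/layers/l2_loss.dml") as l2_loss

  pred = matrix("1.0 2.0", rows=2, cols=1)
  y    = matrix("0.0 2.0", rows=2, cols=1)
  loss = l2_loss::forward(pred, y)    # (1/2) * (0.5*1^2 + 0.5*0^2) = 0.25
  dpred = l2_loss::backward(pred, y)  # (pred-y)/N = [0.5; 0.0]
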
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
index 44f2a54..5074c06 100644
--- a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
@@ -22,31 +22,34 @@
 /*
 * L2 regularization.
  */
-forward = function(matrix[double] X, double lambda) return (double reg_loss) {
+
+forward = function(matrix[double] X, double lambda)
+    return (double reg_loss) {
   /*
    * Computes the forward pass for an L2 regularization function.
    *
    * Inputs:
-   *  - X: Parameters, of shape (any, any).
+   *  - X: Inputs, of shape (any, any).
    *  - lambda: Regularization strength.
    *      A typical value is 0.01.
    *
    * Outputs:
-   *  - reg_loss: Scalar l2 regularization loss, of shape (1).
+   *  - reg_loss: Total regularization loss.
    */
   reg_loss = 0.5 * lambda * sum(X^2)
 }
 
-backward = function(matrix[double] X, double lambda) return (matrix[double] dX) {
+backward = function(matrix[double] X, double lambda)
+    return (matrix[double] dX) {
   /*
    * Computes the backward pass for an L2 regularization function.
    *
    * Inputs:
-   *  - X: Parameters, of shape (any, any).
+   *  - X: Inputs, of shape (any, any).
    *  - lambda: Regularization strength.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of same shape as X.
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
    */
   dX = lambda * X
 }

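As with L1 regularization, the penalty adds to both the objective and the gradient; a hypothetical sketch:

  source("nn/layers/l2_reg.dml") as l2_reg

  W = rand(rows=8, cols=3, pdf="normal")
  lambda = 0.01
  reg_loss = l2_reg::forward(W, lambda)  # 0.5 * lambda * sum(W^2)
  dW_reg = l2_reg::backward(W, lambda)   # lambda * W, folded into the data-loss gradient
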
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml b/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
index ad5e561..7dd85d3 100644
--- a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
@@ -21,30 +21,37 @@
 
 /*
  * Log loss function.
- *
- * L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i), where y_i is a
- *  binary target, and pred_i is a probability of y=1.
- * L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples.
  */
+
 forward = function(matrix[double] pred, matrix[double] y)
     return (double loss) {
   /*
    * Computes the forward pass for a log loss function.
    *
+   *   ```
+   *   L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i)
+   *   L = (1/N) sum(L_i) for i=1 to N
+   *   ```
+   *
+   * In these equations, `L` is the total loss, `L_i` is the loss for
+   * example `i`, `y_i` is the binary target, `pred_i` is the probability
+   * of the true class (i.e. `y=1`), and `N` is the number of examples.
+   *
    * This can be interpreted as the negative log-likelihood assuming
    * a Bernoulli distribution.
    *
    * Inputs:
-   *  - pred: Prediction matrix, of shape (N, 1).  Predictions should
-   *      be probabilities that y=1.
-   *  - y: Target matrix, of shape (N, 1).  Targets should be binary
-   *      in the set {0,1}.
+   *  - pred: Predictions, of shape (N, 1).
+   *      Predictions should be probabilities of the true
+   *      class (i.e. probability of `y=1`).
+   *  - y: Targets, of shape (N, 1).
+   *      Targets should be binary in the set {0, 1}.
    *
    * Outputs:
-   *  - loss: Scalar loss, of shape (1).
+   *  - loss: Average loss.
    */
   N = nrow(y)
-  losses = -y * log(pred) - (1-y) * log(1-pred)
+  losses = -y*log(pred) - (1-y)*log(1-pred)
   loss = sum(losses) / N
 }
 
@@ -54,15 +61,16 @@ backward = function(matrix[double] pred, matrix[double] y)
    * Computes the backward pass for a log loss function.
    *
    * Inputs:
-   *  - pred: Prediction matrix, of shape (N, 1).  Predictions should
-   *      be probabilities that y=1.
-   *  - y: Target matrix, of shape (N, 1).  Targets should be binary
-   *      in the set {0,1}.
+   *  - pred: Predictions, of shape (N, 1).
+   *      Predictions should be probabilities of the true
+   *      class (i.e. probability of `y=1`).
+   *  - y: Targets, of shape (N, 1).
+   *      Targets should be binary in the set {0, 1}.
    *
    * Outputs:
-   *  - dpred: Gradient wrt pred, of shape (N, 1).
+   *  - dpred: Gradient wrt `pred`, of shape (N, 1).
    */
   N = nrow(y)
-  dpred = (1/N) * (pred-y) / (pred * (1-pred))
+  dpred = (1/N) * (pred-y) / (pred*(1-pred))
 }
 

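A worked instance of these equations (values illustrative):

  source("nn/layers/log_loss.dml") as log_loss

  pred = matrix("0.9 0.2", rows=2, cols=1)  # P(y=1) for each example
  y    = matrix("1 0",     rows=2, cols=1)  # binary targets
  loss = log_loss::forward(pred, y)    # (1/2)*(-log(0.9) - log(0.8)) ~= 0.164
  dpred = log_loss::backward(pred, y)  # shape (2, 1)
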
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/lstm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/lstm.dml b/scripts/staging/SystemML-NN/nn/layers/lstm.dml
index 0dd9f4c..44f2ef2 100644
--- a/scripts/staging/SystemML-NN/nn/layers/lstm.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/lstm.dml
@@ -44,16 +44,16 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T,
    *    - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, T*D).
-   *  - W: Weights (parameters) matrix, of shape (D+M, 4M).
-   *  - b: Biases vector, of shape (1, 4M).
+   *  - X: Inputs, of shape (N, T*D).
+   *  - W: Weights, of shape (D+M, 4M).
+   *  - b: Biases, of shape (1, 4M).
    *  - T: Length of example sequences (number of timesteps).
-   *  - D: Dimensionality of the input features.
+   *  - D: Dimensionality of the input features (number of features).
    *  - return_sequences: Whether to return `out` at all timesteps,
    *      or just for the final timestep.
-   *  - out0: Output matrix at previous timestep, of shape (N, M).
+   *  - out0: Outputs from previous timestep, of shape (N, M).
    *      Note: This is *optional* and could just be an empty matrix.
-   *  - c0: Initial cell state matrix, of shape (N, M).
+   *  - c0: Initial cell state, of shape (N, M).
    *      Note: This is *optional* and could just be an empty matrix.
    *
    * Outputs:
@@ -123,23 +123,27 @@ backward = function(matrix[double] dout, matrix[double] dc,
    * Computes the backward pass for an LSTM layer with M neurons.
    *
    * Inputs:
-   *  - dout: Gradient on output from upstream.  If `given_sequences`
-   *      is True, contains gradients on outputs for all timesteps,
-   *      of shape (N, T*M).  Else, contains gradient on output for
-   *      the final timestep, of shape (N, M).
-   *  - dc: Gradient on final (current) cell state from later in time,
-   *      of shape (N, M).
-   *  - X: Input data matrix, of shape (N, T*D).
-   *  - W: Weights (parameters) matrix, of shape (D+M, 4M).
-   *  - b: Biases vector, of shape (1, 4M).
+   *  - dout: Gradient wrt `out`.  If `given_sequences` is `True`,
+   *      contains gradients on outputs for all timesteps, of
+   *      shape (N, T*M). Else, contains the gradient on the output
+   *      for the final timestep, of shape (N, M).
+   *  - dc: Gradient wrt `c` (from later in time), of shape (N, M).
+   *      This would come from later in time if the cell state was used
+   *      downstream as the initial cell state for another LSTM layer.
+   *      Typically, this would be used when a sequence was cut at
+   *      timestep `T` and then continued in the next batch.  If `c`
+   *      was not used downstream, then `dc` would be an empty matrix.
+   *  - X: Inputs, of shape (N, T*D).
+   *  - W: Weights, of shape (D+M, 4M).
+   *  - b: Biases, of shape (1, 4M).
    *  - T: Length of example sequences (number of timesteps).
    *  - D: Dimensionality of the input features.
    *  - given_sequences: Whether `dout` is for all timesteps,
    *      or just for the final timestep.  This is based on whether
    *      `return_sequences` was true in the forward pass.
-   *  - out0: Output matrix at previous timestep, of shape (N, M).
+   *  - out0: Outputs from previous timestep, of shape (N, M).
    *      Note: This is *optional* and could just be an empty matrix.
-   *  - c0: Initial cell state matrix, of shape (N, M).
+   *  - c0: Initial cell state, of shape (N, M).
    *      Note: This is *optional* and could just be an empty matrix.
    *  - cache_out: Cache of outputs, of shape (T, N*M).
    *      Note: This is used for performance during training.
@@ -149,11 +153,11 @@ backward = function(matrix[double] dout, matrix[double] dc,
    *      Note: This is used for performance during training.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, T*D).
-   *  - dW: Gradient wrt W, of shape (D+M, 4M).
-   *  - db: Gradient wrt b, of shape (1, 4M).
-   *  - dout0: Gradient wrt out0, of shape (N, M).
-   *  - dc0: Gradient wrt c0, of shape (N, M).
+   *  - dX: Gradient wrt `X`, of shape (N, T*D).
+   *  - dW: Gradient wrt `W`, of shape (D+M, 4M).
+   *  - db: Gradient wrt `b`, of shape (1, 4M).
+   *  - dout0: Gradient wrt `out0`, of shape (N, M).
+   *  - dc0: Gradient wrt `c0`, of shape (N, M).
    */
   N = nrow(X)
   M = as.integer(ncol(W)/4)
@@ -190,7 +194,7 @@ backward = function(matrix[double] dout, matrix[double] dc,
     g = ifog[,3*M+1:4*M]  # g gate, shape (N, M)
 
     tmp = tanh::backward(dout_t, ct)
-    dct = dct + o * tmp  # shape (N, M)
+    dct = dct + o*tmp  # shape (N, M)
     tmp = tanh::forward(ct)
     do = tmp * dout_t  # output gate, shape (N, M)
     df = c_prev * dct  # forget gate, shape (N, M)
@@ -201,7 +205,7 @@ backward = function(matrix[double] dout, matrix[double] dc,
     di_raw = i * (1-i) * di
     df_raw = f * (1-f) * df
     do_raw = o * (1-o) * do
-    dg_raw = (1 - g^2) * dg
+    dg_raw = (1-g^2) * dg
     difog_raw = cbind(di_raw, cbind(df_raw, cbind(do_raw, dg_raw)))  # shape (N, 4M)
 
     dW = dW + t(input) %*% difog_raw  # shape (D+M, 4M)
@@ -217,7 +221,7 @@ backward = function(matrix[double] dout, matrix[double] dc,
       dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev  # shape (N, M)
       dct = dc_prev  # shape (N, M)
     }
-    t = t-1
+    t = t - 1
   }
 }
 
@@ -232,17 +236,18 @@ init = function(int N, int D, int M)
    * We use the Glorot uniform heuristic which limits the magnification
    * of inputs/gradients during forward/backward passes by scaling
    * uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
+   *  - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
    *
    * Inputs:
    *  - N: Number of examples in batch.
-   *  - D: Dimensionality of the input features.
+   *  - D: Dimensionality of the input features (number of features).
    *  - M: Number of neurons in this layer.
    *
    * Outputs:
-   *  - W: Weights (parameters) matrix, of shape (D+M, 4M).
-   *  - b: Biases vector, of shape (1, 4M).
-   *  - out0: Dummy output matrix at previous timestep, of shape (N, M).
-   *  - c0: Initial empty cell state matrix, of shape (N, M).
+   *  - W: Weights, of shape (D+M, 4M).
+   *  - b: Biases, of shape (1, 4M).
+   *  - out0: Empty previous timestep output matrix, of shape (N, M).
+   *  - c0: Empty initial cell state matrix, of shape (N, M).
    */
   fan_in = D+M
   fan_out = 4*M

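A sketch of the Glorot uniform rule this function applies, written out directly with illustrative sizes (the `rand` parameters used here appear elsewhere in the library):

  D = 4  # illustrative number of features
  M = 3  # illustrative number of neurons
  fan_in = D + M
  fan_out = 4*M
  scale = sqrt(6.0/(fan_in+fan_out))
  W = rand(rows=D+M, cols=4*M, min=-scale, max=scale, pdf="uniform")
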
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
index 22e1747..a12877f 100644
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
@@ -38,7 +38,7 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
    * the output maps.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
@@ -57,8 +57,8 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
    *  - Wout: Output width.
    */
   N = nrow(X)
-  Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1)
-  Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
   pad_value = -1/0  # in max pooling we pad with -infinity
 
   # Create output volume
@@ -96,7 +96,8 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
    * unrolled into a single vector.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of shape (N, C*Hout*Wout).
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, C*Hout*Wout).
    *  - Hout: Output height.
    *  - Wout: Output width.
    *  - X: Input data matrix, of shape (N, C*Hin*Win).
@@ -113,7 +114,7 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
    *      A typical value is 0.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, C*Hin*Win).
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
    */
   N = nrow(X)
   pad_value = -1/0  # in max pooling we pad with -infinity
@@ -134,9 +135,9 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
       img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
       dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
       for (hout in 1:Hout, check=0) {  # all output rows
-        hin = (hout-1) * strideh + 1
+        hin = (hout-1)*strideh + 1
         for (wout in 1:Wout) {  # all output columns
-          win = (wout-1) * stridew + 1
+          win = (wout-1)*stridew + 1
           img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
           max_val_ind = img_slice_patch == max(img_slice_patch)  # max value indicator matrix
           # gradient passes through only for the max value(s) in this patch

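A usage sketch with the shape arithmetic worked out: with `Hin = 8`, `Hf = 2`, `strideh = 2`, and no padding, `Hout = (8 + 2*0 - 2)/2 + 1 = 4` (sizes illustrative):

  source("nn/layers/max_pool.dml") as max_pool

  N = 2
  C = 3
  Hin = 8
  Win = 8
  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
  # 2x2 pooling, stride 2, no padding
  [out, Hout, Wout] = max_pool::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)  # Hout = Wout = 4
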
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
index ae2b4a1..f1cb863 100644
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
@@ -22,6 +22,7 @@
 /*
  * Max pooling layer.
  */
+
 forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
                    int strideh, int stridew, int padh, int padw)
     return (matrix[double] out, int Hout, int Wout) {
@@ -36,7 +37,7 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
    * the output maps.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
@@ -55,8 +56,8 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
    *  - Wout: Output width.
    */
   N = nrow(X)
-  Hout = as.integer((Hin - Hf) / strideh + 1)
-  Wout = as.integer((Win - Wf) / stridew + 1)
+  Hout = as.integer((Hin-Hf)/strideh + 1)
+  Wout = as.integer((Win-Wf)/stridew + 1)
 
   # Max pooling - built-in implementation
   out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
@@ -73,10 +74,11 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
    * unrolled into a single vector.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of shape (N, C*Hout*Wout).
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, C*Hout*Wout).
    *  - Hout: Output height.
    *  - Wout: Output width.
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
    *  - C: Number of input channels (dimensionality of input depth).
    *  - Hin: Input height.
    *  - Win: Input width.
@@ -90,7 +92,7 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
    *      A typical value is 0.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, C*Hin*Win).
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
    */
   N = nrow(X)
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/relu.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/relu.dml b/scripts/staging/SystemML-NN/nn/layers/relu.dml
index a5c5230..6a4c15c 100644
--- a/scripts/staging/SystemML-NN/nn/layers/relu.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/relu.dml
@@ -22,33 +22,37 @@
 /*
  * Rectified Linear Unit (ReLU) nonlinearity layer.
  */
-forward = function(matrix[double] X) return (matrix[double] out) {
+
+forward = function(matrix[double] X)
+    return (matrix[double] out) {
   /*
    * Computes the forward pass for a ReLU nonlinearity layer.
    *
-   * Performs an element-wise evaluation of f(input) = max(0, input).
+   * Performs an element-wise evaluation of `f(input) = max(0, input)`.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (any, any).
+   *  - X: Inputs, of shape (any, any).
    *
    * Outputs:
-   *  - out: Ouptuts, of same shape as X.
+   *  - out: Outputs, of same shape as `X`.
    */
-  out = max(0.0, X)
+  out = max(X, 0)
 }
 
-backward = function(matrix[double] dout, matrix[double] X) return (matrix[double] dX) {
+backward = function(matrix[double] dout, matrix[double] X)
+    return (matrix[double] dX) {
   /*
    * Computes the backward pass for a ReLU nonlinearity layer.
    *
-   * Essentially performs a pass-through of the upstream gradient for cells > 0.
+   * Essentially performs a pass-through of the upstream gradient
+   * for cells > 0.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of same shape as X.
+   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
    *  - X: Previous input data matrix, of shape (any, any).
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of same shape as X.
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
    */
    dX = (X > 0) * dout
 }

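A tiny worked example of the pass-through behavior (values illustrative):

  source("nn/layers/relu.dml") as relu

  X = matrix("-1.0 2.0  0.0 3.0", rows=2, cols=2)
  out = relu::forward(X)            # [[0, 2], [0, 3]]
  dout = matrix(1, rows=2, cols=2)  # stand-in upstream gradient of ones
  dX = relu::backward(dout, X)      # [[0, 1], [0, 1]]: passes through only where X > 0
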
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/rnn.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/rnn.dml b/scripts/staging/SystemML-NN/nn/layers/rnn.dml
index cd3eefe..cdceab8 100644
--- a/scripts/staging/SystemML-NN/nn/layers/rnn.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/rnn.dml
@@ -35,14 +35,14 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T,
    * in as an additional input at the current timestep.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, T*D).
-   *  - W: Weights (parameters) matrix, of shape (D+M, M).
-   *  - b: Biases vector, of shape (1, M).
+   *  - X: Inputs, of shape (N, T*D).
+   *  - W: Weights, of shape (D+M, M).
+   *  - b: Biases, of shape (1, M).
    *  - T: Length of example sequences (number of timesteps).
-   *  - D: Dimensionality of the input features.
+   *  - D: Dimensionality of the input features (number of features).
    *  - return_sequences: Whether to return `out` at all timesteps,
    *      or just for the final timestep.
-   *  - out0: Output matrix at previous timestep, of shape (N, M).
+   *  - out0: Output matrix from previous timestep, of shape (N, M).
    *      Note: This is *optional* and could just be an empty matrix.
    *
    * Outputs:
@@ -88,28 +88,28 @@ backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, mat
    * Computes the backward pass for a simple RNN layer with M neurons.
    *
    * Inputs:
-   *  - dout: Gradient on output from upstream.  If `given_sequences`
+   *  - dout: Gradient wrt `out` from upstream.  If `given_sequences`
    *      is True, contains gradients on outputs for all timesteps,
    *      of shape (N, T*M).  Else, contains gradient on output for
    *      the final timestep, of shape (N, M).
-   *  - X: Input data matrix, of shape (N, T*D).
-   *  - W: Weights (parameters) matrix, of shape (D+M, M).
-   *  - b: Biases vector, of shape (1, M).
+   *  - X: Inputs, of shape (N, T*D).
+   *  - W: Weights, of shape (D+M, M).
+   *  - b: Biases, of shape (1, M).
    *  - T: Length of example sequences (number of timesteps).
-   *  - D: Dimensionality of the input features.
+   *  - D: Dimensionality of the input features (number of features).
    *  - given_sequences: Whether `dout` is for all timesteps,
    *      or just for the final timestep.  This is based on whether
    *      `return_sequences` was true in the forward pass.
-   *  - out0: Output matrix at previous timestep, of shape (N, M).
+   *  - out0: Output matrix from previous timestep, of shape (N, M).
    *      Note: This is *optional* and could just be an empty matrix.
    *  - cache_out: Cache of outputs, of shape (T, N*M).
    *      Note: This is used for performance during training.
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, T*D).
-   *  - dW: Gradient wrt W, of shape (D+M, 4M).
-   *  - db: Gradient wrt b, of shape (1, 4M).
-   *  - dout0: Gradient wrt out0, of shape (N, M).
+   *  - dX: Gradient wrt `X`, of shape (N, T*D).
+   *  - dW: Gradient wrt `W`, of shape (D+M, M).
+   *  - db: Gradient wrt `b`, of shape (1, M).
+   *  - dout0: Gradient wrt `out0`, of shape (N, M).
    */
   N = nrow(X)
   M = ncol(W)
@@ -134,7 +134,7 @@ backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, mat
       out_prev = matrix(cache_out[t-1,], rows=N, cols=M)  # shape (N, M)
     }
     input = cbind(X_t, out_prev)  # shape (N, D+M)
-    dout_t_raw = (1 - out_t^2) * dout_t  # into tanh, shape (N, M)
+    dout_t_raw = (1-out_t^2) * dout_t  # into tanh, shape (N, M)
     dW = dW + t(input) %*% dout_t_raw  # shape (D+M, M)
     db = db + colSums(dout_t_raw)  # shape (1, M)
     dinput = dout_t_raw %*% t(W)  # shape (N, D+M)
@@ -146,7 +146,7 @@ backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, mat
     else {
       dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev  # shape (N, M)
     }
-    t = t-1
+    t = t - 1
   }
 }
 
@@ -161,16 +161,17 @@ init = function(int N, int D, int M)
    * We use the Glorot uniform heuristic which limits the magnification
    * of inputs/gradients during forward/backward passes by scaling
    * uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
+   *  - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
    *
    * Inputs:
    *  - N: Number of examples in batch.
-   *  - D: Dimensionality of the input features.
+   *  - D: Dimensionality of the input features (number of features).
    *  - M: Number of neurons in this layer.
    *
    * Outputs:
-   *  - W: Weights (parameters) matrix, of shape (D+M, M).
-   *  - b: Biases vector, of shape (1, M).
-   *  - out0: Dummy output matrix at previous timestep, of shape (N, M).
+   *  - W: Weights, of shape (D+M, M).
+   *  - b: Biases, of shape (1, M).
+   *  - out0: Empty previous timestep output matrix, of shape (N, M).
    */
   fan_in = D+M
   fan_out = M

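The Glorot uniform heuristic used by `init` is only a few lines; a standalone sketch of the computation (the values of N, D, and M are hypothetical, and this is not a call into `rnn.dml` itself):

  N = 32  # number of examples in batch
  D = 10  # number of input features
  M = 64  # number of neurons
  fan_in = D + M
  fan_out = M
  scale = sqrt(6 / (fan_in+fan_out))
  W = rand(rows=D+M, cols=M, min=-scale, max=scale)  # Glorot uniform weights
  b = matrix(0, rows=1, cols=M)     # biases start at zero
  out0 = matrix(0, rows=N, cols=M)  # empty previous-timestep output
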
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml b/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
index a7066f2..185befb 100644
--- a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
@@ -22,33 +22,41 @@
 /*
  * Sigmoid nonlinearity layer.
  */
-forward = function(matrix[double] X) return (matrix[double] out) {
+
+forward = function(matrix[double] X)
+    return (matrix[double] out) {
   /*
    * Computes the forward pass for a sigmoid nonlinearity layer.
    *
-   * sigmoid(x) = 1 / (1 + e^-x)
+   *   `sigmoid(x) = 1 / (1 + e^-x)`
+   *
+   * If `X` contains a single feature column, the output of a sigmoid
+   * layer can be interpreted as the predicted probability of the
+   * positive class when paired with a log loss function in a binary
+   * classification problem.
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (any, any).
+   *  - X: Inputs, of shape (any, any).
    *
    * Outputs:
-   *  - out: Ouptuts, of same shape as X.
+   *  - out: Outputs, of same shape as `X`.
    */
-  out = 1 / (1 + exp(-X))
+  out = 1 / (1+exp(-X))
 }
 
-backward = function(matrix[double] dout, matrix[double] X) return (matrix[double] dX) {
+backward = function(matrix[double] dout, matrix[double] X)
+    return (matrix[double] dX) {
   /*
    * Computes the backward pass for a sigmoid nonlinearity layer.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of same shape as X.
-   *  - X: Previous input data matrix, of shape (any, any).
+   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+   *  - X: Inputs, of shape (any, any).
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of same shape as X.
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
    */
-  out = 1 / (1 + exp(-X))
-  dX = out * (1 - out) * dout
+  out = 1 / (1+exp(-X))
+  dX = out * (1-out) * dout
 }
 

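A minimal usage sketch of the layer (assumes the script is run from the SystemML-NN base directory; the random inputs are placeholders):

  source("nn/layers/sigmoid.dml") as sigmoid

  X = rand(rows=4, cols=3, min=-2, max=2)
  dout = rand(rows=4, cols=3)      # stand-in upstream gradient
  out = sigmoid::forward(X)        # values squashed into (0, 1)
  dX = sigmoid::backward(dout, X)  # equals out * (1-out) * dout
  print("min: " + min(out) + ", max: " + max(out))  # both within (0, 1)
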
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/softmax.dml b/scripts/staging/SystemML-NN/nn/layers/softmax.dml
index 854e8a8..1751838 100644
--- a/scripts/staging/SystemML-NN/nn/layers/softmax.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/softmax.dml
@@ -22,7 +22,9 @@
 /*
  * Softmax classifier layer.
  */
-forward = function(matrix[double] scores) return (matrix[double] probs) {
+
+forward = function(matrix[double] scores)
+    return (matrix[double] probs) {
   /*
    * Computes the forward pass for a softmax classifier.  The inputs
    * are interpreted as unnormalized, log-probabilities for each of
@@ -32,10 +34,10 @@ forward = function(matrix[double] scores) return (matrix[double] probs) {
    * This can be interpreted as a generalization of the sigmoid
    * function to multiple classes.
    *
-   * probs_ij = e^scores_ij / sum(e^scores_i)
+   *   `probs_ij = e^scores_ij / sum(e^scores_i)`
    *
    * Inputs:
-   *  - scores: Input data matrix, of shape (N, D).
+   *  - scores: Inputs, of shape (N, D).
    *
    * Outputs:
    *  - probs: Outputs, of shape (N, D).
@@ -56,20 +58,23 @@ backward = function(matrix[double] dprobs, matrix[double] scores)
   /*
    * Computes the backward pass for a softmax classifier.
    *
-   * Note that dscores_ij has multiple sources:
+   * Note that dscores_ij has multiple source branches:
    *
-   * dprobs_ij/dscores_ij = probs_ij * (1 - probs_ij)
-   * dprobs_ik/dscores_ij = -probs_ik * probs_ij, for all k != j
+   *   ```
+   *   dprobs_ij/dscores_ij = probs_ij * (1 - probs_ij)
+   *   dprobs_ik/dscores_ij = -probs_ik * probs_ij, for all k != j
    *
-   * dloss/dscores_ij = dloss/dprobs_ij * dprobs_ij/dscores_ij +
-   *                    sum_{k!=j}(dloss/dprobs_ik * dprobs_ik/dscores_ij)
+   *   dloss/dscores_ij =
+   *      (dloss/dprobs_ij * dprobs_ij/dscores_ij)
+   *      + sum_{k!=j}(dloss/dprobs_ik * dprobs_ik/dscores_ij)
+   *   ```
    *
    * Inputs:
-   *  - dprobs: Derivatives from upstream, of shape (N, D).
-   *  - scores: Previous input data matrix, of shape (N, D).
+   *  - dprobs: Gradient wrt `probs` from upstream, of shape (N, D).
+   *  - scores: Inputs, of shape (N, D).
    *
    * Outputs:
-   *  - dscores: Gradient wrt scores, of shape (N, D).
+   *  - dscores: Gradient wrt `scores`, of shape (N, D).
    */
   scores = scores - rowMaxs(scores)  # numerical stability
   unnorm_probs = exp(scores)  # unnormalized probabilities
@@ -77,6 +82,6 @@ backward = function(matrix[double] dprobs, matrix[double] scores)
   # After some cancellation:
   # dscores = dprobs*probs - probs*rowSums(dprobs*probs)
   dtemp = dprobs * probs
-  dscores = dtemp - probs * rowSums(dtemp)
+  dscores = dtemp - probs*rowSums(dtemp)
 }
 

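A useful consequence of the cancellation above is that each row of `dscores` sums to zero: `rowSums(dtemp - probs*rowSums(dtemp))` equals `rowSums(dtemp)*(1 - rowSums(probs))`, and each row of `probs` sums to 1. A sketch that checks both invariants (assumes the script is run from the SystemML-NN base directory):

  source("nn/layers/softmax.dml") as softmax

  scores = rand(rows=4, cols=3, min=-1, max=1)
  dprobs = rand(rows=4, cols=3)  # stand-in upstream gradient
  probs = softmax::forward(scores)
  dscores = softmax::backward(dprobs, scores)
  print(toString(rowSums(probs)))    # each row should be 1
  print(toString(rowSums(dscores)))  # each row should be ~0
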
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml b/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
index 53ca989..0185a2c 100644
--- a/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
@@ -39,7 +39,7 @@ forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
    * introduces learnable parameters (gamma, beta) to control the
    * amount of normalization.
    *
-   *    y = ((x-mean) / sqrt(var+eps)) * gamma + beta
+   *   `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
    *
    * This implementation maintains exponential moving averages of the
    * mean and variance during training for use during testing.
@@ -50,7 +50,7 @@ forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
    *    - https://arxiv.org/abs/1502.03167
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
    *  - gamma: Scale parameters, of shape (C, 1).
    *  - beta: Shift parameters, of shape (C, 1).
    *  - C: Number of input channels (dimensionality of input depth).
@@ -134,7 +134,7 @@ backward = function(matrix[double] dout, matrix[double] out,
    * Computes the backward pass for a spatial batch normalization layer.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of shape (N, C*Hin*Win).
+   *  - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
    *  - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
    *  - ema_mean_upd: Updated exponential moving average of the mean
    *      from the forward pass, of shape (C, 1).
@@ -171,9 +171,9 @@ backward = function(matrix[double] dout, matrix[double] out,
    *      Typical values are in the range of [1e-5, 1e-3].
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of shape (N, C*Hin*Win).
-   *  - dgamma: Gradient wrt W, of shape (C, 1).
-   *  - dbeta: Gradient wrt b, of shape (C, 1).
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dgamma: Gradient wrt `gamma`, of shape (C, 1).
+   *  - dbeta: Gradient wrt `beta`, of shape (C, 1).
    *
    */
   N = nrow(X)

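The core normalization equation can be sketched on a plain matrix, independent of this layer's full API (the column-wise statistics below are purely illustrative; the layer itself normalizes per channel over the (N, Hin, Win) values):

  X = rand(rows=8, cols=4)
  gamma = matrix(1, rows=1, cols=4)  # scale (identity here)
  beta = matrix(0, rows=1, cols=4)   # shift (none here)
  eps = 1e-5
  mu = colMeans(X)                   # per-column means, shape (1, 4)
  sigma2 = colMeans(X^2) - mu^2      # per-column (biased) variances
  Y = ((X-mu) / sqrt(sigma2+eps)) * gamma + beta  # broadcast over rows
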
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/tanh.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/tanh.dml b/scripts/staging/SystemML-NN/nn/layers/tanh.dml
index 9308a7c..589a574 100644
--- a/scripts/staging/SystemML-NN/nn/layers/tanh.dml
+++ b/scripts/staging/SystemML-NN/nn/layers/tanh.dml
@@ -24,38 +24,42 @@
  */
 source("nn/layers/sigmoid.dml") as sigmoid
 
-forward = function(matrix[double] X) return (matrix[double] out) {
+forward = function(matrix[double] X)
+    return (matrix[double] out) {
   /*
    * Computes the forward pass for a tanh nonlinearity layer.
    *
-   * tanh(x) = (e^x - e^-x) / (e^x + e^-x)
-   *         = 2 * sigmoid(2x) - 1
+   *   ```
+   *   tanh(x) = (e^x - e^-x) / (e^x + e^-x)
+   *           = 2 * sigmoid(2x) - 1
+   *   ```
    *
    * Inputs:
-   *  - X: Input data matrix, of shape (any, any).
+   *  - X: Inputs, of shape (any, any).
    *
    * Outputs:
-   *  - out: Ouptuts, of same shape as X.
+   *  - out: Outputs, of same shape as `X`.
    */
   # out = (exp(X) - exp(-X)) / (exp(X) + exp(-X))
   # Simplification of the above formulation to use the sigmoid function:
   sigma2X = sigmoid::forward(2*X)
-  out = 2 * sigma2X - 1
+  out = 2*sigma2X - 1
 }
 
-backward = function(matrix[double] dout, matrix[double] X) return (matrix[double] dX) {
+backward = function(matrix[double] dout, matrix[double] X)
+    return (matrix[double] dX) {
   /*
    * Computes the backward pass for a tanh nonlinearity layer.
    *
    * Inputs:
-   *  - dout: Derivatives from upstream, of same shape as X.
-   *  - X: Previous input data matrix, of shape (any, any).
+   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+   *  - X: Inputs, of shape (any, any).
    *
    * Outputs:
-   *  - dX: Gradient wrt X, of same shape as X.
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
    */
   sigma2X = sigmoid::forward(2*X)
-  out = 2 * sigma2X - 1
-  dX = (1 - out^2) * dout
+  out = 2*sigma2X - 1
+  dX = (1-out^2) * dout
 }
 

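A minimal usage sketch of the sigmoid-based formulation (the namespace alias is chosen to avoid shadowing the built-in `tanh` function; assumes the script is run from the SystemML-NN base directory):

  source("nn/layers/tanh.dml") as tanh_layer

  X = rand(rows=3, cols=3, min=-2, max=2)
  dout = matrix(1, rows=3, cols=3)    # stand-in upstream gradient of ones
  out = tanh_layer::forward(X)        # equals 2*sigmoid(2X) - 1
  dX = tanh_layer::backward(dout, X)  # equals (1 - out^2), since dout is ones
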
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml b/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
index 688109b..20b26c4 100644
--- a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
@@ -22,6 +22,7 @@
 /*
  * Adagrad optimizer.
  */
+
 update = function(matrix[double] X, matrix[double] dX, double lr, double epsilon,
                   matrix[double] cache)
     return (matrix[double] X, matrix[double] cache) {
@@ -39,24 +40,25 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double epsilon
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient of X wrt to a loss function being optimized, of
-   *      same shape as X.
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
    *  - lr: Learning rate.
    *  - epsilon: Smoothing term to avoid divide by zero errors.
    *      Typical values are in the range of [1e-8, 1e-4].
    *  - cache: State that maintains per-parameter sum of squared
-   *      gradients, of same shape as X.
+   *      gradients, of same shape as `X`.
    *
    * Outputs:
-   *  - X: Updated parameters X, of same shape as input X.
-   *  - v: Updated velocity of the parameters X, of same shape as
-   *      input v.
+   *  - X: Updated parameters `X`, of same shape as input `X`.
+   *  - cache: Updated state that maintains per-parameter sum of
+   *      squared gradients, of same shape as `X`.
    */
   cache = cache + dX^2
-  X = X - lr * dX / (sqrt(cache) + epsilon)
+  X = X - (lr * dX / (sqrt(cache)+epsilon))
 }
 
-init = function(matrix[double] X) return (matrix[double] cache) {
+init = function(matrix[double] X)
+    return (matrix[double] cache) {
   /*
    * Initialize the state for this optimizer.
    *
@@ -65,10 +67,10 @@ init = function(matrix[double] X) return (matrix[double] cache) {
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   * 
+   *
    * Outputs:
    *  - cache: State that maintains per-parameter sum of squared
-   *      gradients, of same shape as X.
+   *      gradients, of same shape as `X`.
    */
   cache = matrix(0, rows=nrow(X), cols=ncol(X))
 }

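Usage follows the same init/update pattern as the other optimizers in `nn/optim`; a minimal sketch with a placeholder gradient (in practice `dX` would come from a backward pass):

  source("nn/optim/adagrad.dml") as adagrad

  X = rand(rows=5, cols=5)
  dX = rand(rows=5, cols=5, min=-1, max=1)  # stand-in gradient
  cache = adagrad::init(X)
  [X, cache] = adagrad::update(X, dX, 0.01, 1e-8, cache)
  # cache only grows, so the effective step size monotonically shrinks
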
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/adam.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/adam.dml b/scripts/staging/SystemML-NN/nn/optim/adam.dml
index a25f74d..0607fa5 100644
--- a/scripts/staging/SystemML-NN/nn/optim/adam.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/adam.dml
@@ -22,6 +22,7 @@
 /*
  * Adam optimizer.
  */
+
 update = function(matrix[double] X, matrix[double] dX, double lr, double beta1, double beta2,
                   double epsilon, int t, matrix[double] m, matrix[double] v)
     return (matrix[double] X, matrix[double] m, matrix[double] v) {
@@ -34,8 +35,8 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double beta1,
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient of X wrt to a loss function being optimized, of
-   *      same shape as X.
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
    *  - lr: Learning rate.  Recommended value is 0.001.
    *  - beta1: Exponential decay rate for the 1st moment estimates.
    *      Recommended value is 0.9.
@@ -46,32 +47,33 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double beta1,
    *  - t: Timestep, starting at 0.
    *  - m: State containing the 1st moment (mean) estimate by
    *      maintaining exponential moving averages of the gradients, of
-   *      same shape as X.
+   *      same shape as `X`.
    *  - v: State containing the 2nd raw moment (uncentered variance)
    *      estimate by maintaining exponential moving averages of the
-   *      squared gradients, of same shape as X.
+   *      squared gradients, of same shape as `X`.
    *
    * Outputs:
-   *  - X: Updated parameters X, of same shape as input X.
+   *  - X: Updated parameters `X`, of same shape as input `X`.
    *  - m: Updated state containing the 1st moment (mean) estimate by
    *      maintaining exponential moving averages of the gradients, of
-   *      same shape as X.
+   *      same shape as `X`.
    *  - v: Updated state containing the 2nd raw moment (uncentered
    *      variance) estimate by maintaining exponential moving averages
-   *      of the squared gradients, of same shape as X.
+   *      of the squared gradients, of same shape as `X`.
    */
   t = t + 1
-  m = beta1 * m + (1 - beta1) * dX  # update biased 1st moment estimate
-  v = beta2 * v + (1 - beta2) * dX^2  # update biased 2nd raw moment estimate
-  #m = m / (1 - beta1^t)  # compute bias-corrected 1st moment estimate
-  #v = v / (1 - beta2^t)  # compute bias-corrected 2nd raw moment estimate
-  #X = X - lr * m / (sqrt(v) + epsilon)  # param update
+  m = beta1*m + (1-beta1)*dX  # update biased 1st moment estimate
+  v = beta2*v + (1-beta2)*dX^2  # update biased 2nd raw moment estimate
+  # m = m / (1-beta1^t)  # compute bias-corrected 1st moment estimate
+  # v = v / (1-beta2^t)  # compute bias-corrected 2nd raw moment estimate
+  # X = X - (lr * m / (sqrt(v)+epsilon))  # param update
   # Simplified for computational efficiency:
-  lr = lr * sqrt(1 - beta2^t) / (1 - beta1^t)
-  X = X - lr * m / (sqrt(v) + epsilon)
+  lr = lr * sqrt(1-beta2^t) / (1-beta1^t)
+  X = X - (lr * m / (sqrt(v)+epsilon))
 }
 
-init = function(matrix[double] X) return (matrix[double] m, matrix[double] v) {
+init = function(matrix[double] X)
+    return (matrix[double] m, matrix[double] v) {
   /*
    * Initialize the state for this optimizer.
    *
@@ -80,14 +82,14 @@ init = function(matrix[double] X) return (matrix[double] m, matrix[double] v) {
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   * 
+   *
    * Outputs:
    *  - m: Initial state containing the 1st moment (mean) estimate by
    *      maintaining exponential moving averages of the gradients, of
-   *      same shape as X.
+   *      same shape as `X`.
    *  - v: Initial state containing the 2nd raw moment (uncentered
    *      variance) estimate by maintaining exponential moving averages
-   *      of the squared gradients, of same shape as X.
+   *      of the squared gradients, of same shape as `X`.
    */
   m = matrix(0, rows=nrow(X), cols=ncol(X))
   v = matrix(0, rows=nrow(X), cols=ncol(X))

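The simplification above folds the two bias corrections into the learning rate: dividing the corrected estimates, `(m/(1-beta1^t)) / (sqrt(v/(1-beta2^t)) + epsilon)`, matches `(sqrt(1-beta2^t)/(1-beta1^t)) * m / (sqrt(v) + epsilon)` up to the `epsilon` term. A minimal usage sketch with a placeholder gradient (note that `update` increments `t` internally but does not return it, so the caller tracks the timestep):

  source("nn/optim/adam.dml") as adam

  X = rand(rows=5, cols=5)
  dX = rand(rows=5, cols=5, min=-1, max=1)  # stand-in gradient
  [m, v] = adam::init(X)
  t = 0
  [X, m, v] = adam::update(X, dX, 0.001, 0.9, 0.999, 1e-8, t, m, v)
  t = t + 1  # pass the incremented timestep on the next call
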
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml b/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
index e256000..80c75a0 100644
--- a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
@@ -22,6 +22,7 @@
 /*
  * RMSprop optimizer.
  */
+
 update = function(matrix[double] X, matrix[double] dX, double lr, double decay_rate,
                   double epsilon, matrix[double] cache)
     return (matrix[double] X, matrix[double] cache) {
@@ -39,26 +40,27 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double decay_r
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient of X wrt to a loss function being optimized, of
-   *      same shape as X.
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
    *  - lr: Learning rate.
    *  - decay_rate: Term controlling the rate of the moving average.
    *      Typical values are in the range of [0.9, 0.999].
    *  - epsilon: Smoothing term to avoid divide by zero errors.
    *      Typical values are in the range of [1e-8, 1e-4].
    *  - cache: State that maintains the moving average of the squared
-   *      gradients, of same shape as X.
+   *      gradients, of same shape as `X`.
    *
    * Outputs:
-   *  - X: Updated parameters X, of same shape as input X.
-   *  - v: Updated velocity of the parameters X, of same shape as
-   *      input v.
+   *  - X: Updated parameters `X`, of same shape as input `X`.
+   *  - cache: Updated state that maintains the moving average of the
+   *      squared gradients, of same shape as `X`.
    */
-  cache = decay_rate * cache + (1 - decay_rate) * dX^2
-  X = X - lr * dX / (sqrt(cache) + epsilon)
+  cache = decay_rate*cache + (1-decay_rate)*dX^2
+  X = X - (lr * dX / (sqrt(cache)+epsilon))
 }
 
-init = function(matrix[double] X) return (matrix[double] cache) {
+init = function(matrix[double] X)
+    return (matrix[double] cache) {
   /*
    * Initialize the state for this optimizer.
    *
@@ -67,10 +69,10 @@ init = function(matrix[double] X) return (matrix[double] cache) {
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   * 
+   *
    * Outputs:
    *  - cache: State that maintains the moving average of the squared
-   *      gradients, of same shape as X.
+   *      gradients, of same shape as `X`.
    */
   cache = matrix(0, rows=nrow(X), cols=ncol(X))
 }

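Unlike Adagrad's ever-growing cache, the moving average here decays, so the effective step size can recover. A minimal usage sketch with a placeholder gradient:

  source("nn/optim/rmsprop.dml") as rmsprop

  X = rand(rows=5, cols=5)
  dX = rand(rows=5, cols=5, min=-1, max=1)  # stand-in gradient
  cache = rmsprop::init(X)
  [X, cache] = rmsprop::update(X, dX, 0.001, 0.99, 1e-8, cache)
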
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/sgd.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd.dml b/scripts/staging/SystemML-NN/nn/optim/sgd.dml
index 554569a..a3fc744 100644
--- a/scripts/staging/SystemML-NN/nn/optim/sgd.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/sgd.dml
@@ -22,19 +22,21 @@
 /*
  * Stochastic Gradient Descent (SGD) optimizer.
  */
-update = function(matrix[double] X, matrix[double] dX, double lr) return (matrix[double] X) {
+
+update = function(matrix[double] X, matrix[double] dX, double lr)
+    return (matrix[double] X) {
   /*
    * Performs a vanilla SGD update.
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient of X wrt to a loss function being optimized, of
-   *      same shape as X.
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
    *  - lr: Learning rate.
    *
    * Outputs:
-   *  - X: Updated parameters X, of same shape as input X.
+   *  - X: Updated parameters `X`, of same shape as input `X`.
    */
-  X = X - lr * dX
+  X = X - lr*dX
 }
 

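Vanilla SGD is stateless, so there is no `init` function; a one-step sketch with a placeholder gradient:

  source("nn/optim/sgd.dml") as sgd

  X = rand(rows=5, cols=5)
  dX = rand(rows=5, cols=5, min=-1, max=1)  # stand-in gradient
  X = sgd::update(X, dX, 0.01)
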
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
index c2a441b..2cb9890 100644
--- a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
@@ -22,6 +22,7 @@
 /*
  * Stochastic Gradient Descent with momentum (SGD-momentum) optimizer.
  */
+
 update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v)
     return (matrix[double] X, matrix[double] v) {
   /*
@@ -33,25 +34,26 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double mu, mat
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient of X wrt to a loss function being optimized, of
-   *      same shape as X.
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
    *  - lr: Learning rate.
    *  - mu: Momentum value.
    *      Typical values are in the range of [0.5, 0.99], usually
    *      started at the lower end and annealed towards the higher end.
-   *  - v: State maintaining the velocity of the parameters X, of same
-   *      shape as X.
+   *  - v: State maintaining the velocity of the parameters `X`, of same
+   *      shape as `X`.
    *
    * Outputs:
-   *  - X: Updated parameters X, of same shape as input X.
-   *  - v: Updated velocity of the parameters X, of same shape as
-   *      input v.
+   *  - X: Updated parameters `X`, of same shape as input `X`.
+   *  - v: Updated velocity of the parameters `X`, of same shape as
+   *      input `X`.
    */
-  v = mu * v - lr * dX  # update velocity
+  v = mu*v - lr*dX  # update velocity
   X = X + v  # update position
 }
 
-init = function(matrix[double] X) return (matrix[double] v) {
+init = function(matrix[double] X)
+    return (matrix[double] v) {
   /*
    * Initialize the state for this optimizer.
    *
@@ -60,9 +62,9 @@ init = function(matrix[double] X) return (matrix[double] v) {
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   * 
+   *
    * Outputs:
-   *  - v: Initial velocity of the parameters X.
+   *  - v: Initial velocity of the parameters `X`.
    */
   v = matrix(0, rows=nrow(X), cols=ncol(X))
 }

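With a constant gradient, the velocity update `v = mu*v - lr*dX` converges toward `-lr*dX / (1-mu)`, i.e. momentum amplifies persistent gradient directions by up to a factor of `1/(1-mu)`. A two-step sketch with a placeholder gradient:

  source("nn/optim/sgd_momentum.dml") as sgd_momentum

  X = rand(rows=5, cols=5)
  dX = rand(rows=5, cols=5, min=-1, max=1)  # stand-in constant gradient
  v = sgd_momentum::init(X)
  [X, v] = sgd_momentum::update(X, dX, 0.01, 0.9, v)  # v is now -lr*dX
  [X, v] = sgd_momentum::update(X, dX, 0.01, 0.9, v)  # v is now -(1+mu)*lr*dX
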
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
index 56c6ab0..fee6585 100644
--- a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
+++ b/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
@@ -22,6 +22,7 @@
 /*
  * Stochastic Gradient Descent with Nesterov momentum (SGD-Nesterov) optimizer.
  */
+
 update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v)
     return (matrix[double] X, matrix[double] v) {
   /*
@@ -36,19 +37,20 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double mu, mat
    * store the parameters in their position after momentum.
    *
    * Reference:
-   *  - Advances in optimizing Recurrent Networks, Bengio et al., section 3.5.
+   *  - Advances in optimizing Recurrent Networks, Bengio et al.,
+   *    section 3.5.
    *    - http://arxiv.org/abs/1212.0901
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient of X wrt to a loss function being optimized, of
-   *      same shape as X.
+   *  - dX: Gradient wrt `X` of a loss function being optimized, of
+   *      same shape as `X`.
    *  - lr: Learning rate.
    *  - mu: Momentum value.
    *      Typical values are in the range of [0.5, 0.99], usually
    *      started at the lower end and annealed towards the higher end.
-   *  - v: State maintaining the velocity of the parameters X, of same
-   *      shape as X.
+   *  - v: State maintaining the velocity of the parameters `X`, of same
+   *      shape as `X`.
    *
    * Outputs:
    *  - X: Updated parameters X, of same shape as input X.
@@ -56,11 +58,12 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double mu, mat
    *      input v.
    */
   v_prev = v
-  v = mu * v - lr * dX  # update velocity
-  X = X - mu * v_prev + (1 + mu) * v  # update position, including momentum
+  v = mu*v - lr*dX  # update velocity
+  X = X - mu*v_prev + (1+mu)*v  # update position, including momentum
 }
 
-init = function(matrix[double] X) return (matrix[double] v) {
+init = function(matrix[double] X)
+    return (matrix[double] v) {
   /*
    * Initialize the state for this optimizer.
    *
@@ -69,9 +72,9 @@ init = function(matrix[double] X) return (matrix[double] v) {
    *
    * Inputs:
    *  - X: Parameters to update, of shape (any, any).
-   * 
+   *
    * Outputs:
-   *  - v: Initial velocity of the parameters X.
+   *  - v: Initial velocity of the parameters `X`.
    */
   v = matrix(0, rows=nrow(X), cols=ncol(X))
 }
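
Because the stored `X` sits at the position *after* the momentum step (`X_stored = X_classical + mu*v`), the gradient passed to `update` should be evaluated at the stored, looked-ahead position. A minimal usage sketch with a placeholder gradient:

  source("nn/optim/sgd_nesterov.dml") as sgd_nesterov

  X = rand(rows=5, cols=5)
  dX = rand(rows=5, cols=5, min=-1, max=1)  # stand-in gradient at the stored position
  v = sgd_nesterov::init(X)
  [X, v] = sgd_nesterov::update(X, dX, 0.01, 0.9, v)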