Posted to commits@systemml.apache.org by du...@apache.org on 2017/04/26 21:42:32 UTC

[06/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn` library to `scripts/nn`

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
deleted file mode 100644
index a529a12..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
+++ /dev/null
@@ -1,178 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * MNIST Softmax Example
- */
-# Imports
-source("nn/layers/affine.dml") as affine
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/softmax.dml") as softmax
-source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
-
-train = function(matrix[double] X, matrix[double] y,
-                 matrix[double] X_val, matrix[double] y_val,
-                 int epochs)
-    return (matrix[double] W, matrix[double] b) {
-  /*
-   * Trains a softmax classifier.
-   *
-   * The input matrix, X, has N examples, each with D features.
-   * The targets, y, have K classes, and are one-hot encoded.
-   *
-   * Inputs:
-   *  - X: Input data matrix, of shape (N, D).
-   *  - y: Target matrix, of shape (N, K).
-   *  - X_val: Input validation data matrix, of shape (N, D).
-   *  - y_val: Target validation matrix, of shape (N, K).
-   *  - epochs: Total number of full training loops over the full data set.
-   *
-   * Outputs:
-   *  - W: Weights (parameters) matrix, of shape (D, M).
-   *  - b: Biases vector, of shape (1, M).
-   */
-  N = nrow(X)  # num examples
-  D = ncol(X)  # num features
-  K = ncol(y)  # num classes
-
-  # Create softmax classifier:
-  # affine -> softmax
-  [W, b] = affine::init(D, K)
-  W = W / sqrt(2.0/(D)) * sqrt(1/(D))
-
-  # Initialize SGD w/ Nesterov momentum optimizer
-  lr = 0.2  # learning rate
-  mu = 0  # momentum
-  decay = 0.99  # learning rate decay constant
-  vW = sgd_nesterov::init(W)  # optimizer momentum state for W
-  vb = sgd_nesterov::init(b)  # optimizer momentum state for b
-
-  # Optimize
-  print("Starting optimization")
-  batch_size = 50
-  iters = 1000 #ceil(N / batch_size)
-  for (e in 1:epochs) {
-    for(i in 1:iters) {
-      # Get next batch
-      beg = ((i-1) * batch_size) %% N + 1
-      end = min(N, beg + batch_size - 1)
-      X_batch = X[beg:end,]
-      y_batch = y[beg:end,]
-
-      # Compute forward pass
-      ## affine & softmax:
-      out = affine::forward(X_batch, W, b)
-      probs = softmax::forward(out)
-
-      # Compute loss & accuracy for training & validation data
-      loss = cross_entropy_loss::forward(probs, y_batch)
-      accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
-      probs_val = predict(X_val, W, b)
-      loss_val = cross_entropy_loss::forward(probs_val, y_val)
-      accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
-      print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: " +
-            accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
-
-      # Compute backward pass
-      ## loss:
-      dprobs = cross_entropy_loss::backward(probs, y_batch)
-      ## affine & softmax:
-      dout = softmax::backward(dprobs, out)
-      [dX_batch, dW, db] = affine::backward(dout, X_batch, W, b)
-
-      # Optimize with SGD w/ Nesterov momentum
-      [W, vW] = sgd_nesterov::update(W, dW, lr, mu, vW)
-      [b, vb] = sgd_nesterov::update(b, db, lr, mu, vb)
-    }
-    # Anneal momentum towards 0.999
-    mu = mu + (0.999 - mu)/(1+epochs-e)
-    # Decay learning rate
-    lr = lr * decay
-  }
-}
-
-predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
-    return (matrix[double] probs) {
-  /*
-   * Computes the class probability predictions of a softmax classifier.
-   *
-   * The input matrix, X, has N examples, each with D features.
-   *
-   * Inputs:
-   *  - X: Input data matrix, of shape (N, D).
-   *  - W: Weights (parameters) matrix, of shape (D, M).
-   *  - b: Biases vector, of shape (1, M).
-   *
-   * Outputs:
-   *  - probs: Class probabilities, of shape (N, K).
-   */
-  # Compute forward pass
-  ## affine & softmax:
-  out = affine::forward(X, W, b)
-  probs = softmax::forward(out)
-}
-
-eval = function(matrix[double] probs, matrix[double] y)
-    return (double loss, double accuracy) {
-  /*
-   * Evaluates a softmax classifier.
-   *
-   * The probs matrix contains the class probability predictions
-   * of K classes over N examples.  The targets, y, have K classes,
-   * and are one-hot encoded.
-   *
-   * Inputs:
-   *  - probs: Class probabilities, of shape (N, K).
-   *  - y: Target matrix, of shape (N, K).
-   *
-   * Outputs:
-   *  - loss: Scalar loss, of shape (1).
-   *  - accuracy: Scalar accuracy, of shape (1).
-   */
-  # Compute loss & accuracy
-  loss = cross_entropy_loss::forward(probs, y)
-  correct_pred = rowIndexMax(probs) == rowIndexMax(y)
-  accuracy = mean(correct_pred)
-}
-
-generate_dummy_data = function()
-    return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
-  /*
-   * Generate a dummy dataset similar to the MNIST dataset.
-   *
-   * Outputs:
-   *  - X: Input data matrix, of shape (N, D).
-   *  - y: Target matrix, of shape (N, K).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   */
-  # Generate dummy input data
-  N = 1024  # num examples
-  C = 1  # num input channels
-  Hin = 28  # input height
-  Win = 28  # input width
-  T = 10  # num targets
-  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
-  classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))
-  y = table(seq(1, N), classes)  # one-hot encoding
-}
-

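For orientation, here is a minimal driver sketch (not part of this commit) showing how the example script above could be exercised end to end on its own dummy data. The relative path and the train/validation split are assumptions for illustration; after graduation the library lives under `scripts/nn`, so this would be run from the `scripts/` directory.

# Hypothetical driver; assumes `nn/` is reachable from the working directory.
source("nn/examples/mnist_softmax.dml") as mnist_softmax

# Dummy MNIST-like data, split into training and validation halves
[X, y, C, Hin, Win] = mnist_softmax::generate_dummy_data()
n = nrow(X)
half = as.integer(floor(n/2))
X_train = X[1:half,]
y_train = y[1:half,]
X_val = X[(half+1):n,]
y_val = y[(half+1):n,]

# Train for a single epoch, then evaluate on the held-out half
[W, b] = mnist_softmax::train(X_train, y_train, X_val, y_val, 1)
probs = mnist_softmax::predict(X_val, W, b)
[loss, accuracy] = mnist_softmax::eval(probs, y_val)
print("Val loss: " + loss + ", Val accuracy: " + accuracy)
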
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/affine.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/affine.dml b/scripts/staging/SystemML-NN/nn/layers/affine.dml
deleted file mode 100644
index c9a740b..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/affine.dml
+++ /dev/null
@@ -1,92 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Affine (fully-connected) layer.
- */
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b)
-    return (matrix[double] out) {
-  /*
-   * Computes the forward pass for an affine (fully-connected) layer
-   * with M neurons.  The input data has N examples, each with D
-   * features.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, D).
-   *  - W: Weights, of shape (D, M).
-   *  - b: Biases, of shape (1, M).
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, M).
-   */
-  out = X %*% W + b
-}
-
-backward = function(matrix[double] dout, matrix[double] X,
-                    matrix[double] W, matrix[double] b)
-    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
-  /*
-   * Computes the backward pass for a fully-connected (affine) layer
-   * with M neurons.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of shape (N, M).
-   *  - X: Inputs, of shape (N, D).
-   *  - W: Weights, of shape (D, M).
-   *  - b: Biases, of shape (1, M).
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, D).
-   *  - dW: Gradient wrt `W`, of shape (D, M).
-   *  - db: Gradient wrt `b`, of shape (1, M).
-   */
-  dX = dout %*% t(W)
-  dW = t(X) %*% dout
-  db = colSums(dout)
-}
-
-init = function(int D, int M)
-    return (matrix[double] W, matrix[double] b) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * We use the heuristic by He et al., which limits the magnification
-   * of inputs/gradients during forward/backward passes by scaling
-   * unit-Gaussian weights by a factor of sqrt(2/n), under the
-   * assumption of relu neurons.
-   *  - http://arxiv.org/abs/1502.01852
-   *
-   * Inputs:
-   *  - D: Dimensionality of the input features (number of features).
-   *  - M: Number of neurons in this layer.
-   *
-   * Outputs:
-   *  - W: Weights, of shape (D, M).
-   *  - b: Biases, of shape (1, M).
-   */
-  W = rand(rows=D, cols=M, pdf="normal") * sqrt(2.0/D)
-  b = matrix(0, rows=1, cols=M)
-}
-

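As a quick illustration of the `init`/`forward`/`backward` API documented above, a minimal usage sketch (not from the commit); the dimensions and the stand-in upstream gradient are made up, and the path assumes the `nn` directory is on the working path:

source("nn/layers/affine.dml") as affine

N = 4   # examples
D = 8   # input features
M = 3   # neurons
X = rand(rows=N, cols=D, pdf="normal")
[W, b] = affine::init(D, M)

out = affine::forward(X, W, b)             # shape (N, M)
dout = rand(rows=N, cols=M, pdf="normal")  # stand-in for an upstream gradient
[dX, dW, db] = affine::backward(dout, X, W, b)
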
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml
deleted file mode 100644
index 2ccffdb..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml
+++ /dev/null
@@ -1,210 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 1D Batch Normalization layer.
- */
-
-forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
-                   string mode, matrix[double] ema_mean, matrix[double] ema_var,
-                   double mu, double epsilon)
-    return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
-            matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
-  /*
-   * Computes the forward pass for a 1D batch normalization layer.
-   * The input data has N examples, each with D features.
-   *
-   * A batch normalization layer uses the per-feature sample mean and
-   * per-feature uncorrected sample variance during training to
-   * normalize each feature of the input data.  Additionally, it
-   * introduces learnable parameters (gamma, beta) to control the
-   * amount of normalization.
-   *
-   *   `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
-   *
-   * This implementation maintains exponential moving averages of the
-   * mean and variance during training for use during testing.
-   *
-   * Reference:
-   *  - Batch Normalization: Accelerating Deep Network Training by
-   *    Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
-   *    - https://arxiv.org/abs/1502.03167
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, D).
-   *  - gamma: Scale parameters, of shape (1, D).
-   *  - beta: Shift parameters, of shape (1, D).
-   *  - mode: 'train' or 'test' to indicate if the model is currently
-   *      being trained or tested.  During training, the current batch
-   *      mean and variance will be used to normalize the inputs, while
-   *      during testing, the exponential average of the mean and
-   *      variance over all previous batches will be used.
-   *  - ema_mean: Exponential moving average of the mean, of
-   *      shape (1, D).
-   *  - ema_var: Exponential moving average of the variance, of
-   *      shape (1, D).
-   *  - mu: Momentum value for moving averages.
-   *      Typical values are in the range of [0.9, 0.999].
-   *  - epsilon: Smoothing term to avoid divide by zero errors.
-   *      Typical values are in the range of [1e-5, 1e-3].
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, D).
-   *  - ema_mean_upd: Updated exponential moving average of the mean,
-   *      of shape (1, D).
-   *  - ema_var_upd: Updated exponential moving average of the variance,
-   *      of shape (1, D).
-   *  - cache_mean: Cache of the batch mean, of shape (1, D).
-   *      Note: This is used for performance during training.
-   *  - cache_var: Cache of the batch variance, of shape (1, D).
-   *      Note: This is used for performance during training.
-   *  - cache_norm: Cache of the normalized inputs, of shape (N, D).
-   *      Note: This is used for performance during training.
-   */
-  N = nrow(X)
-
-  if (mode == 'train') {
-    # Compute feature-wise mean and variance
-    mean = colMeans(X)  # shape (1, D)
-    # var = (1/N) * colSums((X-mean)^2)
-    var = colVars(X) * ((N-1)/N)  # compute uncorrected variance, of shape (1, D)
-    # Update moving averages
-    ema_mean_upd = mu*ema_mean + (1-mu)*mean
-    ema_var_upd = mu*ema_var + (1-mu)*var
-  }
-  else {
-    # Use moving averages of mean and variance during testing
-    mean = ema_mean
-    var = ema_var
-    ema_mean_upd = ema_mean
-    ema_var_upd = ema_var
-  }
-
-  # Normalize, shift, and scale
-  # norm = (X-mean)*(var+epsilon)^(-1/2)
-  norm = (X-mean) / sqrt(var+epsilon)  # shape (N, D)
-  out = norm*gamma + beta  # shape (N, D)
-
-  # Save variable for backward pass
-  cache_mean = mean
-  cache_var = var
-  cache_norm = norm
-}
-
-backward = function(matrix[double] dout, matrix[double] out,
-                    matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
-                    matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
-                    matrix[double] X, matrix[double] gamma, matrix[double] beta,
-                    string mode, matrix[double] ema_mean, matrix[double] ema_var,
-                    double mu, double epsilon)
-      return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
-  /*
-   * Computes the backward pass for a 1D batch normalization layer.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of shape (N, D).
-   *  - out: Outputs from the forward pass, of shape (N, D).
-   *  - ema_mean_upd: Updated exponential moving average of the mean
-   *      from the forward pass, of shape (1, D).
-   *  - ema_var_upd: Updated exponential moving average of the variance
-   *      from the forward pass, of shape (1, D).
-   *  - cache_mean: Cache of the batch mean from the forward pass, of
-   *      shape (1, D).  Note: This is used for performance during
-   *      training.
-   *  - cache_var: Cache of the batch variance from the forward pass,
-   *      of shape (1, D).  Note: This is used for performance during
-   *      training.
-   *  - cache_norm: Cache of the normalized inputs from the forward
-   *      pass, of shape (N, D).  Note: This is used for performance
-   *      during training.
-   *  - X: Inputs, of shape (N, D).
-   *  - gamma: Scale parameters, of shape (1, D).
-   *  - beta: Shift parameters, of shape (1, D).
-   *  - mode: 'train' or 'test' to indicate if the model is currently
-   *      being trained or tested.  During training, the current batch
-   *      mean and variance will be used to normalize the inputs, while
-   *      during testing, the exponential average of the mean and
-   *      variance over all previous batches will be used.
-   *  - ema_mean: Exponential moving average of the mean, of
-   *      shape (1, D).
-   *  - ema_var: Exponential moving average of the variance, of
-   *      shape (1, D).
-   *  - mu: Momentum value for moving averages.
-   *      Typical values are in the range of [0.9, 0.999].
-   *  - epsilon: Smoothing term to avoid divide by zero errors.
-   *      Typical values are in the range of [1e-5, 1e-3].
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, D).
-   *  - dgamma: Gradient wrt `gamma`, of shape (1, D).
-   *  - dbeta: Gradient wrt `beta`, of shape (1, D).
-   *
-   */
-  N = nrow(X)
-  mean = cache_mean
-  var = cache_var
-  norm = cache_norm
-  centered = X-mean
-
-  if (mode == 'train') {
-    # Compute gradients during training
-    dgamma = colSums(dout*norm)  # shape (1, D)
-    dbeta = colSums(dout)  # shape (1, D)
-    dnorm = dout * gamma  # shape (N, D)
-    dvar = (-1/2) * colSums(centered * (var+epsilon)^(-3/2) * dnorm)  # shape (1, D)
-    dmean = colSums((-dnorm/sqrt(var+epsilon)) + ((-2/N)*centered*dvar))  # shape (1, D)
-    dX = (dnorm/sqrt(var+epsilon)) + ((2/N)*centered*dvar) + ((1/N)*dmean)  # shape (N, D)
-  }
-  else {
-    # Compute gradients during testing
-    dgamma = colSums(dout*norm)  # shape (1, D)
-    dbeta = colSums(dout)  # shape (1, D)
-    dnorm = dout * gamma  # shape (N, D)
-    dX = dnorm / sqrt(var+epsilon)  # shape (N, D)
-  }
-}
-
-init = function(int D)
-    return (matrix[double] gamma, matrix[double] beta,
-            matrix[double] ema_mean, matrix[double] ema_var) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - D: Dimensionality of the input features (number of features).
-   *
-   * Outputs:
-   *  - gamma: Scale parameters, of shape (1, D).
-   *  - beta: Shift parameters, of shape (1, D).
-   *  - ema_mean: Exponential moving average of the mean, of
-   *      shape (1, D).
-   *  - ema_var: Exponential moving average of the variance, of
-   *      shape (1, D).
-   */
-   gamma = matrix(1, rows=1, cols=D)
-   beta = matrix(0, rows=1, cols=D)
-   ema_mean = matrix(0, rows=1, cols=D)
-   ema_var = matrix(1, rows=1, cols=D)
-}
-

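A hedged sketch (not part of the commit) of a single train-mode step with the layer above; the forward pass returns the updated moving averages and the caches that the backward pass consumes. Dimensions and the upstream gradient are illustrative only:

source("nn/layers/batch_norm1d.dml") as batch_norm1d

N = 16
D = 8
X = rand(rows=N, cols=D, pdf="normal")
[gamma, beta, ema_mean, ema_var] = batch_norm1d::init(D)
mu = 0.9     # momentum for the moving averages
eps = 1e-5   # numerical-stability term

# Train-mode forward pass: updates the moving averages and fills the caches
[out, ema_mean, ema_var, cache_mean, cache_var, cache_norm] = batch_norm1d::forward(X, gamma, beta,
    "train", ema_mean, ema_var, mu, eps)

# Backward pass reuses the forward outputs and caches
dout = rand(rows=N, cols=D, pdf="normal")  # stand-in upstream gradient
[dX, dgamma, dbeta] = batch_norm1d::backward(dout, out, ema_mean, ema_var,
                                             cache_mean, cache_var, cache_norm,
                                             X, gamma, beta, "train",
                                             ema_mean, ema_var, mu, eps)
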
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml
deleted file mode 100644
index 49c6746..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml
+++ /dev/null
@@ -1,238 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D (Spatial) Batch Normalization layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
-                   int C, int Hin, int Win, string mode,
-                   matrix[double] ema_mean, matrix[double] ema_var,
-                   double mu, double epsilon)
-    return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
-            matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
-  /*
-   * Computes the forward pass for a 2D (spatial) batch normalization
-   * layer.  The input data has N examples, each represented as a 3D
-   * volume unrolled into a single vector.
-   *
-   * A spatial batch normalization layer uses the per-channel sample
-   * mean and per-channel uncorrected sample variance during training
-   * to normalize each channel of the input data.  Additionally, it
-   * introduces learnable parameters (gamma, beta) to control the
-   * amount of normalization.
-   *
-   *   `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
-   *
-   * This implementation maintains exponential moving averages of the
-   * mean and variance during training for use during testing.
-   *
-   * Reference:
-   *  - Batch Normalization: Accelerating Deep Network Training by
-   *    Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
-   *    - https://arxiv.org/abs/1502.03167
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - gamma: Scale parameters, of shape (C, 1).
-   *  - beta: Shift parameters, of shape (C, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - mode: 'train' or 'test' to indicate if the model is currently
-   *      being trained or tested.  During training, the current batch
-   *      mean and variance will be used to normalize the inputs, while
-   *      during testing, the exponential average of the mean and
-   *      variance over all previous batches will be used.
-   *  - ema_mean: Exponential moving average of the mean, of
-   *      shape (C, 1).
-   *  - ema_var: Exponential moving average of the variance, of
-   *      shape (C, 1).
-   *  - mu: Momentum value for moving averages.
-   *      Typical values are in the range of [0.9, 0.999].
-   *  - epsilon: Smoothing term to avoid divide by zero errors.
-   *      Typical values are in the range of [1e-5, 1e-3].
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, C*Hin*Win).
-   *  - ema_mean_upd: Updated exponential moving average of the mean,
-   *      of shape (C, 1).
-   *  - ema_var_upd: Updated exponential moving average of the variance,
-   *      of shape (C, 1).
-   *  - cache_mean: Cache of the batch mean, of shape (C, 1).
-   *      Note: This is used for performance during training.
-   *  - cache_var: Cache of the batch variance, of shape (C, 1).
-   *      Note: This is used for performance during training.
-   *  - cache_norm: Cache of the normalized inputs, of
-   *      shape (C, N*Hin*Win). Note: This is used for performance
-   *      during training.
-   */
-  N = nrow(X)
-
-  if (mode == 'train') {
-    # Compute channel-wise mean and variance
-    # Since we don't have tensors, we will compute the means and variances in a piece-wise fashion.
-    #  - mean of total group is mean of subgroup means
-    #  - variance is the mean of the subgroup variances + the variance of the subgroup means
-    subgrp_means = matrix(colMeans(X), rows=C, cols=Hin*Win)
-    subgrp_vars = matrix(colVars(X) * ((N-1)/N), rows=C, cols=Hin*Win)  # uncorrected variances
-    mean = rowMeans(subgrp_means)  # shape (C, 1)
-    var = rowMeans(subgrp_vars) + rowVars(subgrp_means)*(((Hin*Win)-1)/(Hin*Win))  # shape (C, 1)
-    # Update moving averages
-    ema_mean_upd = mu*ema_mean + (1-mu)*mean
-    ema_var_upd = mu*ema_var + (1-mu)*var
-  }
-  else {
-    # Use moving averages of mean and variance during testing
-    mean = ema_mean
-    var = ema_var
-    ema_mean_upd = ema_mean
-    ema_var_upd = ema_var
-  }
-
-  # Normalize, shift, and scale
-  # norm = (X-mean)*(var+epsilon)^(-1/2)
-  #      = (X-mean) / sqrt(var+epsilon)
-  centered = bias_add(X, -mean)  # shape (N, C*Hin*Win)
-  norm = bias_multiply(centered, 1/sqrt(var+epsilon))  # shape (N, C*Hin*Win)
-  # out = norm*gamma + beta
-  scaled = bias_multiply(norm, gamma)  # shape (N, C*Hin*Win)
-  out = bias_add(scaled, beta)  # shape (N, C*Hin*Win)
-
-  # Save variable for backward pass
-  cache_mean = mean
-  cache_var = var
-  cache_norm = norm
-}
-
-backward = function(matrix[double] dout, matrix[double] out,
-                    matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
-                    matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
-                    matrix[double] X, matrix[double] gamma, matrix[double] beta,
-                    int C, int Hin, int Win, string mode,
-                    matrix[double] ema_mean, matrix[double] ema_var,
-                    double mu, double epsilon)
-      return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
-  /*
-   * Computes the backward pass for a 2D (spatial) batch normalization
-   * layer.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
-   *  - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
-   *  - ema_mean_upd: Updated exponential moving average of the mean
-   *      from the forward pass, of shape (C, 1).
-   *  - ema_var_upd: Updated exponential moving average of the variance
-   *      from the forward pass, of shape (C, 1).
-   *  - cache_mean: Cache of the batch mean from the forward pass, of
-   *      shape (C, 1).  Note: This is used for performance during
-   *      training.
-   *  - cache_var: Cache of the batch variance from the forward pass,
-   *      of shape (C, 1).  Note: This is used for performance during
-   *      training.
-   *  - cache_norm: Cache of the normalized inputs from the forward
-   *      pass, of shape (C, N*Hin*Win).  Note: This is used for
-   *      performance during training.
-   *  - X: Input data matrix to the forward pass, of
-   *      shape (N, C*Hin*Win).
-   *  - gamma: Scale parameters, of shape (C, 1).
-   *  - beta: Shift parameters, of shape (C, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - mode: 'train' or 'test' to indicate if the model is currently
-   *      being trained or tested.  During training, the current batch
-   *      mean and variance will be used to normalize the inputs, while
-   *      during testing, the exponential average of the mean and
-   *      variance over all previous batches will be used.
-   *  - ema_mean: Exponential moving average of the mean, of
-   *      shape (C, 1).
-   *  - ema_var: Exponential moving average of the variance, of
-   *      shape (C, 1).
-   *  - mu: Momentum value for moving averages.
-   *      Typical values are in the range of [0.9, 0.999].
-   *  - epsilon: Smoothing term to avoid divide by zero errors.
-   *      Typical values are in the range of [1e-5, 1e-3].
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   *  - dgamma: Gradient wrt `gamma`, of shape (C, 1).
-   *  - dbeta: Gradient wrt `beta`, of shape (C, 1).
-   *
-   */
-  N = nrow(X)
-  mean = cache_mean
-  var = cache_var
-  norm = cache_norm
-  centered = bias_add(X, -mean)  # shape (N, C*Hin*Win)
-
-  if (mode == 'train') {
-    # Compute gradients during training
-    dgamma = util::channel_sums(dout*norm, C, Hin, Win)  # shape (C, 1)
-    dbeta = util::channel_sums(dout, C, Hin, Win)  # shape (C, 1)
-    dnorm = bias_multiply(dout, gamma)  # shape (N, C*Hin*Win)
-    dvar = util::channel_sums((-1/2) * bias_multiply(centered, (var+epsilon)^(-3/2)) * dnorm,
-                              C, Hin, Win)  # shape (C, 1)
-    dmean_norm_branch = util::channel_sums(bias_multiply(dnorm, -1/sqrt(var+epsilon)), C, Hin, Win)
-    dmean_var_branch =  util::channel_sums((-2/(N*Hin*Win)) * centered, C, Hin, Win)
-    dmean_var_branch = dmean_var_branch * dvar  # we can't use a function within an expression yet
-    dmean = dmean_norm_branch + dmean_var_branch  # shape (C, 1)
-    dX_norm_branch = bias_multiply(dnorm, 1/sqrt(var+epsilon))
-    dX_mean_branch = (1/(N*Hin*Win)) * bias_add(matrix(0, rows=1, cols=C*Hin*Win), dmean)
-    dX_var_branch = (2/(N*Hin*Win)) * bias_multiply(centered, dvar)
-    dX = dX_norm_branch + dX_mean_branch + dX_var_branch  # shape (N, C*Hin*Win)
-  }
-  else {
-    # Compute gradients during testing
-    dgamma = util::channel_sums(dout*norm, C, Hin, Win)  # shape (C, 1)
-    dbeta = util::channel_sums(dout, C, Hin, Win)  # shape (C, 1)
-    dnorm = bias_multiply(dout, gamma)  # shape (N, C*Hin*Win)
-    dX = bias_multiply(dnorm, 1/sqrt(var+epsilon))  # shape (N, C*Hin*Win)
-  }
-}
-
-init = function(int C)
-    return (matrix[double] gamma, matrix[double] beta,
-            matrix[double] ema_mean, matrix[double] ema_var) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - C: Number of input channels (dimensionality of input depth).
-   *
-   * Outputs:
-   *  - gamma: Scale parameters, of shape (C, 1).
-   *  - beta: Shift parameters, of shape (C, 1).
-   *  - ema_mean: Exponential moving average of the mean, of
-   *      shape (C, 1).
-   *  - ema_var: Exponential moving average of the variance, of
-   *      shape (C, 1).
-   */
-   gamma = matrix(1, rows=C, cols=1)
-   beta = matrix(0, rows=C, cols=1)
-   ema_mean = matrix(0, rows=C, cols=1)
-   ema_var = matrix(1, rows=C, cols=1)
-}
-

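The 2D variant follows the same calling pattern as `batch_norm1d` above, but the parameters and statistics are per channel, of shape (C, 1), and the inputs carry unrolled volumes of shape (N, C*Hin*Win). A brief, assumed-path sketch of a train-mode forward call:

source("nn/layers/batch_norm2d.dml") as batch_norm2d

N = 8
C = 3
Hin = 4
Win = 4
X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
[gamma, beta, ema_mean, ema_var] = batch_norm2d::init(C)  # per-channel (C, 1) parameters

[out, ema_mean, ema_var, cache_mean, cache_var, cache_norm] = batch_norm2d::forward(X, gamma, beta,
    C, Hin, Win, "train", ema_mean, ema_var, 0.9, 1e-5)
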
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv2d.dml b/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
deleted file mode 100644
index 9d03568..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
+++ /dev/null
@@ -1,194 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Convolutional layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
-                   int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial convolutional layer with
-   * F filters.  The input data has N examples, each represented as a 3D
-   * volume unrolled into a single vector.
-   *
-   * This implementation uses `im2col` internally for each image to
-   * extract local image regions (patches) into columns, and then
-   * performs a matrix multiplication with the filters to compute the
-   * output maps.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      For same output height as input, set `padh = (Hf - 1) / 2`,
-   *      assuming `strideh = 1`.
-   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
-   *      preserves the spatial dimensions of the input.
-   *  - padw: Padding for left and right sides.
-   *      For same output width as input, set `padw = (Wf - 1) / 2`,
-   *      assuming `stridew = 1`.
-   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
-   *      preserves the spatial dimensions of the input.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  F = nrow(W)
-  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
-  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
-  # Create output volume
-  out = matrix(0, rows=N, cols=F*Hout*Wout)
-
-  # Convolution - im2col implementation
-  parfor (n in 1:N) {  # all examples
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
-
-    # Pad image
-    Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0)  # shape (C, (Hin+2*padh)*(Win+2*padw))
-
-    # Extract local image patches into columns with im2col, of shape (C*Hf*Wf, Hout*Wout)
-    Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
-
-    # Convolve patches with filters
-    outn = W %*% Xn_padded_cols + b  # shape (F, Hout*Wout)
-    out[n,] = matrix(outn, rows=1, cols=F*Hout*Wout)  # reshape
-  }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout,
-                    matrix[double] X, matrix[double] W, matrix[double] b,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
-  /*
-   * Computes the backward pass for a 2D spatial convolutional layer
-   * with F filters.
-   *
-   * This implementation uses `im2col` and `col2im` internally.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *  - padw: Padding for left and right sides.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
-   *  - db: Gradient wrt `b`, of shape (F, 1).
-   */
-  N = nrow(X)
-  F = nrow(W)
-
-  # Create gradient volumes
-  # Note: Create convenience gradient volumes for dW and db that will
-  # allow for one gradient to be stored per example, allowing for
-  # parallel computation at the expense of memory.  We will reduce at
-  # the end.
-  dX = matrix(0, rows=N, cols=C*Hin*Win)
-  dWN = matrix(0, rows=N, cols=F*C*Hf*Wf)  # dW = matrix(0, rows=F, cols=C*Hf*Wf)
-  dbN = matrix(0, rows=N, cols=F)  # db = matrix(0, rows=F, cols=1)
-
-  # Partial derivatives for convolution - im2col implementation
-  parfor (n in 1:N) {  # all examples
-    doutn = matrix(dout[n,], rows=F, cols=Hout*Wout)
-
-    # Compute dW
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
-    Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0)  # shape (C, (Hin+2*padh)*(Win+2*padw))
-    Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
-    # dW = dW + doutn %*% t(Xn_padded_cols)
-    dWN[n,] = matrix(doutn %*% t(Xn_padded_cols), rows=1, cols=F*C*Hf*Wf)
-
-    # Compute db
-    # db = db + rowSums(doutn)
-    dbN[n,] = matrix(rowSums(doutn), rows=1, cols=F)
-
-    # Compute dX
-    dXn_padded_cols = t(W) %*% doutn  # shape (C*Hf*Wf, Hout*Wout)
-    dXn_padded = util::col2im(dXn_padded_cols, C, Hin+2*padh, Win+2*padw, Hf, Wf,
-                              strideh, stridew, "add")
-    dXn = util::unpad_image(dXn_padded, Hin, Win, padh, padw)
-    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)  # reshape
-  }
-
-  # Reduce convenience gradient volumes with one gradient per example
-  # into single gradients for W and b.
-  dW = matrix(colSums(dWN), rows=F, cols=C*Hf*Wf)
-  db = matrix(colSums(dbN), rows=F, cols=1)
-}
-
-init = function(int F, int C, int Hf, int Wf)
-    return (matrix[double] W, matrix[double] b) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * We use the heuristic by He et al., which limits the magnification
-   * of inputs/gradients during forward/backward passes by scaling
-   * unit-Gaussian weights by a factor of sqrt(2/n), under the
-   * assumption of relu neurons.
-   *  - http://arxiv.org/abs/1502.01852
-   *
-   * Inputs:
-   *  - F: Number of filters.
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *
-   * Outputs:
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   */
-  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
-  b = matrix(0, rows=F, cols=1)
-}
-

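A minimal sketch (not from the commit) of one forward/backward step through the im2col-based layer above, using 'same' padding with 3x3 filters and unit strides; the sizes and the stand-in gradient are illustrative:

source("nn/layers/conv2d.dml") as conv2d

N = 2
C = 3
Hin = 8
Win = 8
F = 4
Hf = 3
Wf = 3
X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
[W, b] = conv2d::init(F, C, Hf, Wf)

# padh = padw = (Hf-1)/2 = 1 preserves the 8x8 spatial size at stride 1
[out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)

dout = rand(rows=N, cols=F*Hout*Wout, pdf="normal")  # stand-in upstream gradient
[dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)
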
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
deleted file mode 100644
index bda7a9c..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
+++ /dev/null
@@ -1,160 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Convolutional layer.
- *
- * This implementation uses a built-in operator for higher performance.
- */
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
-                   int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial convolutional layer with
-   * F filters.  The input data has N examples, each represented as a 3D
-   * volume unrolled into a single vector.
-   *
-   * This implementation uses a built-in operator for higher
-   * performance.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      For same output height as input, set `padh = (Hf - 1) / 2`,
-   *      assuming `strideh = 1`.
-   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
-   *      preserves the spatial dimensions of the input.
-   *  - padw: Padding for left and right sides.
-   *      For same output width as input, set `padw = (Wf - 1) / 2`,
-   *      assuming `stridew = 1`.
-   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
-   *      preserves the spatial dimensions of the input.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  F = nrow(W)
-  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
-  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
-  # Convolution - built-in implementation
-  out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
-               stride=[strideh,stridew], padding=[padh,padw])
-
-  # Add bias term to each output filter
-  out = bias_add(out, b)
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout,
-                    matrix[double] X, matrix[double] W, matrix[double] b,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
-  /*
-   * Computes the backward pass for a 2D spatial convolutional layer
-   * with F filters.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      For same output height as input, set `padh = (Hf - 1) / 2`,
-   *      assuming `strideh = 1`.
-   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
-   *      preserves the spatial dimensions of the input.
-   *  - padw: Padding for left and right sides.
-   *      For same output width as input, set `padw = (Wf - 1) / 2`,
-   *      assuming `stridew = 1`.
-   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
-   *      preserves the spatial dimensions of the input.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
-   *  - db: Gradient wrt `b`, of shape (F, 1).
-   */
-  N = nrow(X)
-  F = nrow(W)
-
-  # Partial derivatives for convolution - built-in implementation
-  dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw],
-                              input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
-  dX = conv2d_backward_data(W, dout, stride=[strideh, stridew], padding=[padh,padw],
-                            input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
-
-  # Partial derivatives for bias vector
-  db = rowSums(matrix(colSums(dout), rows=F, cols=Hout*Wout))
-}
-
-init = function(int F, int C, int Hf, int Wf)
-    return (matrix[double] W, matrix[double] b) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * We use the heuristic by He et al., which limits the magnification
-   * of inputs/gradients during forward/backward passes by scaling
-   * unit-Gaussian weights by a factor of sqrt(2/n), under the
-   * assumption of relu neurons.
-   *  - http://arxiv.org/abs/1502.01852
-   *
-   * Inputs:
-   *  - F: Number of filters.
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *
-   * Outputs:
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   */
-  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
-  b = matrix(0, rows=F, cols=1)
-}
-

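Because `conv2d_builtin.dml` exposes the same `init`/`forward`/`backward` signatures as the im2col-based `conv2d.dml` above, switching implementations amounts to sourcing a different file. A short, assumed-path sketch:

source("nn/layers/conv2d_builtin.dml") as conv2d  # drop-in replacement for nn/layers/conv2d.dml

X = rand(rows=2, cols=3*8*8, pdf="normal")
[W, b] = conv2d::init(4, 3, 3, 3)
[out, Hout, Wout] = conv2d::forward(X, W, b, 3, 8, 8, 3, 3, 1, 1, 1, 1)
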
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml b/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
deleted file mode 100644
index 63db502..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
+++ /dev/null
@@ -1,78 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Cross-Entropy loss function.
- */
-
-forward = function(matrix[double] pred, matrix[double] y)
-    return (double loss) {
-  /*
-   * Computes the forward pass for a cross-entropy loss function.  The
-   * inputs consist of N examples, each with K dimensions corresponding
-   * to normalized probabilities of K classes.
-   *
-   *   ```
-   *   L_i = -y_i^T * log(pred_i)
-   *   L = (1/N) sum(L_i) for i=1 to N
-   *   ```
-   *
-   * In these equations, `L` is the total loss, `L_i` is the loss for
-   * example `i`, `y_i` is the K-dimensional vector of target class
-   * probabilities, `pred_i` is the K-dimensional vector of predicted
-   * class probabilities, and `N` is the number of examples.
-   *
-   * This can be interpreted as the negative log-likelihood assuming
-   * a Bernoulli distribution generalized to K dimensions, or a
-   * Multinomial with one observation.
-   *
-   * Inputs:
-   *  - pred: Predictions, of shape (N, K).
-   *  - y: Targets, of shape (N, K).
-   *
-   * Outputs:
-   *  - loss: Average loss.
-   */
-  N = nrow(y)
-  eps = 1e-10  # numerical stability to avoid log(0)
-  losses = rowSums(-y * log(pred+eps))
-  loss = sum(losses) / N
-}
-
-backward = function(matrix[double] pred, matrix[double] y)
-    return (matrix[double] dpred) {
-  /*
-   * Computes the backward pass of a cross-entropy loss function.  The
-   * inputs consist of N examples, each with K dimensions corresponding
-   * to normalized probabilities of K classes.
-   *
-   * Inputs:
-   *  - pred: Predictions, of shape (N, K).
-   *  - y: Targets, of shape (N, K).
-   *
-   * Outputs:
-   *  - dpred: Gradient wrt `pred`, of shape (N, K).
-   */
-  N = nrow(y)
-  eps = 1e-10  # numerical stability to avoid divide-by-zero
-  dpred = (1/N) * -y * (1/(pred+eps))
-}
-

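A small sketch (not part of the commit) pairing the loss above with the `softmax` layer it is typically fed from, as in the MNIST example earlier in this diff. The class counts, the random scores, and the 4-argument `table` call (used here only to force a full K columns of one-hot targets) are illustrative assumptions:

source("nn/layers/softmax.dml") as softmax
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss

N = 8
K = 3
scores = rand(rows=N, cols=K, pdf="normal")
probs = softmax::forward(scores)                 # normalized class probabilities

classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
y = table(seq(1, N), classes, N, K)              # one-hot targets with exactly K columns

loss = cross_entropy_loss::forward(probs, y)     # scalar average loss
dprobs = cross_entropy_loss::backward(probs, y)  # gradient wrt the predicted probabilities
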
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/dropout.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/dropout.dml b/scripts/staging/SystemML-NN/nn/layers/dropout.dml
deleted file mode 100644
index a36878b..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/dropout.dml
+++ /dev/null
@@ -1,76 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Dropout layer.
- */
-
-forward = function(matrix[double] X, double p, int seed)
-    return (matrix[double] out, matrix[double] mask) {
-  /*
-   * Computes the forward pass for an inverted dropout layer.
-   *
-   * Drops the inputs element-wise with probability 1-p (i.e. each
-   * input is kept with probability p), and divides by p to maintain
-   * the expected values of those inputs (which are the outputs of
-   * neurons) at test time.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (any, any).
-   *  - p: Probability of keeping a neuron output.
-   *  - seed: [Optional: -1] Random number generator seed to allow for
-   *      deterministic evaluation.  Set to -1 for a random seed.
-   *
-   * Outputs:
-   *  - out: Outputs, of same shape as `X`.
-   *  - mask: Dropout mask used to compute the output.
-   */
-  # Normally, we might use something like
-  #    `mask = rand(rows=nrow(X), cols=ncol(X), min=0, max=1, seed=seed) <= p`
-  # to create a dropout mask.  Fortunately, SystemML has a `sparsity` parameter on
-  # the `rand` function that allows us to create a mask directly.
-  if (seed == -1) {
-    mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p)
-  } else {
-    mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p, seed=seed)
-  }
-  out = X * mask / p
-}
-
-backward = function(matrix[double] dout, matrix[double] X, double p, matrix[double] mask)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for an inverted dropout layer.
-   *
-   * Applies the mask to the upstream gradient, and divides by p to
-   * maintain the expected values at test time.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out`, of same shape as `X`.
-   *  - X: Inputs, of shape (any, any).
-   *  - p: Probability of keeping a neuron output.
-   *  - mask: Dropout mask used to compute the output.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of same shape as `X`.
-   */
-  dX = mask / p * dout
-}
-

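A brief usage sketch (not from the commit) of the inverted-dropout layer above; the fixed seed is only to make the mask reproducible, and the sizes are arbitrary:

source("nn/layers/dropout.dml") as dropout

X = rand(rows=4, cols=6, pdf="normal")
p = 0.5  # probability of keeping each activation

# Fixed seed for a reproducible mask; pass -1 for a random mask
[out, mask] = dropout::forward(X, p, 42)

dout = rand(rows=4, cols=6, pdf="normal")  # stand-in upstream gradient
dX = dropout::backward(dout, X, p, mask)   # the same mask must be reused in the backward pass
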
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
deleted file mode 100644
index b74566d..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
+++ /dev/null
@@ -1,72 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * L1 loss function.
- */
-
-forward = function(matrix[double] pred, matrix[double] y)
-    return (double loss) {
-  /*
-   * Computes the forward pass for an L1 loss function.  The inputs
-   * consist of N examples, each with M dimensions to predict.
-   *
-   *   ```
-   *   L_i = sum_j(abs((pred_i)_j - (y_i)_j)) for all j.
-   *   L = (1/N) sum(L_i) for i=1 to N
-   *   ```
-   *
-   * In these equations, `L` is the total loss, `L_i` is the loss for
-   * example `i`, `y_i` is the M-dimensional vector of targets,
-   * `pred_i` is the M-dimensional vector of predictions, and `N` is
-   * the number of examples.
-   *
-   * This can be interpreted as the negative log-likelihood assuming
-   * a Laplace distribution.
-   *
-   * Inputs:
-   *  - pred: Predictions, of shape (N, M).
-   *  - y: Targets, of shape (N, M).
-   *
-   * Outputs:
-   *  - loss: Average loss.
-   */
-  N = nrow(y)
-  losses = rowSums(abs(pred-y))
-  loss = sum(losses) / N
-}
-
-backward = function(matrix[double] pred, matrix[double] y)
-    return (matrix[double] dpred) {
-  /*
-   * Computes the backward pass for an L1 loss function.  The inputs
-   * consist of N examples, each with M dimensions to predict.
-   *
-   * Inputs:
-   *  - pred: Predictions, of shape (N, M).
-   *  - y: Targets, of shape (N, M).
-   *
-   * Outputs:
-   *  - dpred: Gradient wrt `pred`, of shape (N, M).
-   */
-  N = nrow(y)
-  dpred = sign(pred-y) / N
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
deleted file mode 100644
index 2b81c0b..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
+++ /dev/null
@@ -1,56 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * L1 regularization.
- */
-
-forward = function(matrix[double] X, double lambda)
-    return (double reg_loss) {
-  /*
-   * Computes the forward pass for an L1 regularization function.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (any, any).
-   *  - lambda: Regularization strength.
-   *      A typical value is 0.01.
-   *
-   * Outputs:
-   *  - reg_loss: Total regularization loss.
-   */
-  reg_loss = lambda * sum(abs(X))
-}
-
-backward = function(matrix[double] X, double lambda)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for an L1 regularization function.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (any, any).
-   *  - lambda: Regularization strength.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of same shape as `X`.
-   */
-  dX = lambda * sign(X)
-}
-
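
Below is a small sketch of how this regularizer is typically folded into a
training objective; the weight matrix, lambda value, and the commented-out
data-loss terms are illustrative assumptions, not part of the library:

  source("nn/layers/l1_reg.dml") as l1_reg

  lambda = 0.01  # typical regularization strength, per the docstring above
  W = rand(rows=64, cols=10, min=-0.1, max=0.1, pdf="uniform")  # hypothetical weights

  reg_loss = l1_reg::forward(W, lambda)   # scalar penalty: lambda * sum(abs(W))
  dW_reg = l1_reg::backward(W, lambda)    # gradient: lambda * sign(W)
  # total_loss = data_loss + reg_loss
  # dW = dW_data + dW_reg
  print("L1 regularization loss: " + reg_loss)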

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
deleted file mode 100644
index 0482f25..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
+++ /dev/null
@@ -1,72 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * L2 loss function.
- */
-
-forward = function(matrix[double] pred, matrix[double] y)
-    return (double loss) {
-  /*
-   * Computes the forward pass for an L2 loss function.  The inputs
-   * consist of N examples, each with M dimensions to predict.
-   *
-   *   ```
-   *   L_i = (1/2) norm(pred_i - y_i)^2
-   *   L = (1/N) sum(L_i) for i=1 to N
-   *   ```
-   *
-   * In these equations, `L` is the total loss, `L_i` is the loss for
-   * example `i`, `y_i` is the vector of targets for example `i`, `pred_i`
-   * is the corresponding vector of predictions, and `N` is the number of examples.
-   *
-   * This can be interpreted as the negative log-likelihood assuming
-   * a Gaussian distribution.
-   *
-   * Inputs:
-   *  - pred: Predictions, of shape (N, M).
-   *  - y: Targets, of shape (N, M).
-   *
-   * Outputs:
-   *  - loss: Average loss.
-   */
-  N = nrow(y)
-  losses = 0.5 * rowSums((pred-y)^2)
-  loss = sum(losses) / N
-}
-
-backward = function(matrix[double] pred, matrix[double] y)
-    return (matrix[double] dpred) {
-  /*
-   * Computes the backward pass for an L2 loss function.  The inputs
-   * consist of N examples, each with M dimensions to predict.
-   *
-   * Inputs:
-   *  - pred: Predictions, of shape (N, M).
-   *  - y: Targets, of shape (N, M).
-   *
-   * Outputs:
-   *  - dpred: Gradient wrt `pred`, of shape (N, M).
-   */
-  N = nrow(y)
-  dpred = (pred-y) / N
-}
-
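
A minimal sketch of the forward/backward calls for this module, using random
predictions and targets purely for illustration:

  source("nn/layers/l2_loss.dml") as l2_loss

  # Illustrative random batch: N=4 examples, M=3 outputs each
  pred = rand(rows=4, cols=3, min=-1, max=1, pdf="uniform")
  y = rand(rows=4, cols=3, min=-1, max=1, pdf="uniform")

  loss = l2_loss::forward(pred, y)    # average squared-error loss over the batch
  dpred = l2_loss::backward(pred, y)  # gradient wrt the predictions: (pred-y)/N
  print("L2 loss: " + loss)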

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
deleted file mode 100644
index 7255efe..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
+++ /dev/null
@@ -1,56 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * L2 regularization.
- */
-
-forward = function(matrix[double] X, double lambda)
-    return (double reg_loss) {
-  /*
-   * Computes the forward pass for an L2 regularization function.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (any, any).
-   *  - lambda: Regularization strength.
-   *      A typical value is 0.01.
-   *
-   * Outputs:
-   *  - reg_loss: Total regularization loss.
-   */
-  reg_loss = 0.5 * lambda * sum(X^2)
-}
-
-backward = function(matrix[double] X, double lambda)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for an L2 regularization function.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (any, any).
-   *  - lambda: Regularization strength.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of same shape as `X`.
-   */
-  dX = lambda * X
-}
-
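
As with the L1 regularizer above, a small illustrative sketch with a
hypothetical weight matrix and lambda value:

  source("nn/layers/l2_reg.dml") as l2_reg

  lambda = 0.01  # typical regularization strength, per the docstring above
  W = rand(rows=64, cols=10, min=-0.1, max=0.1, pdf="uniform")  # hypothetical weights

  reg_loss = l2_reg::forward(W, lambda)   # scalar penalty: 0.5 * lambda * sum(W^2)
  dW_reg = l2_reg::backward(W, lambda)    # gradient: lambda * W (weight decay)
  print("L2 regularization loss: " + reg_loss)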

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml b/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
deleted file mode 100644
index 15914f7..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
+++ /dev/null
@@ -1,76 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Log loss function.
- */
-
-forward = function(matrix[double] pred, matrix[double] y)
-    return (double loss) {
-  /*
-   * Computes the forward pass for a log loss function.
-   *
-   *   ```
-   *   L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i)
-   *   L = (1/N) sum(L_i) for i=1 to N
-   *   ```
-   *
-   * In these equations, `L` is the total loss, `L_i` is the loss for
-   * example `i`, `y_i` is the binary target, `pred_i` is the predicted
-   * probability of the true class (i.e. `y=1`), and `N` is the number of examples.
-   *
-   * This can be interpreted as the negative log-likelihood assuming
-   * a Bernoulli distribution.
-   *
-   * Inputs:
-   *  - pred: Predictions, of shape (N, 1).
-   *      Predictions should be probabilities of the true
-   *      class (i.e. probability of `y=1`).
-   *  - y: Targets, of shape (N, 1).
-   *      Targets should be binary in the set {0, 1}.
-   *
-   * Outputs:
-   *  - loss: Average loss.
-   */
-  N = nrow(y)
-  losses = -y*log(pred) - (1-y)*log(1-pred)
-  loss = sum(losses) / N
-}
-
-backward = function(matrix[double] pred, matrix[double] y)
-    return (matrix[double] dpred) {
-  /*
-   * Computes the backward pass for a log loss function.
-   *
-   * Inputs:
-   *  - pred: Predictions, of shape (N, 1).
-   *      Predictions should be probabilities of the true
-   *      class (i.e. probability of `y=1`).
-   *  - y: Targets, of shape (N, 1).
-   *      Targets should be binary in the set {0, 1}.
-   *
-   * Outputs:
-   *  - dpred: Gradient wrt `pred`, of shape (N, 1).
-   */
-  N = nrow(y)
-  dpred = (1/N) * (pred-y) / (pred*(1-pred))
-}
-
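
A minimal sketch of using this loss on probabilities produced by the library's
sigmoid layer (sourced elsewhere in this commit); the random scores and binary
targets here are purely illustrative:

  source("nn/layers/sigmoid.dml") as sigmoid
  source("nn/layers/log_loss.dml") as log_loss

  # Illustrative binary classification batch: N=4 examples
  scores = rand(rows=4, cols=1, min=-2, max=2, pdf="uniform")         # raw scores
  pred = sigmoid::forward(scores)                                     # probabilities of y=1, in (0, 1)
  y = round(rand(rows=4, cols=1, min=0, max=1, pdf="uniform"))        # binary targets in {0, 1}

  loss = log_loss::forward(pred, y)
  dpred = log_loss::backward(pred, y)
  print("log loss: " + loss)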

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/lstm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/lstm.dml b/scripts/staging/SystemML-NN/nn/layers/lstm.dml
deleted file mode 100644
index a75add4..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/lstm.dml
+++ /dev/null
@@ -1,260 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * LSTM layer.
- */
-source("nn/layers/sigmoid.dml") as sigmoid
-source("nn/layers/tanh.dml") as tanh
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
-                   boolean return_sequences, matrix[double] out0, matrix[double] c0)
-    return (matrix[double] out, matrix[double] c,
-            matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog) {
-  /*
-   * Computes the forward pass for an LSTM layer with M neurons.
-   * The input data has N sequences of T examples, each with D features.
-   *
-   * In an LSTM, an internal cell state is maintained, additive
-   * interactions operate over the cell state at each timestep, and
-   * some amount of this cell state is exposed as output at each
-   * timestep.  Additionally, the output of the previous timestep is fed
-   * back in as an additional input at the current timestep.
-   *
-   * Reference:
-   *  - Long Short-Term Memory, Hochreiter & Schmidhuber, 1997
-   *    - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, T*D).
-   *  - W: Weights, of shape (D+M, 4M).
-   *  - b: Biases, of shape (1, 4M).
-   *  - T: Length of example sequences (number of timesteps).
-   *  - D: Dimensionality of the input features (number of features).
-   *  - return_sequences: Whether to return `out` at all timesteps,
-   *      or just for the final timestep.
-   *  - out0: Outputs from previous timestep, of shape (N, M).
-   *      Note: This is *optional* and could just be an empty matrix.
-   *  - c0: Initial cell state, of shape (N, M).
-   *      Note: This is *optional* and could just be an empty matrix.
-   *
-   * Outputs:
-   *  - out: If `return_sequences` is True, outputs for all timesteps,
-   *      of shape (N, T*M).  Else, outputs for the final timestep, of
-   *      shape (N, M).
-   *  - c: Cell state for final timestep, of shape (N, M).
-   *  - cache_out: Cache of outputs, of shape (T, N*M).
-   *      Note: This is used for performance during training.
-   *  - cache_c: Cache of cell state, of shape (T, N*M).
-   *      Note: This is used for performance during training.
-   *  - cache_ifog: Cache of intermediate values, of shape (T, N*4M).
-   *      Note: This is used for performance during training.
-   */
-  N = nrow(X)
-  M = as.integer(ncol(W)/4)
-  out_prev = out0
-  c_prev = c0
-  c = c_prev
-  if (return_sequences) {
-    out = matrix(0, rows=N, cols=T*M)
-  }
-  else {
-    out = matrix(0, rows=N, cols=M)
-  }
-  # caches to be used during the backward pass for performance
-  cache_out = matrix(0, rows=T, cols=N*M)
-  cache_c = matrix(0, rows=T, cols=N*M)
-  cache_ifog = matrix(0, rows=T, cols=N*4*M)
-
-  for (t in 1:T) {  # each timestep
-    X_t = X[,(t-1)*D+1:t*D]  # shape (N, D)
-    input = cbind(X_t, out_prev)  # shape (N, D+M)
-    ifog = input %*% W + b  # input, forget, output, and g gates; shape (N, 4M)
-    tmp = sigmoid::forward(ifog[,1:3*M])  # i,f,o gates squashed with sigmoid
-    ifog[,1:3*M] = tmp
-    tmp = tanh::forward(ifog[,3*M+1:4*M])  # g gate squashed with tanh
-    ifog[,3*M+1:4*M] = tmp
-    # c_t = f*prev_c + i*g
-    c = ifog[,M+1:2*M]*c_prev + ifog[,1:M]*ifog[,3*M+1:4*M]  # shape (N, M)
-    # out_t = o*tanh(c)
-    tmp = tanh::forward(c)
-    out_t = ifog[,2*M+1:3*M] * tmp  # shape (N, M)
-
-    # store
-    if (return_sequences) {
-      out[,(t-1)*M+1:t*M] = out_t
-    }
-    else {
-      out = out_t
-    }
-    out_prev = out_t
-    c_prev = c
-    cache_out[t,] = matrix(out_t, rows=1, cols=N*M)  # reshape
-    cache_c[t,] = matrix(c, rows=1, cols=N*M)  # reshape
-    cache_ifog[t,] = matrix(ifog, rows=1, cols=N*4*M)  # reshape
-  }
-}
-
-backward = function(matrix[double] dout, matrix[double] dc,
-                    matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
-                    boolean given_sequences, matrix[double] out0, matrix[double] c0,
-                    matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog)
-    return (matrix[double] dX, matrix[double] dW, matrix[double] db,
-            matrix[double] dout0, matrix[double] dc0) {
-  /*
-   * Computes the backward pass for an LSTM layer with M neurons.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out`.  If `given_sequences` is `True`,
-   *      contains gradients on outputs for all timesteps, of
-   *      shape (N, T*M). Else, contains the gradient on the output
-   *      for the final timestep, of shape (N, M).
-   *  - dc: Gradient wrt `c` (from later in time), of shape (N, M).
-   *      This would come from later in time if the cell state was used
-   *      downstream as the initial cell state for another LSTM layer.
-   *      Typically, this would be used when a sequence was cut at
-   *      timestep `T` and then continued in the next batch.  If `c`
-   *      was not used downstream, then `dc` would be an empty matrix.
-   *  - X: Inputs, of shape (N, T*D).
-   *  - W: Weights, of shape (D+M, 4M).
-   *  - b: Biases, of shape (1, 4M).
-   *  - T: Length of example sequences (number of timesteps).
-   *  - D: Dimensionality of the input features.
-   *  - given_sequences: Whether `dout` is for all timesteps,
-   *      or just for the final timestep.  This is based on whether
-   *      `return_sequences` was true in the forward pass.
-   *  - out0: Outputs from previous timestep, of shape (N, M).
-   *      Note: This is *optional* and could just be an empty matrix.
-   *  - c0: Initial cell state, of shape (N, M).
-   *      Note: This is *optional* and could just be an empty matrix.
-   *  - cache_out: Cache of outputs, of shape (T, N*M).
-   *      Note: This is used for performance during training.
-   *  - cache_c: Cache of cell state, of shape (T, N*M).
-   *      Note: This is used for performance during training.
-   *  - cache_ifog: Cache of intermediate values, of shape (T, N*4*M).
-   *      Note: This is used for performance during training.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, T*D).
-   *  - dW: Gradient wrt `W`, of shape (D+M, 4M).
-   *  - db: Gradient wrt `b`, of shape (1, 4M).
-   *  - dout0: Gradient wrt `out0`, of shape (N, M).
-   *  - dc0: Gradient wrt `c0`, of shape (N, M).
-   */
-  N = nrow(X)
-  M = as.integer(ncol(W)/4)
-  dX = matrix(0, rows=N, cols=T*D)
-  dW = matrix(0, rows=D+M, cols=4*M)
-  db = matrix(0, rows=1, cols=4*M)
-  dout0 = matrix(0, rows=N, cols=M)
-  dc0 = matrix(0, rows=N, cols=M)
-  dct = dc
-  if (!given_sequences) {
-    # only given dout for output at final timestep, so prepend empty douts for all other timesteps
-    dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout)  # shape (N, T*M)
-  }
-
-  t = T
-  for (iter in 1:T) {  # each timestep in reverse order
-    X_t = X[,(t-1)*D+1:t*D]  # shape (N, D)
-    dout_t = dout[,(t-1)*M+1:t*M]  # shape (N, M)
-    out_t = matrix(cache_out[t,], rows=N, cols=M)  # shape (N, M)
-    ct = matrix(cache_c[t,], rows=N, cols=M)  # shape (N, M)
-    if (t == 1) {
-      out_prev = out0  # shape (N, M)
-      c_prev = c0  # shape (N, M)
-    }
-    else {
-      out_prev = matrix(cache_out[t-1,], rows=N, cols=M)  # shape (N, M)
-      c_prev = matrix(cache_c[t-1,], rows=N, cols=M)  # shape (N, M)
-    }
-    input = cbind(X_t, out_prev)  # shape (N, D+M)
-    ifog = matrix(cache_ifog[t,], rows=N, cols=4*M)
-    i = ifog[,1:M]  # input gate, shape (N, M)
-    f = ifog[,M+1:2*M]  # forget gate, shape (N, M)
-    o = ifog[,2*M+1:3*M]  # output gate, shape (N, M)
-    g = ifog[,3*M+1:4*M]  # g gate, shape (N, M)
-
-    tmp = tanh::backward(dout_t, ct)
-    dct = dct + o*tmp  # shape (N, M)
-    tmp = tanh::forward(ct)
-    do = tmp * dout_t  # output gate, shape (N, M)
-    df = c_prev * dct  # forget gate, shape (N, M)
-    dc_prev = f * dct  # shape (N, M)
-    di = g * dct  # input gate, shape (N, M)
-    dg = i * dct  # g gate, shape (N, M)
-
-    di_raw = i * (1-i) * di
-    df_raw = f * (1-f) * df
-    do_raw = o * (1-o) * do
-    dg_raw = (1-g^2) * dg
-    difog_raw = cbind(di_raw, cbind(df_raw, cbind(do_raw, dg_raw)))  # shape (N, 4M)
-
-    dW = dW + t(input) %*% difog_raw  # shape (D+M, 4M)
-    db = db + colSums(difog_raw)  # shape (1, 4M)
-    dinput = difog_raw %*% t(W)  # shape (N, D+M)
-    dX[,(t-1)*D+1:t*D] = dinput[,1:D]
-    dout_prev = dinput[,D+1:D+M]  # shape (N, M)
-    if (t == 1) {
-      dout0 = dout_prev  # shape (N, M)
-      dc0 = dc_prev  # shape (N, M)
-    }
-    else {
-      dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev  # shape (N, M)
-      dct = dc_prev  # shape (N, M)
-    }
-    t = t - 1
-  }
-}
-
-init = function(int N, int D, int M)
-    return (matrix[double] W, matrix[double] b, matrix[double] out0, matrix[double] c0) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * We use the Glorot uniform heuristic which limits the magnification
-   * of inputs/gradients during forward/backward passes by scaling
-   * uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
-   *  - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
-   *
-   * Inputs:
-   *  - N: Number of examples in batch.
-   *  - D: Dimensionality of the input features (number of features).
-   *  - M: Number of neurons in this layer.
-   *
-   * Outputs:
-   *  - W: Weights, of shape (D+M, 4M).
-   *  - b: Biases, of shape (1, 4M).
-   *  - out0: All-zero previous timestep output matrix, of shape (N, M).
-   *  - c0: All-zero initial cell state matrix, of shape (N, M).
-   */
-  fan_in = D+M
-  fan_out = 4*M
-  scale = sqrt(6/(fan_in+fan_out))
-  W = rand(rows=D+M, cols=4*M, min=-scale, max=scale, pdf="uniform")
-  b = matrix(0, rows=1, cols=4*M)
-  out0 = matrix(0, rows=N, cols=M)
-  c0 = matrix(0, rows=N, cols=M)
-}
-
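
A minimal end-to-end sketch of the init/forward/backward calls for this layer;
the dimensions, random inputs, and random upstream gradients are illustrative
assumptions only:

  source("nn/layers/lstm.dml") as lstm

  # Illustrative sizes: N sequences, T timesteps, D features, M neurons
  N = 8
  T = 5
  D = 16
  M = 32
  X = rand(rows=N, cols=T*D, min=-1, max=1, pdf="uniform")

  [W, b, out0, c0] = lstm::init(N, D, M)

  # Forward pass, returning the output at every timestep
  [out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, TRUE, out0, c0)

  # Backward pass given upstream gradients (random here, purely illustrative)
  dout = rand(rows=N, cols=T*M, min=-1, max=1, pdf="uniform")
  dc = matrix(0, rows=N, cols=M)  # cell state not used downstream
  [dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, TRUE, out0, c0,
                                            cache_out, cache_c, cache_ifog)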

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
deleted file mode 100644
index fba1a4c..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
+++ /dev/null
@@ -1,159 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Max Pooling layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * This implementation uses `im2col` internally for each image to
-   * extract local image regions (patches) of each channel slice into
-   * columns, and then performs max pooling over the patches to compute
-   * the output maps.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
-  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-  pad_value = -1/0  # in max pooling we pad with -infinity
-
-  # Create output volume
-  out = matrix(0, rows=N, cols=C*Hout*Wout)
-
-  # Max pooling - im2col implementation
-  parfor (n in 1:N) {  # all examples
-    img = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
-
-    if (padh > 0 | padw > 0) {
-      # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
-      img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
-    }
-
-    img_maxes = matrix(0, rows=C, cols=Hout*Wout)  # zeros
-    parfor (c in 1:C) {  # all channels
-      # Extract local image slice patches into columns with im2col, of shape (Hf*Wf, Hout*Wout)
-      img_slice_cols = util::im2col(img[c,], Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
-
-      # Max pooling on patches
-      img_maxes[c,] = colMaxs(img_slice_cols)
-    }
-
-    out[n,] = matrix(img_maxes, rows=1, cols=C*Hout*Wout)
-  }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   */
-  N = nrow(X)
-  pad_value = -1/0  # in max pooling we pad with -infinity
-
-  # Create gradient volume
-  dX = matrix(0, rows=N, cols=C*Hin*Win)
-
-  # Gradient of max pooling
-  parfor (n in 1:N, check=0) {  # all examples
-    img = matrix(X[n,], rows=C, cols=Hin*Win)
-    if (padh > 0 | padw > 0) {
-      # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
-      img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
-    }
-
-    dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
-    parfor (c in 1:C, check=0) {  # all channels
-      img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
-      for (hout in 1:Hout, check=0) {  # all output rows
-        hin = (hout-1)*strideh + 1
-        for (wout in 1:Wout) {  # all output columns
-          win = (wout-1)*stridew + 1
-          img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
-          max_val_ind = img_slice_patch == max(img_slice_patch)  # max value indicator matrix
-          # gradient passes through only for the max value(s) in this patch
-          dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
-          dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
-                                                   + dimg_slice_patch
-        }
-      }
-      dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
-    }
-
-    if (padh > 0 | padw > 0) {
-      # Unpad image gradient
-      dimg = util::unpad_image(dimg, Hin, Win, padh, padw)  # shape (C, Hin*Win)
-    }
-    dX[n,] = matrix(dimg, rows=1, cols=C*Hin*Win)
-  }
-}
-
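
A minimal sketch of 2x2 max pooling with stride 2 over a small random batch;
all dimensions here are illustrative:

  source("nn/layers/max_pool2d.dml") as max_pool2d

  # Illustrative batch: N=2 images, C=3 channels, 8x8 spatial size
  N = 2
  C = 3
  Hin = 8
  Win = 8
  X = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform")

  # 2x2 pooling, stride 2, no padding
  [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)

  # Backward pass with illustrative upstream gradients
  dout = rand(rows=N, cols=C*Hout*Wout, min=-1, max=1, pdf="uniform")
  dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, 2, 2, 2, 2, 0, 0)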

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
deleted file mode 100644
index 880f818..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
+++ /dev/null
@@ -1,103 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Max Pooling layer.
- *
- * This implementation uses a built-in operator for higher performance.
- */
-
-forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * This implementation uses a built-in operator for higher
-   * performance.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
-  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
-  # Max pooling - built-in implementation
-  out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
-                 stride=[strideh,stridew], padding=[padh,padw])
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   */
-  N = nrow(X)
-
-  # Gradient of max pooling
-  dX = max_pool_backward(X, dout, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
-                         stride=[strideh,stridew], padding=[padh,padw])
-}
-
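
Since this variant exposes the same forward/backward signatures as the
im2col-based layer above, it can be sourced as a drop-in replacement; a brief
illustrative sketch:

  source("nn/layers/max_pool2d_builtin.dml") as max_pool2d

  N = 2
  C = 3
  Hin = 8
  Win = 8
  X = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform")
  [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)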

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/relu.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/relu.dml b/scripts/staging/SystemML-NN/nn/layers/relu.dml
deleted file mode 100644
index 93a6e90..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/relu.dml
+++ /dev/null
@@ -1,59 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Rectified Linear Unit (ReLU) nonlinearity layer.
- */
-
-forward = function(matrix[double] X)
-    return (matrix[double] out) {
-  /*
-   * Computes the forward pass for a ReLU nonlinearity layer.
-   *
-   * Performs an element-wise evaluation of `f(input) = max(0, input)`.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (any, any).
-   *
-   * Outputs:
-   *  - out: Outputs, of same shape as `X`.
-   */
-  out = max(X, 0)
-}
-
-backward = function(matrix[double] dout, matrix[double] X)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for a ReLU nonlinearity layer.
-   *
-   * Essentially performs a pass-through of the upstream gradient
-   * for cells > 0.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
-   *  - X: Previous input data matrix, of shape (any, any).
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of same shape as `X`.
-   */
-   dX = (X > 0) * dout
-}
-
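
Finally, a minimal sketch of the ReLU forward/backward calls on random data
(purely illustrative):

  source("nn/layers/relu.dml") as relu

  X = rand(rows=4, cols=5, min=-1, max=1, pdf="uniform")     # illustrative inputs
  out = relu::forward(X)                                     # element-wise max(0, X)
  dout = rand(rows=4, cols=5, min=-1, max=1, pdf="uniform")  # illustrative upstream gradient
  dX = relu::backward(dout, X)                               # gradient passes through only where X > 0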