Posted to commits@systemml.apache.org by du...@apache.org on 2017/04/26 21:42:31 UTC

[05/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn` library to `scripts/nn`

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/rnn.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/rnn.dml b/scripts/staging/SystemML-NN/nn/layers/rnn.dml
deleted file mode 100644
index 3c6faae..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/rnn.dml
+++ /dev/null
@@ -1,183 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Simple (Vanilla) RNN layer.
- */
-source("nn/layers/tanh.dml") as tanh
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
-                   boolean return_sequences, matrix[double] out0)
-    return (matrix[double] out, matrix[double] cache_out) {
-  /*
-   * Computes the forward pass for a simple RNN layer with M neurons.
-   * The input data has N examples, each a sequence of T timesteps with D features.
-   *
-   * In a simple RNN, the output of the previous timestep is fed back
-   * in as an additional input at the current timestep.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, T*D).
-   *  - W: Weights, of shape (D+M, M).
-   *  - b: Biases, of shape (1, M).
-   *  - T: Length of example sequences (number of timesteps).
-   *  - D: Dimensionality of the input features (number of features).
-   *  - return_sequences: Whether to return `out` at all timesteps,
-   *      or just for the final timestep.
-   *  - out0: Output matrix from previous timestep, of shape (N, M).
-   *      Note: This is *optional* and could just be an empty matrix.
-   *
-   * Outputs:
-   *  - out: If `return_sequences` is True, outputs for all timesteps,
-   *      of shape (N, T*M).  Else, outputs for the final timestep, of
-   *      shape (N, M).
-   *  - cache_out: Cache of outputs, of shape (T, N*M).
-   *      Note: This is used for performance during training.
-   */
-  N = nrow(X)
-  M = ncol(W)
-  out_prev = out0
-  if (return_sequences) {
-    out = matrix(0, rows=N, cols=T*M)
-  }
-  else {
-    out = matrix(0, rows=N, cols=M)
-  }
-  # caches to be used during the backward pass for performance
-  cache_out = matrix(0, rows=T, cols=N*M)
-
-  for (t in 1:T) {  # each timestep
-    X_t = X[,(t-1)*D+1:t*D]  # shape (N, D)
-    input = cbind(X_t, out_prev)  # shape (N, D+M)
-    out_t = tanh::forward(input %*% W + b)  # shape (N, M)
-    # store
-    if (return_sequences) {
-      out[,(t-1)*M+1:t*M] = out_t
-    }
-    else {
-      out = out_t
-    }
-    out_prev = out_t
-    cache_out[t,] = matrix(out_t, rows=1, cols=N*M)  # reshape
-  }
-}
-
-backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, matrix[double] b,
-                    int T, int D, boolean given_sequences, matrix[double] out0,
-                    matrix[double] cache_out)
-    return (matrix[double] dX, matrix[double] dW, matrix[double] db, matrix[double] dout0) {
-  /*
-   * Computes the backward pass for a simple RNN layer with M neurons.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream.  If `given_sequences`
-   *      is True, contains gradients on outputs for all timesteps,
-   *      of shape (N, T*M).  Else, contains gradient on output for
-   *      the final timestep, of shape (N, M).
-   *  - X: Inputs, of shape (N, T*D).
-   *  - W: Weights, of shape (D+M, M).
-   *  - b: Biases, of shape (1, M).
-   *  - T: Length of example sequences (number of timesteps).
-   *  - D: Dimensionality of the input features (number of features).
-   *  - given_sequences: Whether `dout` is for all timesteps,
-   *      or just for the final timestep.  This is based on whether
-   *      `return_sequences` was true in the forward pass.
-   *  - out0: Output matrix from previous timestep, of shape (N, M).
-   *      Note: This is *optional* and could just be an empty matrix.
-   *  - cache_out: Cache of outputs, of shape (T, N*M).
-   *      Note: This is used for performance during training.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, T*D).
-   *  - dW: Gradient wrt `W`, of shape (D+M, M).
-   *  - db: Gradient wrt `b`, of shape (1, M).
-   *  - dout0: Gradient wrt `out0`, of shape (N, M).
-   */
-  N = nrow(X)
-  M = ncol(W)
-  dX = matrix(0, rows=N, cols=T*D)
-  dW = matrix(0, rows=D+M, cols=M)
-  db = matrix(0, rows=1, cols=M)
-  dout0 = matrix(0, rows=N, cols=M)
-  if (!given_sequences) {
-    # only given dout for output at final timestep, so prepend empty douts for all other timesteps
-    dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout)  # shape (N, T*M)
-  }
-
-  t = T
-  for (iter in 1:T) {  # each timestep in reverse order
-    X_t = X[,(t-1)*D+1:t*D]  # shape (N, D)
-    dout_t = dout[,(t-1)*M+1:t*M]  # shape (N, M)
-    out_t = matrix(cache_out[t,], rows=N, cols=M)  # shape (N, M)
-    if (t == 1) {
-      out_prev = out0  # shape (N, M)
-    }
-    else {
-      out_prev = matrix(cache_out[t-1,], rows=N, cols=M)  # shape (N, M)
-    }
-    input = cbind(X_t, out_prev)  # shape (N, D+M)
-    dout_t_raw = (1-out_t^2) * dout_t  # into tanh, shape (N, M)
-    dW = dW + t(input) %*% dout_t_raw  # shape (D+M, M)
-    db = db + colSums(dout_t_raw)  # shape (1, M)
-    dinput = dout_t_raw %*% t(W)  # shape (N, D+M)
-    dX[,(t-1)*D+1:t*D] = dinput[,1:D]
-    dout_prev = dinput[,D+1:D+M]  # shape (N, M)
-    if (t == 1) {
-      dout0 = dout_prev  # shape (N, M)
-    }
-    else {
-      dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev  # shape (N, M)
-    }
-    t = t - 1
-  }
-}
-
-init = function(int N, int D, int M)
-    return (matrix[double] W, matrix[double] b, matrix[double] out0) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * We use the Glorot uniform heuristic which limits the magnification
-   * of inputs/gradients during forward/backward passes by scaling
-   * uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
-   *  - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
-   *
-   * Inputs:
-   *  - N: Number of examples in batch.
-   *  - D: Dimensionality of the input features (number of features).
-   *  - M: Number of neurons in this layer.
-   *
-   * Outputs:
-   *  - W: Weights, of shape (D+M, M).
-   *  - b: Biases, of shape (1, M).
-   *  - out0: Empty previous timestep output matrix, of shape (N, M).
-   */
-  fan_in = D+M
-  fan_out = M
-  scale = sqrt(6/(fan_in+fan_out))
-  W = rand(rows=D+M, cols=M, min=-scale, max=scale, pdf="uniform")
-  b = matrix(0, rows=1, cols=M)
-  out0 = matrix(0, rows=N, cols=M)
-}
-
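
For context, a minimal DML sketch of driving this layer end-to-end, assuming the `nn` directory is on the script load path; the toy shapes and the choice of `return_sequences=TRUE` below are purely illustrative:

source("nn/layers/rnn.dml") as rnn

# Toy dimensions: N examples, T timesteps, D features per timestep, M neurons.
N = 4
T = 5
D = 3
M = 8
X = rand(rows=N, cols=T*D)

[W, b, out0] = rnn::init(N, D, M)
[out, cache_out] = rnn::forward(X, W, b, T, D, TRUE, out0)  # out: (N, T*M)

# Backward pass with a dummy upstream gradient on all timesteps.
dout = rand(rows=N, cols=T*M)
[dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, TRUE, out0, cache_out)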

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml b/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml
deleted file mode 100644
index 7e162a3..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml
+++ /dev/null
@@ -1,95 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 1D Scale & Shift layer.
- */
-
-forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta)
-    return (matrix[double] out) {
-  /*
-   * Computes the forward pass for a 1D scale & shift layer. The input
-   * data has N examples, each with D features.
-   *
-   * A 1D scale & shift layer introduces learnable parameters
-   * (gamma, beta) to scale and shift the input on a per-feature basis.
-   *
-   *   `y = x*gamma + beta`
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, D).
-   *  - gamma: Scale parameters, of shape (1, D).
-   *  - beta: Shift parameters, of shape (1, D).
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, D).
-   */
-  # Scale and shift
-  out = X*gamma + beta  # shape (N, D)
-}
-
-backward = function(matrix[double] dout, matrix[double] out,
-                    matrix[double] X, matrix[double] gamma, matrix[double] beta)
-      return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
-  /*
-   * Computes the backward pass for a 1D scale & shift layer.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of shape (N, D).
-   *  - out: Outputs from the forward pass, of shape (N, D).
-   *  - X: Inputs, of shape (N, D).
-   *  - gamma: Scale parameters, of shape (1, D).
-   *  - beta: Shift parameters, of shape (1, D).
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, D).
-   *  - dgamma: Gradient wrt `gamma`, of shape (1, D).
-   *  - dbeta: Gradient wrt `beta`, of shape (1, D).
-   *
-   */
-  # Compute gradients during training
-  dgamma = colSums(dout*X)  # shape (1, D)
-  dbeta = colSums(dout)  # shape (1, D)
-  dX = dout * gamma  # shape (N, D)
-}
-
-init = function(int D)
-    return (matrix[double] gamma, matrix[double] beta) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * By default, we initialize to an identity function, with a scale
-   * filler of `1`, and a shift filler of `0`.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - D: Dimensionality of the input features (number of features).
-   *
-   * Outputs:
-   *  - gamma: Scale parameters, of shape (1, D).
-   *  - beta: Shift parameters, of shape (1, D).
-   */
-   gamma = matrix(1, rows=1, cols=D)
-   beta = matrix(0, rows=1, cols=D)
-}
-
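
A minimal usage sketch for this layer (toy shapes, illustrative values), assuming the `nn` directory is on the script load path:

source("nn/layers/scale_shift1d.dml") as scale_shift1d

N = 4
D = 6
X = rand(rows=N, cols=D)

[gamma, beta] = scale_shift1d::init(D)        # identity init: gamma=1, beta=0
out = scale_shift1d::forward(X, gamma, beta)

dout = rand(rows=N, cols=D)                   # dummy upstream gradient
[dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta)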

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml b/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml
deleted file mode 100644
index 79c884a..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml
+++ /dev/null
@@ -1,107 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Scale & Shift layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
-                   int C, int Hin, int Win)
-    return (matrix[double] out) {
-  /*
-   * Computes the forward pass for a 2D scale & shift layer.  The input
-   * data has N examples, each represented as a 3D volume unrolled into
-   * a single vector.
-   *
-   * A 2D scale & shift layer introduces learnable parameters
-   * (gamma, beta) to scale and shift the input on a per-channel basis.
-   *
-   *   `y = x*gamma + beta`
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - gamma: Scale parameters, of shape (C, 1).
-   *  - beta: Shift parameters, of shape (C, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, C*Hin*Win).
-   */
-  # Scale and shift
-  scaled = bias_multiply(X, gamma)  # shape (N, C*Hin*Win)
-  out = bias_add(scaled, beta)  # shape (N, C*Hin*Win)
-}
-
-backward = function(matrix[double] dout, matrix[double] out,
-                    matrix[double] X, matrix[double] gamma, matrix[double] beta,
-                    int C, int Hin, int Win)
-      return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
-  /*
-   * Computes the backward pass for a 2D scale & shift layer.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
-   *  - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
-   *  - X: Input data matrix to the forward pass, of
-   *      shape (N, C*Hin*Win).
-   *  - gamma: Scale parameters, of shape (C, 1).
-   *  - beta: Shift parameters, of shape (C, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   *  - dgamma: Gradient wrt `gamma`, of shape (C, 1).
-   *  - dbeta: Gradient wrt `beta`, of shape (C, 1).
-   *
-   */
-  # Compute gradients during training
-  dgamma = util::channel_sums(dout*X, C, Hin, Win)  # shape (C, 1)
-  dbeta = util::channel_sums(dout, C, Hin, Win)  # shape (C, 1)
-  dX = bias_multiply(dout, gamma)  # shape (N, C*Hin*Win)
-}
-
-init = function(int C)
-    return (matrix[double] gamma, matrix[double] beta) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * By default, we initialize to an identity function, with a scale
-   * filler of `1`, and a shift filler of `0`.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - C: Number of input channels (dimensionality of input depth).
-   *
-   * Outputs:
-   *  - gamma: Scale parameters, of shape (C, 1).
-   *  - beta: Shift parameters, of shape (C, 1).
-   */
-   gamma = matrix(1, rows=C, cols=1)
-   beta = matrix(0, rows=C, cols=1)
-}
-
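
The 2D variant applies the same idea per channel rather than per feature. A minimal sketch with toy shapes (illustrative only):

source("nn/layers/scale_shift2d.dml") as scale_shift2d

N = 2
C = 3
Hin = 4
Win = 4
X = rand(rows=N, cols=C*Hin*Win)

[gamma, beta] = scale_shift2d::init(C)        # one (gamma, beta) pair per channel
out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)

dout = rand(rows=N, cols=C*Hin*Win)           # dummy upstream gradient
[dX, dgamma, dbeta] = scale_shift2d::backward(dout, out, X, gamma, beta, C, Hin, Win)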

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml b/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
deleted file mode 100644
index 2d85adc..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml
+++ /dev/null
@@ -1,62 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Sigmoid nonlinearity layer.
- */
-
-forward = function(matrix[double] X)
-    return (matrix[double] out) {
-  /*
-   * Computes the forward pass for a sigmoid nonlinearity layer.
-   *
-   *   `sigmoid(x) = 1 / (1 + e^-x)`
-   *
-   * If `X` contains a single feature column, the output of a sigmoid
-   * layer can be interpreted as a predicted probability of a true
-   * class when paired with a log loss function in a binary
-   * classification problem.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (any, any).
-   *
-   * Outputs:
-   *  - out: Outputs, of same shape as `X`.
-   */
-  out = 1 / (1+exp(-X))
-}
-
-backward = function(matrix[double] dout, matrix[double] X)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for a sigmoid nonlinearity layer.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
-   *  - X: Inputs, of shape (any, any).
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of same shape as `X`.
-   */
-  out = 1 / (1+exp(-X))
-  dX = out * (1-out) * dout
-}
-
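
A minimal sketch of the forward/backward pair (toy shapes; the `nn` directory is assumed to be on the script load path). With an upstream gradient of ones, the backward result is simply out*(1-out):

source("nn/layers/sigmoid.dml") as sigmoid

X = rand(rows=4, cols=5, min=-3, max=3)
out = sigmoid::forward(X)

dout = matrix(1, rows=4, cols=5)  # dummy upstream gradient of ones
dX = sigmoid::backward(dout, X)   # here equal to out*(1-out) elementwise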

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/softmax.dml b/scripts/staging/SystemML-NN/nn/layers/softmax.dml
deleted file mode 100644
index 68a7bc7..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/softmax.dml
+++ /dev/null
@@ -1,87 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Softmax classifier layer.
- */
-
-forward = function(matrix[double] scores)
-    return (matrix[double] probs) {
-  /*
-   * Computes the forward pass for a softmax classifier.  The inputs
-   * are interpreted as unnormalized, log-probabilities for each of
-   * N examples, and the softmax function transforms them to normalized
-   * probabilities.
-   *
-   * This can be interpreted as a generalization of the sigmoid
-   * function to multiple classes.
-   *
-   *   `probs_ij = e^scores_ij / sum(e^scores_i)`
-   *
-   * Inputs:
-   *  - scores: Inputs, of shape (N, D).
-   *
-   * Outputs:
-   *  - probs: Outputs, of shape (N, D).
-   */
-  # For numerical stability, we subtract the max score of an example from all scores for that
-  # example.  This is equivalent to the original formulation:
-  # e^scores_i / sum(e^scores_i) == C*e^scores_i / C*sum(e^scores_i)
-   #                              == e^(scores_i+log(C)) / sum(e^(scores_i+log(C)))
-   # set log(C) = -max(scores_i):
-   #                              == e^(scores_i-max(scores_i)) / sum(e^(scores_i-max(scores_i)))
-  scores = scores - rowMaxs(scores)  # numerical stability
-  unnorm_probs = exp(scores)  # unnormalized probabilities
-  probs = unnorm_probs / rowSums(unnorm_probs)  # normalized probabilities
-}
-
-backward = function(matrix[double] dprobs, matrix[double] scores)
-    return (matrix[double] dscores) {
-  /*
-   * Computes the backward pass for a softmax classifier.
-   *
-   * Note that dscores_ij has multiple source branches:
-   *
-   *   ```
-   *   dprobs_ij/dscores_ij = probs_ij * (1 - probs_ij)
-   *   dprobs_ik/dscores_ij = -probs_ik * probs_ij, for all k != j
-   *
-   *   dloss/dscores_ij =
-   *      (dloss/dprobs_ij * dprobs_ij/dscores_ij)
-   *      + sum_{k!=j}(dloss/dprobs_ik * dprobs_ik/dscores_ij)
-   *   ```
-   *
-   * Inputs:
-   *  - dprobs: Gradient wrt `probs` from upstream, of shape (N, D).
-   *  - scores: Inputs, of shape (N, D).
-   *
-   * Outputs:
-   *  - dscores: Gradient wrt `scores`, of shape (N, D).
-   */
-  scores = scores - rowMaxs(scores)  # numerical stability
-  unnorm_probs = exp(scores)  # unnormalized probabilities
-  probs = unnorm_probs / rowSums(unnorm_probs)  # normalized probabilities
-  # After some cancellation:
-  # dscores = dprobs*probs - probs*rowSums(dprobs*probs)
-  dtemp = dprobs * probs
-  dscores = dtemp - probs*rowSums(dtemp)
-}
-
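
A minimal sketch (toy shapes, illustrative values) showing that the forward pass produces rows that sum to 1, and how the backward pass is invoked:

source("nn/layers/softmax.dml") as softmax

scores = rand(rows=3, cols=4, min=-2, max=2)
probs = softmax::forward(scores)
print(toString(rowSums(probs)))  # each row should sum to 1

dprobs = rand(rows=3, cols=4)    # dummy upstream gradient
dscores = softmax::backward(dprobs, scores)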

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/tanh.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/tanh.dml b/scripts/staging/SystemML-NN/nn/layers/tanh.dml
deleted file mode 100644
index d849d70..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/tanh.dml
+++ /dev/null
@@ -1,65 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Tanh nonlinearity layer.
- */
-source("nn/layers/sigmoid.dml") as sigmoid
-
-forward = function(matrix[double] X)
-    return (matrix[double] out) {
-  /*
-   * Computes the forward pass for a tanh nonlinearity layer.
-   *
-   *   ```
-   *   tanh(x) = (e^x - e^-x) / (e^x + e^-x)
-   *           = 2 * sigmoid(2x) - 1
-   *   ```
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (any, any).
-   *
-   * Outputs:
-   *  - out: Outputs, of same shape as `X`.
-   */
-  # out = (exp(X) - exp(-X)) / (exp(X) + exp(-X))
-  # Simplification of the above formulation to use the sigmoid function:
-  sigma2X = sigmoid::forward(2*X)
-  out = 2*sigma2X - 1
-}
-
-backward = function(matrix[double] dout, matrix[double] X)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for a tanh nonlinearity layer.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
-   *  - X: Inputs, of shape (any, any).
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of same shape as `X`.
-   */
-  sigma2X = sigmoid::forward(2*X)
-  out = 2*sigma2X - 1
-  dX = (1-out^2) * dout
-}
-
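
A minimal sketch of this layer (toy shapes, illustrative values), relying on the 2*sigmoid(2x)-1 identity used above:

source("nn/layers/tanh.dml") as tanh

X = rand(rows=4, cols=5, min=-2, max=2)
out = tanh::forward(X)            # computed as 2*sigmoid(2*X) - 1

dout = rand(rows=4, cols=5)       # dummy upstream gradient
dX = tanh::backward(dout, X)      # (1 - out^2) * dout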

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml b/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
deleted file mode 100644
index 85b1c41..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml
+++ /dev/null
@@ -1,77 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Adagrad optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr, double epsilon,
-                  matrix[double] cache)
-    return (matrix[double] X, matrix[double] cache) {
-  /*
-   * Performs an Adagrad update.
-   *
-   * This is an adaptive learning rate optimizer that maintains the
-   * sum of squared gradients to automatically adjust the effective
-   * learning rate.
-   *
-   * Reference:
-   *  - Adaptive Subgradient Methods for Online Learning and Stochastic
-   *    Optimization, Duchi et al.
-   *      - http://jmlr.org/papers/v12/duchi11a.html
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient wrt `X` of a loss function being optimized, of
-   *      same shape as `X`.
-   *  - lr: Learning rate.
-   *  - epsilon: Smoothing term to avoid divide by zero errors.
-   *      Typical values are in the range of [1e-8, 1e-4].
-   *  - cache: State that maintains per-parameter sum of squared
-   *      gradients, of same shape as `X`.
-   *
-   * Outputs:
-   *  - X: Updated parameters `X`, of same shape as input `X`.
-   *  - cache: State that maintains per-parameter sum of squared
-   *      gradients, of same shape as `X`.
-   */
-  cache = cache + dX^2
-  X = X - (lr * dX / (sqrt(cache)+epsilon))
-}
-
-init = function(matrix[double] X)
-    return (matrix[double] cache) {
-  /*
-   * Initialize the state for this optimizer.
-   *
-   * Note: This is just a convenience function, and state
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *
-   * Outputs:
-   *  - cache: State that maintains per-parameter sum of squared
-   *      gradients, of same shape as `X`.
-   */
-  cache = matrix(0, rows=nrow(X), cols=ncol(X))
-}
-
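
A minimal sketch of one Adagrad step inside a training loop (toy shapes; the learning rate and epsilon values are illustrative):

source("nn/optim/adagrad.dml") as adagrad

X = rand(rows=10, cols=5)    # parameters
dX = rand(rows=10, cols=5)   # gradient of the loss wrt X (dummy here)
cache = adagrad::init(X)

lr = 0.01
epsilon = 1e-6
[X, cache] = adagrad::update(X, dX, lr, epsilon, cache)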

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/adam.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/adam.dml b/scripts/staging/SystemML-NN/nn/optim/adam.dml
deleted file mode 100644
index 4b6fa2a..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/adam.dml
+++ /dev/null
@@ -1,97 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Adam optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr, double beta1, double beta2,
-                  double epsilon, int t, matrix[double] m, matrix[double] v)
-    return (matrix[double] X, matrix[double] m, matrix[double] v) {
-  /*
-   * Performs an Adam update.
-   *
-   * Reference:
-   *  - Adam: A Method for Stochastic Optimization, Kingma, Ba.
-   *    - http://arxiv.org/abs/1412.6980
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient wrt `X` of a loss function being optimized, of
-   *      same shape as `X`.
-   *  - lr: Learning rate.  Recommended value is 0.001.
-   *  - beta1: Exponential decay rate for the 1st moment estimates.
-   *      Recommended value is 0.9.
-   *  - beta2: Exponential decay rate for the 2nd moment estimates.
-   *      Recommended value is 0.999.
-   *  - epsilon: Smoothing term to avoid divide by zero errors.
-   *      Recommended value is 1e-8.
-   *  - t: Timestep, starting at 0.
-   *  - m: State containing the 1st moment (mean) estimate by
-   *      maintaining exponential moving averages of the gradients, of
-   *      same shape as `X`.
-   *  - v: State containing the 2nd raw moment (uncentered variance)
-   *      estimate by maintaining exponential moving averages of the
-   *      squared gradients, of same shape as `X`.
-   *
-   * Outputs:
-   *  - X: Updated parameters `X`, of same shape as input `X`.
-   *  - m: Updated state containing the 1st moment (mean) estimate by
-   *      maintaining exponential moving averages of the gradients, of
-   *      same shape as `X`.
-   *  - v: Updated state containing the 2nd raw moment (uncentered
-   *      variance) estimate by maintaining exponential moving averages
-   *      of the squared gradients, of same shape as `X`.
-   */
-  t = t + 1
-  m = beta1*m + (1-beta1)*dX  # update biased 1st moment estimate
-  v = beta2*v + (1-beta2)*dX^2  # update biased 2nd raw moment estimate
-  # m = m / (1-beta1^t)  # compute bias-corrected 1st moment estimate
-  # v = v / (1-beta2^t)  # compute bias-corrected 2nd raw moment estimate
-  # X = X - (lr * m / (sqrt(v)+epsilon))  # param update
-  # Simplified for computational efficiency:
-  lr = lr * sqrt(1-beta2^t) / (1-beta1^t)
-  X = X - (lr * m / (sqrt(v)+epsilon))
-}
-
-init = function(matrix[double] X)
-    return (matrix[double] m, matrix[double] v) {
-  /*
-   * Initialize the state for this optimizer.
-   *
-   * Note: This is just a convenience function, and state
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *
-   * Outputs:
-   *  - m: Initial state containing the 1st moment (mean) estimate by
-   *      maintaining exponential moving averages of the gradients, of
-   *      same shape as `X`.
-   *  - v: Initial state containing the 2nd raw moment (uncentered
-   *      variance) estimate by maintaining exponential moving averages
-   *      of the squared gradients, of same shape as `X`.
-   */
-  m = matrix(0, rows=nrow(X), cols=ncol(X))
-  v = matrix(0, rows=nrow(X), cols=ncol(X))
-}
-
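
A minimal sketch of one Adam step (toy shapes; hyperparameters follow the recommended values above). Note that `t` is the timestep before the update, so the caller starts it at 0 and advances it between updates:

source("nn/optim/adam.dml") as adam

X = rand(rows=10, cols=5)
dX = rand(rows=10, cols=5)   # dummy gradient
[m, v] = adam::init(X)

lr = 0.001
beta1 = 0.9
beta2 = 0.999
epsilon = 1e-8
t = 0
[X, m, v] = adam::update(X, dX, lr, beta1, beta2, epsilon, t, m, v)
t = t + 1  # advance the timestep for the next update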

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml b/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
deleted file mode 100644
index 1feccaf..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml
+++ /dev/null
@@ -1,79 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * RMSprop optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr, double decay_rate,
-                  double epsilon, matrix[double] cache)
-    return (matrix[double] X, matrix[double] cache) {
-  /*
-   * Performs an RMSprop update.
-   *
-   * This is an adaptive learning rate optimizer that can be viewed
-   * as an adjustment of the Adagrad method to use a moving average
-   * of the sum of squared gradients in order to improve convergence.
-   *
-   * Reference:
-   *  - Neural Networks for Machine Learning, Lecture 6a, Hinton,
-   *    slide 29.
-   *    - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient wrt `X` of a loss function being optimized, of
-   *      same shape as `X`.
-   *  - lr: Learning rate.
-   *  - decay_rate: Term controlling the rate of the moving average.
-   *      Typical values are in the range of [0.9, 0.999].
-   *  - epsilon: Smoothing term to avoid divide by zero errors.
-   *      Typical values are in the range of [1e-8, 1e-4].
-   *  - cache: State that maintains the moving average of the squared
-   *      gradients, of same shape as `X`.
-   *
-   * Outputs:
-   *  - X: Updated parameters `X`, of same shape as input `X`.
-   *  - cache: Updated state that maintains the moving average of the
-   *      squared gradients, of same shape as `X`.
-   */
-  cache = decay_rate*cache + (1-decay_rate)*dX^2
-  X = X - (lr * dX / (sqrt(cache)+epsilon))
-}
-
-init = function(matrix[double] X)
-    return (matrix[double] cache) {
-  /*
-   * Initialize the state for this optimizer.
-   *
-   * Note: This is just a convenience function, and state
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *
-   * Outputs:
-   *  - cache: State that maintains the moving average of the squared
-   *      gradients, of same shape as `X`.
-   */
-  cache = matrix(0, rows=nrow(X), cols=ncol(X))
-}
-
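
A minimal sketch of one RMSprop step (toy shapes; decay rate and epsilon chosen from the typical ranges above):

source("nn/optim/rmsprop.dml") as rmsprop

X = rand(rows=10, cols=5)
dX = rand(rows=10, cols=5)   # dummy gradient
cache = rmsprop::init(X)

lr = 0.001
decay_rate = 0.99
epsilon = 1e-8
[X, cache] = rmsprop::update(X, dX, lr, decay_rate, epsilon, cache)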

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/sgd.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd.dml b/scripts/staging/SystemML-NN/nn/optim/sgd.dml
deleted file mode 100644
index 3ba7eba..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/sgd.dml
+++ /dev/null
@@ -1,42 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Stochastic Gradient Descent (SGD) optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr)
-    return (matrix[double] X) {
-  /*
-   * Performs a vanilla SGD update.
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient wrt `X` of a loss function being optimized, of
-   *      same shape as `X`.
-   *  - lr: Learning rate.
-   *
-   * Outputs:
-   *  - X: Updated parameters `X`, of same shape as input `X`.
-   */
-  X = X - lr*dX
-}
-
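
A minimal sketch of one vanilla SGD step (toy shapes, illustrative learning rate):

source("nn/optim/sgd.dml") as sgd

X = rand(rows=10, cols=5)
dX = rand(rows=10, cols=5)   # dummy gradient
lr = 0.01
X = sgd::update(X, dX, lr)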

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
deleted file mode 100644
index 85922da..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml
+++ /dev/null
@@ -1,71 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Stochastic Gradient Descent with momentum (SGD-momentum) optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v)
-    return (matrix[double] X, matrix[double] v) {
-  /*
-   * Performs an SGD update with momentum.
-   *
-   * In SGD with momentum, we assume that the parameters have a velocity
-   * that continues with some momentum, and that is influenced by the
-   * gradient.
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient wrt `X` of a loss function being optimized, of
-   *      same shape as `X`.
-   *  - lr: Learning rate.
-   *  - mu: Momentum value.
-   *      Typical values are in the range of [0.5, 0.99], usually
-   *      started at the lower end and annealed towards the higher end.
-   *  - v: State maintaining the velocity of the parameters `X`, of same
-   *      shape as `X`.
-   *
-   * Outputs:
-   *  - X: Updated parameters `X`, of same shape as input `X`.
-   *  - v: Updated velocity of the parameters `X`, of same shape as
-   *      input `X`.
-   */
-  v = mu*v - lr*dX  # update velocity
-  X = X + v  # update position
-}
-
-init = function(matrix[double] X)
-    return (matrix[double] v) {
-  /*
-   * Initialize the state for this optimizer.
-   *
-   * Note: This is just a convenience function, and state
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *
-   * Outputs:
-   *  - v: Initial velocity of the parameters `X`, of same shape as `X`.
-   */
-  v = matrix(0, rows=nrow(X), cols=ncol(X))
-}
-
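
A minimal sketch of one SGD-momentum step (toy shapes; `mu` from the typical range above):

source("nn/optim/sgd_momentum.dml") as sgd_momentum

X = rand(rows=10, cols=5)
dX = rand(rows=10, cols=5)   # dummy gradient
v = sgd_momentum::init(X)

lr = 0.01
mu = 0.9
[X, v] = sgd_momentum::update(X, dX, lr, mu, v)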

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
deleted file mode 100644
index 3b62c6e..0000000
--- a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml
+++ /dev/null
@@ -1,81 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Stochastic Gradient Descent with Nesterov momentum (SGD-Nesterov) optimizer.
- */
-
-update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v)
-    return (matrix[double] X, matrix[double] v) {
-  /*
-   * Performs an SGD update with Nesterov momentum.
-   *
-   * As with regular SGD with momentum, in SGD with Nesterov momentum,
-   * we assume that the parameters have a velocity that continues
-   * with some momentum, and that is influenced by the gradient.
-   * In this view specifically, we perform the position update from the
-   * position that the momentum is about to carry the parameters to,
-   * rather than from the previous position.  Additionally, we always
-   * store the parameters in their position after momentum.
-   *
-   * Reference:
-   *  - Advances in optimizing Recurrent Networks, Bengio et al.,
-   *    section 3.5.
-   *    - http://arxiv.org/abs/1212.0901
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *  - dX: Gradient wrt `X` of a loss function being optimized, of
-   *      same shape as `X`.
-   *  - lr: Learning rate.
-   *  - mu: Momentum value.
-   *      Typical values are in the range of [0.5, 0.99], usually
-   *      started at the lower end and annealed towards the higher end.
-   *  - v: State maintaining the velocity of the parameters `X`, of same
-   *      shape as `X`.
-   *
-   * Outputs:
-   *  - X: Updated parameters `X`, of same shape as input `X`.
-   *  - v: Updated velocity of the parameters `X`, of same shape as
-   *      input `v`.
-   */
-  v_prev = v
-  v = mu*v - lr*dX  # update velocity
-  X = X - mu*v_prev + (1+mu)*v  # update position, including momentum
-}
-
-init = function(matrix[double] X)
-    return (matrix[double] v) {
-  /*
-   * Initialize the state for this optimizer.
-   *
-   * Note: This is just a convenience function, and state
-   * may be initialized manually if needed.
-   *
-   * Inputs:
-   *  - X: Parameters to update, of shape (any, any).
-   *
-   * Outputs:
-   *  - v: Initial velocity of the parameters `X`, of same shape as `X`.
-   */
-  v = matrix(0, rows=nrow(X), cols=ncol(X))
-}
-
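
To make the lookahead bookkeeping concrete, a minimal sketch (toy shapes, illustrative hyperparameters) with the classic formulation noted in comments:

source("nn/optim/sgd_nesterov.dml") as sgd_nesterov

X = rand(rows=10, cols=5)    # stored at its post-momentum ("lookahead") position
dX = rand(rows=10, cols=5)   # dummy gradient, evaluated at that lookahead position
v = sgd_nesterov::init(X)

lr = 0.01
mu = 0.9
# Classic view: v_new = mu*v - lr*dX;  x_new = x_lagged + v_new.
# Because X already sits at the lookahead position, the update applied here
# is X - mu*v_prev + (1+mu)*v_new, which keeps X at its post-momentum position.
[X, v] = sgd_nesterov::update(X, dX, lr, mu, v)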

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/README.md
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/README.md b/scripts/staging/SystemML-NN/nn/test/README.md
deleted file mode 100644
index b714d50..0000000
--- a/scripts/staging/SystemML-NN/nn/test/README.md
+++ /dev/null
@@ -1,32 +0,0 @@
-<!--
-{% comment %}
-Licensed to the Apache Software Foundation (ASF) under one or more
-contributor license agreements.  See the NOTICE file distributed with
-this work for additional information regarding copyright ownership.
-The ASF licenses this file to you under the Apache License, Version 2.0
-(the "License"); you may not use this file except in compliance with
-the License.  You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-{% endcomment %}
--->
-
-# SystemML-NN Tests
-
-#### This folder contains tests for the *SystemML-NN* (`nn`) deep learning library.
-
----
-## Tests
-#### All layers are tested for correct derivatives ("gradient-checking"), and many layers also have correctness tests against simpler reference implementations.
-* `grad_check.dml` - Contains gradient-checks for all layers as individual DML functions.
-* `test.dml` - Contains correctness tests for several of the more complicated layers by checking against simple reference implementations, such as `conv2d_simple.dml`.  All tests are formulated as individual DML functions.
-* `tests.dml` - A DML script that runs all of the tests in `grad_check.dml` and `test.dml`.
-
-## Execution
-* Run `spark-submit SystemML.jar -f nn/test/tests.dml` from the base of the project.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml b/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
deleted file mode 100644
index 9f126d0..0000000
--- a/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
+++ /dev/null
@@ -1,213 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Convolutional layer.
- *
- * This implementation is intended to be a simple, reference version.
- */
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
-                   int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial convolutional layer with
-   * F filters.  The input data has N examples, each represented as a 3D
-   * volume unrolled into a single vector.
-   *
-   * This implementation is intended to be a simple, reference version.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *  - padw: Padding for left and right sides.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  F = nrow(W)
-  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
-  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
-  # Create output volume
-  out = matrix(0, rows=N, cols=F*Hout*Wout)
-
-  # Convolution - Simple reference implementation
-  parfor (n in 1:N) {  # all examples
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
-    # Pad image
-    Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
-    parfor (c in 1:C) {
-      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
-      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
-      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
-    }
-    # Convolve image with filters
-    parfor (f in 1:F, check=0) {  # all filters
-      parfor (hout in 1:Hout, check=0) {  # all output rows
-        h0 = (hout-1)*strideh + 1
-        parfor (wout in 1:Wout, check=0) {  # all output columns
-          w0 = (wout-1)*stridew + 1
-          # Create a patch of the input example corresponding spatially to the filter sizes
-          Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
-          parfor (c in 1:C, check=0) {
-            Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)  # reshape
-            Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf], rows=1,
-                                         cols=Hf*Wf)  # reshape
-          }
-          out[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout] =
-              W[f,] %*% matrix(Xn_padded_patch, rows=C*Hf*Wf, cols=1) + b[f,]
-        }
-      }
-    }
-  }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout,
-                    matrix[double] X, matrix[double] W, matrix[double] b,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
-  /*
-   * Computes the backward pass for a 2D spatial convolutional layer
-   * with F filters.
-   *
-   * This implementation is intended to be a simple, reference version.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *  - padw: Padding for left and right sides.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
-   *  - db: Gradient wrt `b`, of shape (F, 1).
-   */
-  N = nrow(X)
-  F = nrow(W)
-
-  # Create gradient volumes
-  dX = matrix(0, rows=N, cols=C*Hin*Win)
-  dW = matrix(0, rows=F, cols=C*Hf*Wf)
-  db = matrix(0, rows=F, cols=1)
-
-  # Partial derivatives for convolution - Simple reference implementation
-  for (n in 1:N) {  # all examples
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
-    # Pad image
-    Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
-    parfor (c in 1:C) {
-      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
-      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
-      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
-    }
-    dXn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
-    for (f in 1:F) {  # all filters
-      for (hout in 1:Hout) {  # all output rows
-        h0 = (hout-1) * strideh + 1
-        for (wout in 1:Wout) {  # all output columns
-          w0 = (wout-1) * stridew + 1
-          # Create a patch of the input example corresponding spatially to the filter sizes
-          Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
-          dXn_padded_patch = matrix(W[f,] * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout],
-                                    rows=C, cols=Hf*Wf)  # reshape
-          for (c in 1:C) {
-            Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)  # reshape
-            Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf],
-                                         rows=1, cols=Hf*Wf)  # reshape
-            dXn_padded_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
-            dXn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf] = matrix(dXn_padded_patch[c,],
-                                                              rows=Hf, cols=Wf)  # reshape
-            dXn_padded[c,] = dXn_padded[c,] + matrix(dXn_padded_slice,
-                                                     rows=1, cols=(Hin+2*padh)*(Win+2*padw))
-          }
-          dW[f,] = dW[f,]
-                   + matrix(Xn_padded_patch, rows=1, cols=C*Hf*Wf)
-                   * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
-          db[f,] = db[f,] + dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
-        }
-      }
-    }
-    # Unpad derivs on input
-    dXn = matrix(0, rows=C, cols=Hin*Win)
-    parfor (c in 1:C, check=0) {
-      dXn_padded_slice = matrix(dXn_padded[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
-      dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
-      dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
-    }
-    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
-  }
-}
-
-init = function(int F, int C, int Hf, int Wf)
-    return (matrix[double] W, matrix[double] b) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * We use the heuristic by He et al., which limits the magnification
-   * of inputs/gradients during forward/backward passes by scaling
-   * unit-Gaussian weights by a factor of sqrt(2/n), under the
-   * assumption of relu neurons.
-   *  - http://arxiv.org/abs/1502.01852
-   *
-   * Inputs:
-   *  - F: Number of filters.
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *
-   * Outputs:
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   */
-  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
-  b = matrix(0, rows=F, cols=1)
-}
-
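
A minimal sketch of exercising this reference implementation (toy shapes; stride and padding values are illustrative):

source("nn/test/conv2d_simple.dml") as conv2d_simple

N = 2
F = 4
C = 3
Hin = 5
Win = 5
Hf = 3
Wf = 3
stride = 1
pad = 1
X = rand(rows=N, cols=C*Hin*Win)

[W, b] = conv2d_simple::init(F, C, Hf, Wf)
[out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf,
                                           stride, stride, pad, pad)

dout = rand(rows=N, cols=F*Hout*Wout)  # dummy upstream gradient
[dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win,
                                       Hf, Wf, stride, stride, pad, pad)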