Posted to commits@systemml.apache.org by du...@apache.org on 2017/04/26 21:42:36 UTC
[10/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn`
library to `scripts/nn`
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/batch_norm2d.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/batch_norm2d.dml b/scripts/nn/layers/batch_norm2d.dml
new file mode 100644
index 0000000..49c6746
--- /dev/null
+++ b/scripts/nn/layers/batch_norm2d.dml
@@ -0,0 +1,238 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D (Spatial) Batch Normalization layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
+ int C, int Hin, int Win, string mode,
+ matrix[double] ema_mean, matrix[double] ema_var,
+ double mu, double epsilon)
+ return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
+ matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
+ /*
+ * Computes the forward pass for a 2D (spatial) batch normalization
+ * layer. The input data has N examples, each represented as a 3D
+ * volume unrolled into a single vector.
+ *
+ * A spatial batch normalization layer uses the per-channel sample
+ * mean and per-channel uncorrected sample variance during training
+ * to normalize each channel of the input data. Additionally, it
+ * introduces learnable parameters (gamma, beta) to control the
+ * amount of normalization.
+ *
+ * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
+ *
+ * This implementation maintains exponential moving averages of the
+ * mean and variance during training for use during testing.
+ *
+ * Reference:
+ * - Batch Normalization: Accelerating Deep Network Training by
+ * Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
+ * - https://arxiv.org/abs/1502.03167
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - mode: 'train' or 'test' to indicate if the model is currently
+ * being trained or tested. During training, the current batch
+ * mean and variance will be used to normalize the inputs, while
+ * during testing, the exponential average of the mean and
+ * variance over all previous batches will be used.
+ * - ema_mean: Exponential moving average of the mean, of
+ * shape (C, 1).
+ * - ema_var: Exponential moving average of the variance, of
+ * shape (C, 1).
+ * - mu: Momentum value for moving averages.
+ * Typical values are in the range of [0.9, 0.999].
+ * - epsilon: Smoothing term to avoid divide by zero errors.
+ * Typical values are in the range of [1e-5, 1e-3].
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, C*Hin*Win).
+ * - ema_mean_upd: Updated exponential moving average of the mean,
+ * of shape (C, 1).
+ * - ema_var_upd: Updated exponential moving average of the variance,
+ * of shape (C, 1).
+ * - cache_mean: Cache of the batch mean, of shape (C, 1).
+ * Note: This is used for performance during training.
+ * - cache_var: Cache of the batch variance, of shape (C, 1).
+ * Note: This is used for performance during training.
+ * - cache_norm: Cache of the normalized inputs, of
+ * shape (C, N*Hin*Win). Note: This is used for performance
+ * during training.
+ */
+ N = nrow(X)
+
+ if (mode == 'train') {
+ # Compute channel-wise mean and variance
+ # Since we don't have tensors, we will compute the means and variances in a piece-wise fashion.
+ # - mean of total group is mean of subgroup means
+ # - variance is the mean of the subgroup variances + the variance of the subgroup means
+ subgrp_means = matrix(colMeans(X), rows=C, cols=Hin*Win)
+ subgrp_vars = matrix(colVars(X) * ((N-1)/N), rows=C, cols=Hin*Win) # uncorrected variances
+ mean = rowMeans(subgrp_means) # shape (C, 1)
+ var = rowMeans(subgrp_vars) + rowVars(subgrp_means)*(((Hin*Win)-1)/(Hin*Win)) # shape (C, 1)
+ # Update moving averages
+ ema_mean_upd = mu*ema_mean + (1-mu)*mean
+ ema_var_upd = mu*ema_var + (1-mu)*var
+ }
+ else {
+ # Use moving averages of mean and variance during testing
+ mean = ema_mean
+ var = ema_var
+ ema_mean_upd = ema_mean
+ ema_var_upd = ema_var
+ }
+
+ # Normalize, shift, and scale
+ # norm = (X-mean)*(var+epsilon)^(-1/2)
+ # = (X-mean) / sqrt(var+epsilon)
+ centered = bias_add(X, -mean) # shape (N, C*Hin*Win)
+ norm = bias_multiply(centered, 1/sqrt(var+epsilon)) # shape (N, C*Hin*Win)
+ # out = norm*gamma + beta
+ scaled = bias_multiply(norm, gamma) # shape (N, C*Hin*Win)
+ out = bias_add(scaled, beta) # shape (N, C*Hin*Win)
+
+ # Save variables for backward pass
+ cache_mean = mean
+ cache_var = var
+ cache_norm = norm
+}
+
+backward = function(matrix[double] dout, matrix[double] out,
+ matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
+ matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
+ matrix[double] X, matrix[double] gamma, matrix[double] beta,
+ int C, int Hin, int Win, string mode,
+ matrix[double] ema_mean, matrix[double] ema_var,
+ double mu, double epsilon)
+ return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
+ /*
+ * Computes the backward pass for a 2D (spatial) batch normalization
+ * layer.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
+ * - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
+ * - ema_mean_upd: Updated exponential moving average of the mean
+ * from the forward pass, of shape (C, 1).
+ * - ema_var_upd: Updated exponential moving average of the variance
+ * from the forward pass, of shape (C, 1).
+ * - cache_mean: Cache of the batch mean from the forward pass, of
+ * shape (C, 1). Note: This is used for performance during
+ * training.
+ * - cache_var: Cache of the batch variance from the forward pass,
+ * of shape (C, 1). Note: This is used for performance during
+ * training.
+ * - cache_norm: Cache of the normalized inputs from the forward
+ * pass, of shape (C, N*Hin*Win). Note: This is used for
+ * performance during training.
+ * - X: Input data matrix to the forward pass, of
+ * shape (N, C*Hin*Win).
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - mode: 'train' or 'test' to indicate if the model is currently
+ * being trained or tested. During training, the current batch
+ * mean and variance will be used to normalize the inputs, while
+ * during testing, the exponential average of the mean and
+ * variance over all previous batches will be used.
+ * - ema_mean: Exponential moving average of the mean, of
+ * shape (C, 1).
+ * - ema_var: Exponential moving average of the variance, of
+ * shape (C, 1).
+ * - mu: Momentum value for moving averages.
+ * Typical values are in the range of [0.9, 0.999].
+ * - epsilon: Smoothing term to avoid divide by zero errors.
+ * Typical values are in the range of [1e-5, 1e-3].
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ * - dgamma: Gradient wrt `gamma`, of shape (C, 1).
+ * - dbeta: Gradient wrt `beta`, of shape (C, 1).
+ *
+ */
+ N = nrow(X)
+ mean = cache_mean
+ var = cache_var
+ norm = cache_norm
+ centered = bias_add(X, -mean) # shape (N, C*Hin*Win)
+
+ if (mode == 'train') {
+ # Compute gradients during training
+ dgamma = util::channel_sums(dout*norm, C, Hin, Win) # shape (C, 1)
+ dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1)
+ dnorm = bias_multiply(dout, gamma) # shape (N, C*Hin*Win)
+ dvar = util::channel_sums((-1/2) * bias_multiply(centered, (var+epsilon)^(-3/2)) * dnorm,
+ C, Hin, Win) # shape (C, 1)
+ dmean_norm_branch = util::channel_sums(bias_multiply(dnorm, -1/sqrt(var+epsilon)), C, Hin, Win)
+ dmean_var_branch = util::channel_sums((-2/(N*Hin*Win)) * centered, C, Hin, Win)
+ dmean_var_branch = dmean_var_branch * dvar # we can't use a function within an expression yet
+ dmean = dmean_norm_branch + dmean_var_branch # shape (C, 1)
+ dX_norm_branch = bias_multiply(dnorm, 1/sqrt(var+epsilon))
+ dX_mean_branch = (1/(N*Hin*Win)) * bias_add(matrix(0, rows=1, cols=C*Hin*Win), dmean)
+ dX_var_branch = (2/(N*Hin*Win)) * bias_multiply(centered, dvar)
+ dX = dX_norm_branch + dX_mean_branch + dX_var_branch # shape (N, C*Hin*Win)
+ }
+ else {
+ # Compute gradients during testing
+ dgamma = util::channel_sums(dout*norm, C, Hin, Win) # shape (C, 1)
+ dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1)
+ dnorm = bias_multiply(dout, gamma) # shape (N, C*Hin*Win)
+ dX = bias_multiply(dnorm, 1/sqrt(var+epsilon)) # shape (N, C*Hin*Win)
+ }
+}
+
+init = function(int C)
+ return (matrix[double] gamma, matrix[double] beta,
+ matrix[double] ema_mean, matrix[double] ema_var) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - C: Number of input channels (dimensionality of input depth).
+ *
+ * Outputs:
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ * - ema_mean: Exponential moving average of the mean, of
+ * shape (C, 1).
+ * - ema_var: Exponential moving average of the variance, of
+ * shape (C, 1).
+ */
+ gamma = matrix(1, rows=C, cols=1)
+ beta = matrix(0, rows=C, cols=1)
+ ema_mean = matrix(0, rows=C, cols=1)
+ ema_var = matrix(1, rows=C, cols=1)
+}
+
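
A minimal usage sketch of the layer above (not part of the commit; all sizes and
hyperparameters below are hypothetical, with mu and epsilon from the typical
ranges in the docs):

  source("nn/layers/batch_norm2d.dml") as batch_norm2d

  N = 4  # hypothetical batch/volume sizes
  C = 3
  Hin = 8
  Win = 8
  X = rand(rows=N, cols=C*Hin*Win)
  [gamma, beta, ema_mean, ema_var] = batch_norm2d::init(C)
  # training-mode forward pass
  [out, ema_mean_upd, ema_var_upd, c_mean, c_var, c_norm] = batch_norm2d::forward(X, gamma,
                                                                beta, C, Hin, Win, "train",
                                                                ema_mean, ema_var, 0.9, 1e-5)
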
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/conv2d.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/conv2d.dml b/scripts/nn/layers/conv2d.dml
new file mode 100644
index 0000000..9d03568
--- /dev/null
+++ b/scripts/nn/layers/conv2d.dml
@@ -0,0 +1,194 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Convolutional layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
+ int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] out, int Hout, int Wout) {
+ /*
+ * Computes the forward pass for a 2D spatial convolutional layer with
+ * F filters. The input data has N examples, each represented as a 3D
+ * volume unrolled into a single vector.
+ *
+ * This implementation uses `im2col` internally for each image to
+ * extract local image regions (patches) into columns, and then
+ * performs a matrix multiplication with the filters to compute the
+ * output maps.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * For same output height as input, set `padh = (Hf - 1) / 2`,
+ * assuming `strideh = 1`.
+ * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+ * preserves the spatial dimensions of the input.
+ * - padw: Padding for left and right sides.
+ * For same output width as input, set `padw = (Wf - 1) / 2`,
+ * assuming `stridew = 1`.
+ * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+ * preserves the spatial dimensions of the input.
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, F*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ */
+ N = nrow(X)
+ F = nrow(W)
+ Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+ Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+
+ # Create output volume
+ out = matrix(0, rows=N, cols=F*Hout*Wout)
+
+ # Convolution - im2col implementation
+ parfor (n in 1:N) { # all examples
+ Xn = matrix(X[n,], rows=C, cols=Hin*Win) # reshape
+
+ # Pad image
+ Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0) # shape (C, (Hin+2*padh)*(Win+2*padw))
+
+ # Extract local image patches into columns with im2col, of shape (C*Hf*Wf, Hout*Wout)
+ Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
+
+ # Convolve patches with filters
+ outn = W %*% Xn_padded_cols + b # shape (F, Hout*Wout)
+ out[n,] = matrix(outn, rows=1, cols=F*Hout*Wout) # reshape
+ }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout,
+ matrix[double] X, matrix[double] W, matrix[double] b,
+ int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
+ /*
+ * Computes the backward pass for a 2D spatial convolutional layer
+ * with F filters.
+ *
+ * This implementation uses `im2col` and `col2im` internally.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of
+ * shape (N, F*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * - padw: Padding for left and right sides.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+ * - db: Gradient wrt `b`, of shape (F, 1).
+ */
+ N = nrow(X)
+ F = nrow(W)
+
+ # Create gradient volumes
+ # Note: Create convenience gradient volumes for dW and db that will
+ # allow for one gradient to be stored per example, allowing for
+ # parallel computation at the expense of memory. We will reduce at
+ # the end.
+ dX = matrix(0, rows=N, cols=C*Hin*Win)
+ dWN = matrix(0, rows=N, cols=F*C*Hf*Wf) # dW = matrix(0, rows=F, cols=C*Hf*Wf)
+ dbN = matrix(0, rows=N, cols=F) # db = matrix(0, rows=F, cols=1)
+
+ # Partial derivatives for convolution - im2col implementation
+ parfor (n in 1:N) { # all examples
+ doutn = matrix(dout[n,], rows=F, cols=Hout*Wout)
+
+ # Compute dW
+ Xn = matrix(X[n,], rows=C, cols=Hin*Win) # reshape
+ Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0) # shape (C, (Hin+2*padh)*(Win+2*padw))
+ Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
+ # dW = dW + doutn %*% t(Xn_padded_cols)
+ dWN[n,] = matrix(doutn %*% t(Xn_padded_cols), rows=1, cols=F*C*Hf*Wf)
+
+ # Compute db
+ # db = db + rowSums(doutn)
+ dbN[n,] = matrix(rowSums(doutn), rows=1, cols=F)
+
+ # Compute dX
+ dXn_padded_cols = t(W) %*% doutn # shape (C*Hf*Wf, Hout*Wout)
+ dXn_padded = util::col2im(dXn_padded_cols, C, Hin+2*padh, Win+2*padw, Hf, Wf,
+ strideh, stridew, "add")
+ dXn = util::unpad_image(dXn_padded, Hin, Win, padh, padw)
+ dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win) # reshape
+ }
+
+ # Reduce convenience gradient volumes with one gradient per example
+ # into single gradients for W and b.
+ dW = matrix(colSums(dWN), rows=F, cols=C*Hf*Wf)
+ db = matrix(colSums(dbN), rows=F, cols=1)
+}
+
+init = function(int F, int C, int Hf, int Wf)
+ return (matrix[double] W, matrix[double] b) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * We use the heuristic by He et al., which limits the magnification
+ * of inputs/gradients during forward/backward passes by scaling
+ * unit-Gaussian weights by a factor of sqrt(2/n), under the
+ * assumption of relu neurons.
+ * - http://arxiv.org/abs/1502.01852
+ *
+ * Inputs:
+ * - F: Number of filters.
+ * - C: Number of input channels (dimensionality of depth).
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ *
+ * Outputs:
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
+ */
+ W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
+ b = matrix(0, rows=F, cols=1)
+}
+
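
As an illustration, the layer above can be exercised as follows (a hypothetical
sketch; padding of 1 gives 'same' output size for 3x3 filters at stride 1):

  source("nn/layers/conv2d.dml") as conv2d

  C = 3
  Hin = 8
  Win = 8
  F = 4
  Hf = 3
  Wf = 3
  X = rand(rows=2, cols=C*Hin*Win)  # N=2 hypothetical examples
  [W, b] = conv2d::init(F, C, Hf, Wf)
  [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)
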
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/conv2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/conv2d_builtin.dml b/scripts/nn/layers/conv2d_builtin.dml
new file mode 100644
index 0000000..bda7a9c
--- /dev/null
+++ b/scripts/nn/layers/conv2d_builtin.dml
@@ -0,0 +1,160 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Convolutional layer.
+ *
+ * This implementation uses a built-in operator for higher performance.
+ */
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
+ int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] out, int Hout, int Wout) {
+ /*
+ * Computes the forward pass for a 2D spatial convolutional layer with
+ * F filters. The input data has N examples, each represented as a 3D
+ * volume unrolled into a single vector.
+ *
+ * This implementation uses a built-in operator for higher
+ * performance.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
+ * - C: Number of input channels (dimensionality of depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * For same output height as input, set `padh = (Hf - 1) / 2`,
+ * assuming `strideh = 1`.
+ * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+ * preserves the spatial dimensions of the input.
+ * - padw: Padding for left and right sides.
+ * For same output width as input, set `padw = (Wf - 1) / 2`,
+ * assuming `stridew = 1`.
+ * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+ * preserves the spatial dimensions of the input.
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, F*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ */
+ N = nrow(X)
+ F = nrow(W)
+ Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+ Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+
+ # Convolution - built-in implementation
+ out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
+ stride=[strideh,stridew], padding=[padh,padw])
+
+ # Add bias term to each output filter
+ out = bias_add(out, b)
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout,
+ matrix[double] X, matrix[double] W, matrix[double] b,
+ int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
+ /*
+ * Computes the backward pass for a 2D spatial convolutional layer
+ * with F filters.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of
+ * shape (N, F*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
+ * - C: Number of input channels (dimensionality of depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * For same output height as input, set `padh = (Hf - 1) / 2`,
+ * assuming `strideh = 1`.
+ * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+ * preserves the spatial dimensions of the input.
+ * - padw: Padding for left and right sides.
+ * For same output width as input, set `padw = (Wf - 1) / 2`,
+ * assuming `stridew = 1`.
+ * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+ * preserves the spatial dimensions of the input.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+ * - db: Gradient wrt `b`, of shape (F, 1).
+ */
+ N = nrow(X)
+ F = nrow(W)
+
+ # Partial derivatives for convolution - built-in implementation
+ dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw],
+ input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
+ dX = conv2d_backward_data(W, dout, stride=[strideh, stridew], padding=[padh,padw],
+ input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
+
+ # Partial derivatives for bias vector
+ db = rowSums(matrix(colSums(dout), rows=F, cols=Hout*Wout))
+}
+
+init = function(int F, int C, int Hf, int Wf)
+ return (matrix[double] W, matrix[double] b) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * We use the heuristic by He et al., which limits the magnification
+ * of inputs/gradients during forward/backward passes by scaling
+ * unit-Gaussian weights by a factor of sqrt(2/n), under the
+ * assumption of relu neurons.
+ * - http://arxiv.org/abs/1502.01852
+ *
+ * Inputs:
+ * - F: Number of filters.
+ * - C: Number of input channels (dimensionality of depth).
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ *
+ * Outputs:
+ * - W: Weights, of shape (F, C*Hf*Wf).
+ * - b: Biases, of shape (F, 1).
+ */
+ W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
+ b = matrix(0, rows=F, cols=1)
+}
+
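
Since the function signatures match `nn/layers/conv2d.dml` exactly, this version is
a drop-in replacement; a hypothetical sketch including the backward pass:

  source("nn/layers/conv2d_builtin.dml") as conv2d  # only the source line changes

  C = 3
  Hin = 8
  Win = 8
  F = 4
  Hf = 3
  Wf = 3
  X = rand(rows=2, cols=C*Hin*Win)       # N=2 hypothetical examples
  [W, b] = conv2d::init(F, C, Hf, Wf)
  [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)
  dout = rand(rows=2, cols=F*Hout*Wout)  # stand-in upstream gradient
  [dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
                                  1, 1, 1, 1)
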
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/cross_entropy_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/cross_entropy_loss.dml b/scripts/nn/layers/cross_entropy_loss.dml
new file mode 100644
index 0000000..63db502
--- /dev/null
+++ b/scripts/nn/layers/cross_entropy_loss.dml
@@ -0,0 +1,78 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Cross-Entropy loss function.
+ */
+
+forward = function(matrix[double] pred, matrix[double] y)
+ return (double loss) {
+ /*
+ * Computes the forward pass for a cross-entropy loss function. The
+ * inputs consist of N examples, each with K dimensions corresponding
+ * to normalized probabilities of K classes.
+ *
+ * ```
+ * L_i = -y_i^T * log(pred_i)
+ * L = (1/N) sum(L_i) for i=1 to N
+ * ```
+ *
+ * In these equations, `L` is the total loss, `L_i` is the loss for
+ * example `i`, `y_i` is the K-dimensional vector of target class
+ * probabilities, `pred_i` is the K-dimensional vector of predicted
+ * class probabilities, and `N` is the number of examples.
+ *
+ * This can be interpreted as the negative log-likelihood assuming
+ * a Bernoulli distribution generalized to K dimensions, or a
+ * Multinomial with one observation.
+ *
+ * Inputs:
+ * - pred: Predictions, of shape (N, K).
+ * - y: Targets, of shape (N, K).
+ *
+ * Outputs:
+ * - loss: Average loss.
+ */
+ N = nrow(y)
+ eps = 1e-10 # numerical stability to avoid log(0)
+ losses = rowSums(-y * log(pred+eps))
+ loss = sum(losses) / N
+}
+
+backward = function(matrix[double] pred, matrix[double] y)
+ return (matrix[double] dpred) {
+ /*
+ * Computes the backward pass of a cross-entropy loss function. The
+ * inputs consist of N examples, each with K dimensions corresponding
+ * to normalized probabilities of K classes.
+ *
+ * Inputs:
+ * - pred: Predictions, of shape (N, K).
+ * - y: Targets, of shape (N, K).
+ *
+ * Outputs:
+ * - dpred: Gradient wrt `pred`, of shape (N, K).
+ */
+ N = nrow(y)
+ eps = 1e-10 # numerical stability to avoid divide-by-zero
+ dpred = (1/N) * -y * (1/(pred+eps))
+}
+
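
A hypothetical sketch of the loss above in use; in practice `pred` would come from
a softmax layer so that each row is a valid probability distribution:

  source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss

  N = 4
  K = 3
  unnorm = rand(rows=N, cols=K, min=0.1, max=1)
  pred = unnorm / rowSums(unnorm)    # normalize rows to probabilities
  y = matrix(0, rows=N, cols=K)
  y[,1] = matrix(1, rows=N, cols=1)  # one-hot targets (all class 1 here)
  loss = cross_entropy_loss::forward(pred, y)
  dpred = cross_entropy_loss::backward(pred, y)
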
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/dropout.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/dropout.dml b/scripts/nn/layers/dropout.dml
new file mode 100644
index 0000000..a36878b
--- /dev/null
+++ b/scripts/nn/layers/dropout.dml
@@ -0,0 +1,76 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Dropout layer.
+ */
+
+forward = function(matrix[double] X, double p, int seed)
+ return (matrix[double] out, matrix[double] mask) {
+ /*
+ * Computes the forward pass for an inverted dropout layer.
+ *
+ * Drops each input element with probability 1-p (i.e. keeps it with
+ * probability p), and divides by p to maintain the expected values
+ * of those inputs (which are the outputs of neurons) at test time.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ * - p: Probability of keeping a neuron output.
+ * - seed: [Optional: -1] Random number generator seed to allow for
+ * deterministic evaluation. Set to -1 for a random seed.
+ *
+ * Outputs:
+ * - out: Outputs, of same shape as `X`.
+ * - mask: Dropout mask used to compute the output.
+ */
+ # Normally, we might use something like
+ # `mask = rand(rows=nrow(X), cols=ncol(X), min=0, max=1, seed=seed) <= p`
+ # to create a dropout mask. Fortunately, SystemML has a `sparsity` parameter on
+ # the `rand` function that allows us to create a mask directly.
+ if (seed == -1) {
+ mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p)
+ } else {
+ mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p, seed=seed)
+ }
+ out = X * mask / p
+}
+
+backward = function(matrix[double] dout, matrix[double] X, double p, matrix[double] mask)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for an inverted dropout layer.
+ *
+ * Applies the mask to the upstream gradient, and divides by p to
+ * maintain the expected values at test time.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out`, of same shape as `X`.
+ * - X: Inputs, of shape (any, any).
+ * - p: Probability of keeping a neuron output.
+ * - mask: Dropout mask used to compute the output.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of same shape as `X`.
+ */
+ dX = mask / p * dout
+}
+
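
A minimal hypothetical sketch of both passes (here p=0.5, with a fixed seed for
reproducibility):

  source("nn/layers/dropout.dml") as dropout

  X = rand(rows=4, cols=10)                 # hypothetical activations
  p = 0.5                                   # probability of keeping a unit
  [out, mask] = dropout::forward(X, p, 42)  # use seed=-1 for a random mask
  dout = rand(rows=4, cols=10)              # stand-in upstream gradient
  dX = dropout::backward(dout, X, p, mask)
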
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/l1_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/l1_loss.dml b/scripts/nn/layers/l1_loss.dml
new file mode 100644
index 0000000..b74566d
--- /dev/null
+++ b/scripts/nn/layers/l1_loss.dml
@@ -0,0 +1,72 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * L1 loss function.
+ */
+
+forward = function(matrix[double] pred, matrix[double] y)
+ return (double loss) {
+ /*
+ * Computes the forward pass for an L1 loss function. The inputs
+ * consist of N examples, each with M dimensions to predict.
+ *
+ * ```
+ * L_i = sum_j(abs((pred_i)_j - (y_i)_j)) for all j.
+ * L = (1/N) sum(L_i) for i=1 to N
+ * ```
+ *
+ * In these equations, `L` is the total loss, `L_i` is the loss for
+ * example `i`, `y_i` is the vector of M targets for example `i`,
+ * `pred_i` is the corresponding vector of M predictions, and `N`
+ * is the number of examples.
+ *
+ * This can be interpreted as the negative log-likelihood assuming
+ * a Laplace distribution.
+ *
+ * Inputs:
+ * - pred: Predictions, of shape (N, M).
+ * - y: Targets, of shape (N, M).
+ *
+ * Outputs:
+ * - loss: Average loss.
+ */
+ N = nrow(y)
+ losses = rowSums(abs(pred-y))
+ loss = sum(losses) / N
+}
+
+backward = function(matrix[double] pred, matrix[double] y)
+ return (matrix[double] dpred) {
+ /*
+ * Computes the backward pass for an L1 loss function. The inputs
+ * consist of N examples, each with M dimensions to predict.
+ *
+ * Inputs:
+ * - pred: Predictions, of shape (N, M).
+ * - y: Targets, of shape (N, M).
+ *
+ * Outputs:
+ * - dpred: Gradient wrt `pred`, of shape (N, M).
+ */
+ N = nrow(y)
+ dpred = sign(pred-y) / N
+}
+
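
A hypothetical sketch of both passes:

  source("nn/layers/l1_loss.dml") as l1_loss

  pred = rand(rows=4, cols=2)         # hypothetical predictions
  y = rand(rows=4, cols=2)            # hypothetical targets
  loss = l1_loss::forward(pred, y)
  dpred = l1_loss::backward(pred, y)  # sign(pred-y)/N
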
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/l1_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/l1_reg.dml b/scripts/nn/layers/l1_reg.dml
new file mode 100644
index 0000000..2b81c0b
--- /dev/null
+++ b/scripts/nn/layers/l1_reg.dml
@@ -0,0 +1,56 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * L1 regularization.
+ */
+
+forward = function(matrix[double] X, double lambda)
+ return (double reg_loss) {
+ /*
+ * Computes the forward pass for an L1 regularization function.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ * - lambda: Regularization strength.
+ * A typical value is 0.01.
+ *
+ * Outputs:
+ * - reg_loss: Total regularization loss.
+ */
+ reg_loss = lambda * sum(abs(X))
+}
+
+backward = function(matrix[double] X, double lambda)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for an L1 regularization function.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ * - lambda: Regularization strength.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of same shape as `X`.
+ */
+ dX = lambda * sign(X)
+}
+
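
Regularization losses and gradients are added to those of the data loss; a
hypothetical sketch:

  source("nn/layers/l1_reg.dml") as l1_reg

  W = rand(rows=10, cols=5)              # hypothetical weights
  lambda = 0.01
  reg_loss = l1_reg::forward(W, lambda)  # add to the data loss
  dW_reg = l1_reg::backward(W, lambda)   # add to the data-loss gradient wrt W
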
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/l2_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/l2_loss.dml b/scripts/nn/layers/l2_loss.dml
new file mode 100644
index 0000000..0482f25
--- /dev/null
+++ b/scripts/nn/layers/l2_loss.dml
@@ -0,0 +1,72 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * L2 loss function.
+ */
+
+forward = function(matrix[double] pred, matrix[double] y)
+ return (double loss) {
+ /*
+ * Computes the forward pass for an L2 loss function. The inputs
+ * consist of N examples, each with M dimensions to predict.
+ *
+ * ```
+ * L_i = (1/2) norm(pred_i - y_i)^2
+ * L = (1/N) sum(L_i) for i=1 to N
+ * ```
+ *
+ * In these equations, `L` is the total loss, `L_i` is the loss for
+ * example `i`, `y_i` is the vector of M targets for example `i`,
+ * `pred_i` is the corresponding vector of M predictions, and `N`
+ * is the number of examples.
+ *
+ * This can be interpreted as the negative log-likelihood assuming
+ * a Gaussian distribution.
+ *
+ * Inputs:
+ * - pred: Predictions, of shape (N, M).
+ * - y: Targets, of shape (N, M).
+ *
+ * Outputs:
+ * - loss: Average loss.
+ */
+ N = nrow(y)
+ losses = 0.5 * rowSums((pred-y)^2)
+ loss = sum(losses) / N
+}
+
+backward = function(matrix[double] pred, matrix[double] y)
+ return (matrix[double] dpred) {
+ /*
+ * Computes the backward pass for an L2 loss function. The inputs
+ * consist of N examples, each with M dimensions to predict.
+ *
+ * Inputs:
+ * - pred: Predictions, of shape (N, M).
+ * - y: Targets, of shape (N, M).
+ *
+ * Outputs:
+ * - dpred: Gradient wrt `pred`, of shape (N, M).
+ */
+ N = nrow(y)
+ dpred = (pred-y) / N
+}
+
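
A hypothetical sketch of both passes:

  source("nn/layers/l2_loss.dml") as l2_loss

  pred = rand(rows=4, cols=2)         # hypothetical predictions
  y = rand(rows=4, cols=2)            # hypothetical targets
  loss = l2_loss::forward(pred, y)    # mean over examples of (1/2)*||pred_i - y_i||^2
  dpred = l2_loss::backward(pred, y)  # (pred-y)/N
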
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/l2_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/l2_reg.dml b/scripts/nn/layers/l2_reg.dml
new file mode 100644
index 0000000..7255efe
--- /dev/null
+++ b/scripts/nn/layers/l2_reg.dml
@@ -0,0 +1,56 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * L2 regularization.
+ */
+
+forward = function(matrix[double] X, double lambda)
+ return (double reg_loss) {
+ /*
+ * Computes the forward pass for an L2 regularization function.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ * - lambda: Regularization strength.
+ * A typical value is 0.01.
+ *
+ * Outputs:
+ * - reg_loss: Total regularization loss.
+ */
+ reg_loss = 0.5 * lambda * sum(X^2)
+}
+
+backward = function(matrix[double] X, double lambda)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for an L2 regularization function.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ * - lambda: Regularization strength.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of same shape as `X`.
+ */
+ dX = lambda * X
+}
+
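
As with L1 regularization, the outputs are added to the data loss and its
gradient; a hypothetical sketch:

  source("nn/layers/l2_reg.dml") as l2_reg

  W = rand(rows=10, cols=5)            # hypothetical weights
  reg_loss = l2_reg::forward(W, 0.01)
  dW_reg = l2_reg::backward(W, 0.01)   # equals lambda*W
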
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/log_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/log_loss.dml b/scripts/nn/layers/log_loss.dml
new file mode 100644
index 0000000..15914f7
--- /dev/null
+++ b/scripts/nn/layers/log_loss.dml
@@ -0,0 +1,76 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Log loss function.
+ */
+
+forward = function(matrix[double] pred, matrix[double] y)
+ return (double loss) {
+ /*
+ * Computes the forward pass for a log loss function.
+ *
+ * ```
+ * L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i)
+ * L = (1/N) sum(L_i) for i=1 to N
+ * ```
+ *
+ * In these equations, `L` is the total loss, `L_i` is the loss for
+ * example `i`, `y_i` is the binary target, `pred_i` is the predicted
+ * probability of the true class (i.e. `y=1`), and `N` is the number
+ * of examples.
+ *
+ * This can be interpreted as the negative log-likelihood assuming
+ * a Bernoulli distribution.
+ *
+ * Inputs:
+ * - pred: Predictions, of shape (N, 1).
+ * Predictions should be probabilities of the true
+ * class (i.e. probability of `y=1`).
+ * - y: Targets, of shape (N, 1).
+ * Targets should be binary in the set {0, 1}.
+ *
+ * Outputs:
+ * - loss: Average loss.
+ */
+ N = nrow(y)
+ losses = -y*log(pred) - (1-y)*log(1-pred)
+ loss = sum(losses) / N
+}
+
+backward = function(matrix[double] pred, matrix[double] y)
+ return (matrix[double] dpred) {
+ /*
+ * Computes the backward pass for a log loss function.
+ *
+ * Inputs:
+ * - pred: Predictions, of shape (N, 1).
+ * Predictions should be probabilities of the true
+ * class (i.e. probability of `y=1`).
+ * - y: Targets, of shape (N, 1).
+ * Targets should be binary in the set {0, 1}.
+ *
+ * Outputs:
+ * - dpred: Gradient wrt `pred`, of shape (N, 1).
+ */
+ N = nrow(y)
+ dpred = (1/N) * (pred-y) / (pred*(1-pred))
+}
+
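
A hypothetical sketch; note that `pred` is kept away from exactly 0 and 1, since
the forward pass takes logs of both `pred` and `1-pred`:

  source("nn/layers/log_loss.dml") as log_loss

  N = 8
  pred = rand(rows=N, cols=1, min=0.05, max=0.95)  # probabilities of y=1
  y = round(rand(rows=N, cols=1))                  # binary targets in {0, 1}
  loss = log_loss::forward(pred, y)
  dpred = log_loss::backward(pred, y)
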
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/lstm.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/lstm.dml b/scripts/nn/layers/lstm.dml
new file mode 100644
index 0000000..a75add4
--- /dev/null
+++ b/scripts/nn/layers/lstm.dml
@@ -0,0 +1,260 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * LSTM layer.
+ */
+source("nn/layers/sigmoid.dml") as sigmoid
+source("nn/layers/tanh.dml") as tanh
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
+ boolean return_sequences, matrix[double] out0, matrix[double] c0)
+ return (matrix[double] out, matrix[double] c,
+ matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog) {
+ /*
+ * Computes the forward pass for an LSTM layer with M neurons.
+ * The input data has N sequences, each of length T with D features
+ * per timestep.
+ *
+ * In an LSTM, an internal cell state is maintained, additive
+ * interactions operate over the cell state at each timestep, and
+ * some amount of this cell state is exposed as output at each
+ * timestep. Additionally, the output of the previous timestep is fed
+ * back in as an additional input at the current timestep.
+ *
+ * Reference:
+ * - Long Short-Term Memory, S. Hochreiter & J. Schmidhuber, 1997
+ * - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, T*D).
+ * - W: Weights, of shape (D+M, 4M).
+ * - b: Biases, of shape (1, 4M).
+ * - T: Length of example sequences (number of timesteps).
+ * - D: Dimensionality of the input features (number of features).
+ * - return_sequences: Whether to return `out` at all timesteps,
+ * or just for the final timestep.
+ * - out0: Outputs from previous timestep, of shape (N, M).
+ * Note: This is *optional* and could just be an empty matrix.
+ * - c0: Initial cell state, of shape (N, M).
+ * Note: This is *optional* and could just be an empty matrix.
+ *
+ * Outputs:
+ * - out: If `return_sequences` is True, outputs for all timesteps,
+ * of shape (N, T*M). Else, outputs for the final timestep, of
+ * shape (N, M).
+ * - c: Cell state for final timestep, of shape (N, M).
+ * - cache_out: Cache of outputs, of shape (T, N*M).
+ * Note: This is used for performance during training.
+ * - cache_c: Cache of cell state, of shape (T, N*M).
+ * Note: This is used for performance during training.
+ * - cache_ifog: Cache of intermediate values, of shape (T, N*4M).
+ * Note: This is used for performance during training.
+ */
+ N = nrow(X)
+ M = as.integer(ncol(W)/4)
+ out_prev = out0
+ c_prev = c0
+ c = c_prev
+ if (return_sequences) {
+ out = matrix(0, rows=N, cols=T*M)
+ }
+ else {
+ out = matrix(0, rows=N, cols=M)
+ }
+ # caches to be used during the backward pass for performance
+ cache_out = matrix(0, rows=T, cols=N*M)
+ cache_c = matrix(0, rows=T, cols=N*M)
+ cache_ifog = matrix(0, rows=T, cols=N*4*M)
+
+ for (t in 1:T) { # each timestep
+ X_t = X[,(t-1)*D+1:t*D] # shape (N, D)
+ input = cbind(X_t, out_prev) # shape (N, D+M)
+ ifog = input %*% W + b # input, forget, output, and g gates; shape (N, 4M)
+ tmp = sigmoid::forward(ifog[,1:3*M]) # i,f,o gates squashed with sigmoid
+ ifog[,1:3*M] = tmp
+ tmp = tanh::forward(ifog[,3*M+1:4*M]) # g gate squashed with tanh
+ ifog[,3*M+1:4*M] = tmp
+ # c_t = f*prev_c + i*g
+ c = ifog[,M+1:2*M]*c_prev + ifog[,1:M]*ifog[,3*M+1:4*M] # shape (N, M)
+ # out_t = o*tanh(c)
+ tmp = tanh::forward(c)
+ out_t = ifog[,2*M+1:3*M] * tmp # shape (N, M)
+
+ # store
+ if (return_sequences) {
+ out[,(t-1)*M+1:t*M] = out_t
+ }
+ else {
+ out = out_t
+ }
+ out_prev = out_t
+ c_prev = c
+ cache_out[t,] = matrix(out_t, rows=1, cols=N*M) # reshape
+ cache_c[t,] = matrix(c, rows=1, cols=N*M) # reshape
+ cache_ifog[t,] = matrix(ifog, rows=1, cols=N*4*M) # reshape
+ }
+}
+
+backward = function(matrix[double] dout, matrix[double] dc,
+ matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
+ boolean given_sequences, matrix[double] out0, matrix[double] c0,
+ matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog)
+ return (matrix[double] dX, matrix[double] dW, matrix[double] db,
+ matrix[double] dout0, matrix[double] dc0) {
+ /*
+ * Computes the backward pass for an LSTM layer with M neurons.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out`. If `given_sequences` is `True`,
+ * contains gradients on outputs for all timesteps, of
+ * shape (N, T*M). Else, contains the gradient on the output
+ * for the final timestep, of shape (N, M).
+ * - dc: Gradient wrt `c` (from later in time), of shape (N, M).
+ * This would come from later in time if the cell state was used
+ * downstream as the initial cell state for another LSTM layer.
+ * Typically, this would be used when a sequence was cut at
+ * timestep `T` and then continued in the next batch. If `c`
+ * was not used downstream, then `dc` would be an empty matrix.
+ * - X: Inputs, of shape (N, T*D).
+ * - W: Weights, of shape (D+M, 4M).
+ * - b: Biases, of shape (1, 4M).
+ * - T: Length of example sequences (number of timesteps).
+ * - D: Dimensionality of the input features.
+ * - given_sequences: Whether `dout` is for all timesteps,
+ * or just for the final timestep. This is based on whether
+ * `return_sequences` was true in the forward pass.
+ * - out0: Outputs from previous timestep, of shape (N, M).
+ * Note: This is *optional* and could just be an empty matrix.
+ * - c0: Initial cell state, of shape (N, M).
+ * Note: This is *optional* and could just be an empty matrix.
+ * - cache_out: Cache of outputs, of shape (T, N*M).
+ * Note: This is used for performance during training.
+ * - cache_c: Cache of cell state, of shape (T, N*M).
+ * Note: This is used for performance during training.
+ * - cache_ifog: Cache of intermediate values, of shape (T, N*4*M).
+ * Note: This is used for performance during training.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, T*D).
+ * - dW: Gradient wrt `W`, of shape (D+M, 4M).
+ * - db: Gradient wrt `b`, of shape (1, 4M).
+ * - dout0: Gradient wrt `out0`, of shape (N, M).
+ * - dc0: Gradient wrt `c0`, of shape (N, M).
+ */
+ N = nrow(X)
+ M = as.integer(ncol(W)/4)
+ dX = matrix(0, rows=N, cols=T*D)
+ dW = matrix(0, rows=D+M, cols=4*M)
+ db = matrix(0, rows=1, cols=4*M)
+ dout0 = matrix(0, rows=N, cols=M)
+ dc0 = matrix(0, rows=N, cols=M)
+ dct = dc
+ if (!given_sequences) {
+ # only given dout for output at final timestep, so prepend empty douts for all other timesteps
+ dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout) # shape (N, T*M)
+ }
+
+ t = T
+ for (iter in 1:T) { # each timestep in reverse order
+ X_t = X[,(t-1)*D+1:t*D] # shape (N, D)
+ dout_t = dout[,(t-1)*M+1:t*M] # shape (N, M)
+ out_t = matrix(cache_out[t,], rows=N, cols=M) # shape (N, M)
+ ct = matrix(cache_c[t,], rows=N, cols=M) # shape (N, M)
+ if (t == 1) {
+ out_prev = out0 # shape (N, M)
+ c_prev = c0 # shape (N, M)
+ }
+ else {
+ out_prev = matrix(cache_out[t-1,], rows=N, cols=M) # shape (N, M)
+ c_prev = matrix(cache_c[t-1,], rows=N, cols=M) # shape (N, M)
+ }
+ input = cbind(X_t, out_prev) # shape (N, D+M)
+ ifog = matrix(cache_ifog[t,], rows=N, cols=4*M)
+ i = ifog[,1:M] # input gate, shape (N, M)
+ f = ifog[,M+1:2*M] # forget gate, shape (N, M)
+ o = ifog[,2*M+1:3*M] # output gate, shape (N, M)
+ g = ifog[,3*M+1:4*M] # g gate, shape (N, M)
+
+ tmp = tanh::backward(dout_t, ct)
+ dct = dct + o*tmp # shape (N, M)
+ tmp = tanh::forward(ct)
+ do = tmp * dout_t # output gate, shape (N, M)
+ df = c_prev * dct # forget gate, shape (N, M)
+ dc_prev = f * dct # shape (N, M)
+ di = g * dct # input gate, shape (N, M)
+ dg = i * dct # g gate, shape (N, M)
+
+ di_raw = i * (1-i) * di
+ df_raw = f * (1-f) * df
+ do_raw = o * (1-o) * do
+ dg_raw = (1-g^2) * dg
+ difog_raw = cbind(di_raw, cbind(df_raw, cbind(do_raw, dg_raw))) # shape (N, 4M)
+
+ dW = dW + t(input) %*% difog_raw # shape (D+M, 4M)
+ db = db + colSums(difog_raw) # shape (1, 4M)
+ dinput = difog_raw %*% t(W) # shape (N, D+M)
+ dX[,(t-1)*D+1:t*D] = dinput[,1:D]
+ dout_prev = dinput[,D+1:D+M] # shape (N, M)
+ if (t == 1) {
+ dout0 = dout_prev # shape (N, M)
+ dc0 = dc_prev # shape (N, M)
+ }
+ else {
+ dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev # shape (N, M)
+ dct = dc_prev # shape (N, M)
+ }
+ t = t - 1
+ }
+}
+
+init = function(int N, int D, int M)
+ return (matrix[double] W, matrix[double] b, matrix[double] out0, matrix[double] c0) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * We use the Glorot uniform heuristic which limits the magnification
+ * of inputs/gradients during forward/backward passes by scaling
+ * uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
+ * - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
+ *
+ * Inputs:
+ * - N: Number of examples in batch.
+ * - D: Dimensionality of the input features (number of features).
+ * - M: Number of neurons in this layer.
+ *
+ * Outputs:
+ * - W: Weights, of shape (D+M, 4M).
+ * - b: Biases, of shape (1, 4M).
+ * - out0: Empty previous timestep output matrix, of shape (N, M).
+ * - c0: Empty initial cell state matrix, of shape (N, M).
+ */
+ fan_in = D+M
+ fan_out = 4*M
+ scale = sqrt(6/(fan_in+fan_out))
+ W = rand(rows=D+M, cols=4*M, min=-scale, max=scale, pdf="uniform")
+ b = matrix(0, rows=1, cols=4*M)
+ out0 = matrix(0, rows=N, cols=M)
+ c0 = matrix(0, rows=N, cols=M)
+}
+
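
A hypothetical sketch of the forward pass over a batch of sequences:

  source("nn/layers/lstm.dml") as lstm

  N = 2  # hypothetical batch/sequence sizes
  T = 5
  D = 4
  M = 3
  X = rand(rows=N, cols=T*D)
  [W, b, out0, c0] = lstm::init(N, D, M)
  [out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, TRUE,
                                                           out0, c0)
  # out has shape (N, T*M) since return_sequences=TRUE; c is the final cell state (N, M)
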
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/max_pool2d.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/max_pool2d.dml b/scripts/nn/layers/max_pool2d.dml
new file mode 100644
index 0000000..fba1a4c
--- /dev/null
+++ b/scripts/nn/layers/max_pool2d.dml
@@ -0,0 +1,159 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Max Pooling layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] out, int Hout, int Wout) {
+ /*
+ * Computes the forward pass for a 2D spatial max pooling layer.
+ * The input data has N examples, each represented as a 3D volume
+ * unrolled into a single vector.
+ *
+ * This implementation uses `im2col` internally for each image to
+ * extract local image regions (patches) of each channel slice into
+ * columns, and then performs max pooling over the patches to compute
+ * the output maps.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * A typical value is 0.
+ * - padw: Padding for left and right sides.
+ * A typical value is 0.
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, C*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ */
+ N = nrow(X)
+ Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+ Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+ pad_value = -1/0 # in max pooling we pad with -infinity
+
+ # Create output volume
+ out = matrix(0, rows=N, cols=C*Hout*Wout)
+
+ # Max pooling - im2col implementation
+ parfor (n in 1:N) { # all examples
+ img = matrix(X[n,], rows=C, cols=Hin*Win) # reshape
+
+ if (padh > 0 | padw > 0) {
+ # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
+ img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
+ }
+
+ img_maxes = matrix(0, rows=C, cols=Hout*Wout) # zeros
+ parfor (c in 1:C) { # all channels
+ # Extract local image slice patches into columns with im2col, of shape (Hf*Wf, Hout*Wout)
+ img_slice_cols = util::im2col(img[c,], Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
+
+ # Max pooling on patches
+ img_maxes[c,] = colMaxs(img_slice_cols)
+ }
+
+ out[n,] = matrix(img_maxes, rows=1, cols=C*Hout*Wout)
+ }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
+ int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for a 2D spatial max pooling layer.
+ * The input data has N examples, each represented as a 3D volume
+ * unrolled into a single vector.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of
+ * shape (N, C*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ * - X: Input data matrix, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * A typical value is 0.
+ * - padw: Padding for left and right sides.
+ * A typical value is 0.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ */
+ N = nrow(X)
+ pad_value = -1/0 # in max pooling we pad with -infinity
+
+ # Create gradient volume
+ dX = matrix(0, rows=N, cols=C*Hin*Win)
+
+ # Gradient of max pooling
+ parfor (n in 1:N, check=0) { # all examples
+ img = matrix(X[n,], rows=C, cols=Hin*Win)
+ if (padh > 0 | padw > 0) {
+ # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
+ img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
+ }
+
+ dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
+ parfor (c in 1:C, check=0) { # all channels
+ img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
+ dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
+ for (hout in 1:Hout, check=0) { # all output rows
+ hin = (hout-1)*strideh + 1
+ for (wout in 1:Wout) { # all output columns
+ win = (wout-1)*stridew + 1
+ img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
+ max_val_ind = img_slice_patch == max(img_slice_patch) # max value indicator matrix
+ # gradient passes through only for the max value(s) in this patch
+ dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
+ dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
+ + dimg_slice_patch
+ }
+ }
+ dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
+ }
+
+ if (padh > 0 | padw > 0) {
+ # Unpad image gradient
+ dimg = util::unpad_image(dimg, Hin, Win, padh, padw) # shape (C, Hin*Win)
+ }
+ dX[n,] = matrix(dimg, rows=1, cols=C*Hin*Win)
+ }
+}
+
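For illustration, a minimal usage sketch of this layer (not part of the patch; the shapes are arbitrary, and the `nn/` source path assumes the script is run from the `scripts/` directory):

    source("nn/layers/max_pool2d.dml") as max_pool2d

    # 4 examples of 3-channel 28x28 inputs, pooled with a 2x2 window at stride 2
    N = 4
    C = 3
    Hin = 28
    Win = 28
    X = rand(rows=N, cols=C*Hin*Win)
    [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
    # Hout = floor((28 + 2*0 - 2)/2 + 1) = 14, and likewise Wout = 14
    dout = rand(rows=N, cols=C*Hout*Wout)  # stand-in for an upstream gradient
    dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, 2, 2, 2, 2, 0, 0)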
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/max_pool2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/max_pool2d_builtin.dml b/scripts/nn/layers/max_pool2d_builtin.dml
new file mode 100644
index 0000000..880f818
--- /dev/null
+++ b/scripts/nn/layers/max_pool2d_builtin.dml
@@ -0,0 +1,103 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Max Pooling layer.
+ *
+ * This implementation uses a built-in operator for higher performance.
+ */
+
+forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] out, int Hout, int Wout) {
+ /*
+ * Computes the forward pass for a 2D spatial max pooling layer.
+ * The input data has N examples, each represented as a 3D volume
+ * unrolled into a single vector.
+ *
+ * This implementation uses a built-in operator for higher
+ * performance.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * A typical value is 0.
+ * - padw: Padding for left and right sides.
+ * A typical value is 0.
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, C*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ */
+ N = nrow(X)
+ Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
+ Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
+
+ # Max pooling - built-in implementation
+ out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
+ stride=[strideh,stridew], padding=[padh,padw])
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
+ int C, int Hin, int Win, int Hf, int Wf,
+ int strideh, int stridew, int padh, int padw)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for a 2D spatial max pooling layer.
+ * The input data has N examples, each represented as a 3D volume
+ * unrolled into a single vector.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of
+ * shape (N, C*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Hf: Filter height.
+ * - Wf: Filter width.
+ * - strideh: Stride over height.
+ * - stridew: Stride over width.
+ * - padh: Padding for top and bottom sides.
+ * A typical value is 0.
+ * - padw: Padding for left and right sides.
+ * A typical value is 0.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ */
+ N = nrow(X)
+
+ # Gradient of max pooling
+ dX = max_pool_backward(X, dout, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
+ stride=[strideh,stridew], padding=[padh,padw])
+}
+
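Since this file exposes the same `forward`/`backward` signatures as `max_pool2d.dml`, swapping in the built-in variant is a one-line change to the `source` statement. A self-contained sketch under the same assumptions as above (not part of the patch):

    source("nn/layers/max_pool2d_builtin.dml") as max_pool2d

    N = 4
    C = 3
    Hin = 28
    Win = 28
    X = rand(rows=N, cols=C*Hin*Win)
    # identical call sites; only the source line above changes
    [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
    dout = rand(rows=N, cols=C*Hout*Wout)
    dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, 2, 2, 2, 2, 0, 0)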
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/relu.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/relu.dml b/scripts/nn/layers/relu.dml
new file mode 100644
index 0000000..93a6e90
--- /dev/null
+++ b/scripts/nn/layers/relu.dml
@@ -0,0 +1,59 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Rectified Linear Unit (ReLU) nonlinearity layer.
+ */
+
+forward = function(matrix[double] X)
+ return (matrix[double] out) {
+ /*
+ * Computes the forward pass for a ReLU nonlinearity layer.
+ *
+ * Performs an element-wise evaluation of `f(input) = max(0, input)`.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ *
+ * Outputs:
+ * - out: Outputs, of same shape as `X`.
+ */
+ out = max(X, 0)
+}
+
+backward = function(matrix[double] dout, matrix[double] X)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for a ReLU nonlinearity layer.
+ *
+ * Essentially performs a pass-through of the upstream gradient
+ * for cells whose input was > 0.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+ * - X: Previous input data matrix, of shape (any, any).
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of same shape as `X`.
+ */
+ dX = (X > 0) * dout
+}
+
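A tiny illustrative sketch of the forward/backward pair (not part of the patch): negative inputs are zeroed on the forward pass, and the same mask gates the upstream gradient on the backward pass.

    source("nn/layers/relu.dml") as relu

    X = matrix("-1 2 -3 4", rows=2, cols=2)
    out = relu::forward(X)            # [[0, 2], [0, 4]]
    dout = matrix(1, rows=2, cols=2)  # pretend upstream gradient of all ones
    dX = relu::backward(dout, X)      # [[0, 1], [0, 1]]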
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/rnn.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/rnn.dml b/scripts/nn/layers/rnn.dml
new file mode 100644
index 0000000..3c6faae
--- /dev/null
+++ b/scripts/nn/layers/rnn.dml
@@ -0,0 +1,183 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Simple (Vanilla) RNN layer.
+ */
+source("nn/layers/tanh.dml") as tanh
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
+ boolean return_sequences, matrix[double] out0)
+ return (matrix[double] out, matrix[double] cache_out) {
+ /*
+ * Computes the forward pass for a simple RNN layer with M neurons.
+ * The input data has N sequences of T examples, each with D features.
+ *
+ * In a simple RNN, the output of the previous timestep is fed back
+ * in as an additional input at the current timestep.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, T*D).
+ * - W: Weights, of shape (D+M, M).
+ * - b: Biases, of shape (1, M).
+ * - T: Length of example sequences (number of timesteps).
+ * - D: Dimensionality of the input features (number of features).
+ * - return_sequences: Whether to return `out` at all timesteps,
+ * or just for the final timestep.
+ * - out0: Output matrix from previous timestep, of shape (N, M).
+ * Note: This is *optional* and could just be a matrix of zeros.
+ *
+ * Outputs:
+ * - out: If `return_sequences` is True, outputs for all timesteps,
+ * of shape (N, T*M). Else, outputs for the final timestep, of
+ * shape (N, M).
+ * - cache_out: Cache of outputs, of shape (T, N*M).
+ * Note: This is used for performance during training.
+ */
+ N = nrow(X)
+ M = ncol(W)
+ out_prev = out0
+ if (return_sequences) {
+ out = matrix(0, rows=N, cols=T*M)
+ }
+ else {
+ out = matrix(0, rows=N, cols=M)
+ }
+ # caches to be used during the backward pass for performance
+ cache_out = matrix(0, rows=T, cols=N*M)
+
+ for (t in 1:T) { # each timestep
+ X_t = X[,(t-1)*D+1:t*D] # shape (N, D)
+ input = cbind(X_t, out_prev) # shape (N, D+M)
+ out_t = tanh::forward(input %*% W + b) # shape (N, M)
+ # store
+ if (return_sequences) {
+ out[,(t-1)*M+1:t*M] = out_t
+ }
+ else {
+ out = out_t
+ }
+ out_prev = out_t
+ cache_out[t,] = matrix(out_t, rows=1, cols=N*M) # reshape
+ }
+}
+
+backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, matrix[double] b,
+ int T, int D, boolean given_sequences, matrix[double] out0,
+ matrix[double] cache_out)
+ return (matrix[double] dX, matrix[double] dW, matrix[double] db, matrix[double] dout0) {
+ /*
+ * Computes the backward pass for a simple RNN layer with M neurons.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream. If `given_sequences`
+ * is True, contains gradients on outputs for all timesteps,
+ * of shape (N, T*M). Else, contains gradient on output for
+ * the final timestep, of shape (N, M).
+ * - X: Inputs, of shape (N, T*D).
+ * - W: Weights, of shape (D+M, M).
+ * - b: Biases, of shape (1, M).
+ * - T: Length of example sequences (number of timesteps).
+ * - D: Dimensionality of the input features (number of features).
+ * - given_sequences: Whether `dout` is for all timesteps,
+ * or just for the final timestep. This is based on whether
+ * `return_sequences` was true in the forward pass.
+ * - out0: Output matrix from previous timestep, of shape (N, M).
+ * Note: This is *optional* and could just be a matrix of zeros.
+ * - cache_out: Cache of outputs, of shape (T, N*M).
+ * Note: This is used for performance during training.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, T*D).
+ * - dW: Gradient wrt `W`, of shape (D+M, M).
+ * - db: Gradient wrt `b`, of shape (1, M).
+ * - dout0: Gradient wrt `out0`, of shape (N, M).
+ */
+ N = nrow(X)
+ M = ncol(W)
+ dX = matrix(0, rows=N, cols=T*D)
+ dW = matrix(0, rows=D+M, cols=M)
+ db = matrix(0, rows=1, cols=M)
+ dout0 = matrix(0, rows=N, cols=M)
+ if (!given_sequences) {
+ # only given dout for the output at the final timestep, so prepend zero douts for all other timesteps
+ dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout) # shape (N, T*M)
+ }
+
+ t = T
+ for (iter in 1:T) { # each timestep in reverse order
+ X_t = X[,(t-1)*D+1:t*D] # shape (N, D)
+ dout_t = dout[,(t-1)*M+1:t*M] # shape (N, M)
+ out_t = matrix(cache_out[t,], rows=N, cols=M) # shape (N, M)
+ if (t == 1) {
+ out_prev = out0 # shape (N, M)
+ }
+ else {
+ out_prev = matrix(cache_out[t-1,], rows=N, cols=M) # shape (N, M)
+ }
+ input = cbind(X_t, out_prev) # shape (N, D+M)
+ dout_t_raw = (1-out_t^2) * dout_t # gradient through tanh, shape (N, M)
+ dW = dW + t(input) %*% dout_t_raw # shape (D+M, M)
+ db = db + colSums(dout_t_raw) # shape (1, M)
+ dinput = dout_t_raw %*% t(W) # shape (N, D+M)
+ dX[,(t-1)*D+1:t*D] = dinput[,1:D]
+ dout_prev = dinput[,D+1:D+M] # shape (N, M)
+ if (t == 1) {
+ dout0 = dout_prev # shape (N, M)
+ }
+ else {
+ dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev # shape (N, M)
+ }
+ t = t - 1
+ }
+}
+
+init = function(int N, int D, int M)
+ return (matrix[double] W, matrix[double] b, matrix[double] out0) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * We use the Glorot uniform heuristic which limits the magnification
+ * of inputs/gradients during forward/backward passes by scaling
+ * uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
+ * - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
+ *
+ * Inputs:
+ * - N: Number of examples in batch.
+ * - D: Dimensionality of the input features (number of features).
+ * - M: Number of neurons in this layer.
+ *
+ * Outputs:
+ * - W: Weights, of shape (D+M, M).
+ * - b: Biases, of shape (1, M).
+ * - out0: Empty previous timestep output matrix, of shape (N, M).
+ */
+ fan_in = D+M
+ fan_out = M
+ scale = sqrt(6/(fan_in+fan_out))
+ W = rand(rows=D+M, cols=M, min=-scale, max=scale, pdf="uniform")
+ b = matrix(0, rows=1, cols=M)
+ out0 = matrix(0, rows=N, cols=M)
+}
+
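An illustrative end-to-end sketch of the layer (not part of the patch; N, T, D, M are arbitrary, and the `nn/` source path assumes execution from `scripts/`):

    source("nn/layers/rnn.dml") as rnn

    N = 2   # examples
    T = 5   # timesteps
    D = 8   # input features per timestep
    M = 4   # neurons
    X = rand(rows=N, cols=T*D)
    [W, b, out0] = rnn::init(N, D, M)
    [out, cache_out] = rnn::forward(X, W, b, T, D, TRUE, out0)  # out: (N, T*M)
    dout = rand(rows=N, cols=T*M)  # stand-in for an upstream gradient
    [dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, TRUE, out0, cache_out)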
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/scale_shift1d.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/scale_shift1d.dml b/scripts/nn/layers/scale_shift1d.dml
new file mode 100644
index 0000000..7e162a3
--- /dev/null
+++ b/scripts/nn/layers/scale_shift1d.dml
@@ -0,0 +1,95 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 1D Scale & Shift layer.
+ */
+
+forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta)
+ return (matrix[double] out) {
+ /*
+ * Computes the forward pass for a 1D scale & shift layer. The input
+ * data has N examples, each with D features.
+ *
+ * A 1D scale & shift layer introduces learnable parameters
+ * (gamma, beta) to scale and shift the input on a per-feature basis.
+ *
+ * `y = x*gamma + beta`
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, D).
+ * - gamma: Scale parameters, of shape (1, D).
+ * - beta: Shift parameters, of shape (1, D).
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, D).
+ */
+ # Scale and shift
+ out = X*gamma + beta # shape (N, D)
+}
+
+backward = function(matrix[double] dout, matrix[double] out,
+ matrix[double] X, matrix[double] gamma, matrix[double] beta)
+ return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
+ /*
+ * Computes the backward pass for a 1D scale & shift layer.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of shape (N, D).
+ * - out: Outputs from the forward pass, of shape (N, D).
+ * - X: Inputs, of shape (N, D).
+ * - gamma: Scale parameters, of shape (1, D).
+ * - beta: Shift parameters, of shape (1, D).
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, D).
+ * - dgamma: Gradient wrt `gamma`, of shape (1, D).
+ * - dbeta: Gradient wrt `beta`, of shape (1, D).
+ */
+ # Compute gradients during training
+ dgamma = colSums(dout*X) # shape (1, D)
+ dbeta = colSums(dout) # shape (1, D)
+ dX = dout * gamma # shape (N, D)
+}
+
+init = function(int D)
+ return (matrix[double] gamma, matrix[double] beta) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * By default, we initialize to an identity function, with a scale
+ * filler of `1`, and a shift filler of `0`.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - D: Dimensionality of the input features (number of features).
+ *
+ * Outputs:
+ * - gamma: Scale parameters, of shape (1, D).
+ * - beta: Shift parameters, of shape (1, D).
+ */
+ gamma = matrix(1, rows=1, cols=D)
+ beta = matrix(0, rows=1, cols=D)
+}
+
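Illustrative sketch (not part of the patch): under the default identity initialization the layer is a no-op, which makes for a handy sanity check.

    source("nn/layers/scale_shift1d.dml") as scale_shift1d

    X = rand(rows=3, cols=4)
    [gamma, beta] = scale_shift1d::init(4)        # gamma=1, beta=0
    out = scale_shift1d::forward(X, gamma, beta)  # equals X under the identity init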
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/scale_shift2d.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/scale_shift2d.dml b/scripts/nn/layers/scale_shift2d.dml
new file mode 100644
index 0000000..79c884a
--- /dev/null
+++ b/scripts/nn/layers/scale_shift2d.dml
@@ -0,0 +1,107 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Scale & Shift layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
+ int C, int Hin, int Win)
+ return (matrix[double] out) {
+ /*
+ * Computes the forward pass for a 2D scale & shift layer. The input
+ * data has N examples, each represented as a 3D volume unrolled into
+ * a single vector.
+ *
+ * A 2D scale & shift layer introduces learnable parameters
+ * (gamma, beta) to scale and shift the input on a per-channel basis.
+ *
+ * `y = x*gamma + beta`
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, C*Hin*Win).
+ */
+ # Scale and shift
+ scaled = bias_multiply(X, gamma) # shape (N, C*Hin*Win)
+ out = bias_add(scaled, beta) # shape (N, C*Hin*Win)
+}
+
+backward = function(matrix[double] dout, matrix[double] out,
+ matrix[double] X, matrix[double] gamma, matrix[double] beta,
+ int C, int Hin, int Win)
+ return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
+ /*
+ * Computes the backward pass for a 2D scale & shift layer.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
+ * - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
+ * - X: Input data matrix to the forward pass, of
+ * shape (N, C*Hin*Win).
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ * - dgamma: Gradient wrt `gamma`, of shape (C, 1).
+ * - dbeta: Gradient wrt `beta`, of shape (C, 1).
+ */
+ # Compute gradients during training
+ dgamma = util::channel_sums(dout*X, C, Hin, Win) # shape (C, 1)
+ dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1)
+ dX = bias_multiply(dout, gamma) # shape (N, C*Hin*Win)
+}
+
+init = function(int C)
+ return (matrix[double] gamma, matrix[double] beta) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * By default, we initialize to an identity function, with a scale
+ * filler of `1`, and a shift filler of `0`.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - C: Number of input channels (dimensionality of input depth).
+ *
+ * Outputs:
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ */
+ gamma = matrix(1, rows=C, cols=1)
+ beta = matrix(0, rows=C, cols=1)
+}
+
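The 2D variant applies one (gamma, beta) pair per channel, broadcast across all Hin*Win spatial positions. An illustrative sketch (not part of the patch; shapes are arbitrary):

    source("nn/layers/scale_shift2d.dml") as scale_shift2d

    C = 3
    Hin = 8
    Win = 8
    X = rand(rows=2, cols=C*Hin*Win)
    [gamma, beta] = scale_shift2d::init(C)
    out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)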
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/sigmoid.dml
----------------------------------------------------------------------
diff --git a/scripts/nn/layers/sigmoid.dml b/scripts/nn/layers/sigmoid.dml
new file mode 100644
index 0000000..2d85adc
--- /dev/null
+++ b/scripts/nn/layers/sigmoid.dml
@@ -0,0 +1,62 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Sigmoid nonlinearity layer.
+ */
+
+forward = function(matrix[double] X)
+ return (matrix[double] out) {
+ /*
+ * Computes the forward pass for a sigmoid nonlinearity layer.
+ *
+ * `sigmoid(x) = 1 / (1 + e^-x)`
+ *
+ * If `X` contains a single feature column, the output of a sigmoid
+ * layer can be interpreted as a predicted probability of a true
+ * class when paired with a log loss function in a binary
+ * classification problem.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ *
+ * Outputs:
+ * - out: Outputs, of same shape as `X`.
+ */
+ out = 1 / (1+exp(-X))
+}
+
+backward = function(matrix[double] dout, matrix[double] X)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for a sigmoid nonlinearity layer.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+ * - X: Inputs, of shape (any, any).
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of same shape as `X`.
+ */
+ out = 1 / (1+exp(-X))
+ dX = out * (1-out) * dout
+}
+
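Illustrative sketch (not part of the patch): the backward pass recomputes the forward activation and applies the identity `sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))`.

    source("nn/layers/sigmoid.dml") as sigmoid

    X = rand(rows=2, cols=3, min=-1, max=1)
    out = sigmoid::forward(X)
    dout = matrix(1, rows=2, cols=3)  # pretend upstream gradient of all ones
    dX = sigmoid::backward(dout, X)   # equals out * (1 - out)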