Posted to commits@systemml.apache.org by du...@apache.org on 2017/04/26 21:42:32 UTC
[06/11] incubator-systemml git commit: [SYSTEMML-1524] Graduate `nn`
library to `scripts/nn`
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
deleted file mode 100644
index a529a12..0000000
--- a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml
+++ /dev/null
@@ -1,178 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * MNIST Softmax Example
- */
-# Imports
-source("nn/layers/affine.dml") as affine
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/softmax.dml") as softmax
-source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
-
-train = function(matrix[double] X, matrix[double] y,
- matrix[double] X_val, matrix[double] y_val,
- int epochs)
- return (matrix[double] W, matrix[double] b) {
- /*
- * Trains a softmax classifier.
- *
- * The input matrix, X, has N examples, each with D features.
- * The targets, y, have K classes, and are one-hot encoded.
- *
- * Inputs:
- * - X: Input data matrix, of shape (N, D).
- * - y: Target matrix, of shape (N, K).
- * - X_val: Input validation data matrix, of shape (N, D).
- * - y_val: Target validation matrix, of shape (N, K).
- * - epochs: Total number of full training passes over the data set.
- *
- * Outputs:
- * - W: Weights (parameters) matrix, of shape (D, K).
- * - b: Biases vector, of shape (1, K).
- */
- N = nrow(X) # num examples
- D = ncol(X) # num features
- K = ncol(y) # num classes
-
- # Create softmax classifier:
- # affine -> softmax
- [W, b] = affine::init(D, K)
- # Replace the sqrt(2/D) scaling from He et al. initialization with a
- # simple sqrt(1/D) scaling
- W = W / sqrt(2.0/(D)) * sqrt(1/(D))
-
- # Initialize SGD w/ Nesterov momentum optimizer
- lr = 0.2 # learning rate
- mu = 0 # momentum
- decay = 0.99 # learning rate decay constant
- vW = sgd_nesterov::init(W) # optimizer momentum state for W
- vb = sgd_nesterov::init(b) # optimizer momentum state for b
-
- # Optimize
- print("Starting optimization")
- batch_size = 50
- iters = 1000  # fixed number of iterations per epoch; alternatively, ceil(N / batch_size)
- for (e in 1:epochs) {
- for(i in 1:iters) {
- # Get next batch
- beg = ((i-1) * batch_size) %% N + 1
- end = min(N, beg + batch_size - 1)
- X_batch = X[beg:end,]
- y_batch = y[beg:end,]
-
- # Compute forward pass
- ## affine & softmax:
- out = affine::forward(X_batch, W, b)
- probs = softmax::forward(out)
-
- # Compute loss & accuracy for training & validation data
- loss = cross_entropy_loss::forward(probs, y_batch)
- accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
- probs_val = predict(X_val, W, b)
- loss_val = cross_entropy_loss::forward(probs_val, y_val)
- accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))
- print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: " +
- accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
-
- # Compute backward pass
- ## loss:
- dprobs = cross_entropy_loss::backward(probs, y_batch)
- ## affine & softmax:
- dout = softmax::backward(dprobs, out)
- [dX_batch, dW, db] = affine::backward(dout, X_batch, W, b)
-
- # Optimize with SGD w/ Nesterov momentum
- [W, vW] = sgd_nesterov::update(W, dW, lr, mu, vW)
- [b, vb] = sgd_nesterov::update(b, db, lr, mu, vb)
- }
- # Anneal momentum towards 0.999
- mu = mu + (0.999 - mu)/(1+epochs-e)
- # Decay learning rate
- lr = lr * decay
- }
-}
-
-predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
- return (matrix[double] probs) {
- /*
- * Computes the class probability predictions of a softmax classifier.
- *
- * The input matrix, X, has N examples, each with D features.
- *
- * Inputs:
- * - X: Input data matrix, of shape (N, D).
- * - W: Weights (parameters) matrix, of shape (D, K).
- * - b: Biases vector, of shape (1, K).
- *
- * Outputs:
- * - probs: Class probabilities, of shape (N, K).
- */
- # Compute forward pass
- ## affine & softmax:
- out = affine::forward(X, W, b)
- probs = softmax::forward(out)
-}
-
-eval = function(matrix[double] probs, matrix[double] y)
- return (double loss, double accuracy) {
- /*
- * Evaluates a softmax classifier.
- *
- * The probs matrix contains the class probability predictions
- * of K classes over N examples. The targets, y, have K classes,
- * and are one-hot encoded.
- *
- * Inputs:
- * - probs: Class probabilities, of shape (N, K).
- * - y: Target matrix, of shape (N, K).
- *
- * Outputs:
- * - loss: Scalar loss, of shape (1).
- * - accuracy: Scalar accuracy, of shape (1).
- */
- # Compute loss & accuracy
- loss = cross_entropy_loss::forward(probs, y)
- correct_pred = rowIndexMax(probs) == rowIndexMax(y)
- accuracy = mean(correct_pred)
-}
-
-generate_dummy_data = function()
- return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) {
- /*
- * Generate a dummy dataset similar to the MNIST dataset.
- *
- * Outputs:
- * - X: Input data matrix, of shape (N, D).
- * - y: Target matrix, of shape (N, K).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- */
- # Generate dummy input data
- N = 1024 # num examples
- C = 1 # num input channels
- Hin = 28 # input height
- Win = 28 # input width
- T = 10 # num targets
- X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
- classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))
- y = table(seq(1, N), classes) # one-hot encoding
-}
-
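For orientation, here is a minimal sketch of how the functions in this example compose end-to-end, assuming the script is available at the graduated path `nn/examples/mnist_softmax.dml`; the dummy data and the single epoch are illustrative:

    source("nn/examples/mnist_softmax.dml") as mnist_softmax

    # Generate dummy training and validation data
    [X, y, C, Hin, Win] = mnist_softmax::generate_dummy_data()
    [X_val, y_val, C, Hin, Win] = mnist_softmax::generate_dummy_data()

    # Train, predict, and evaluate
    [W, b] = mnist_softmax::train(X, y, X_val, y_val, 1)
    probs = mnist_softmax::predict(X, W, b)
    [loss, accuracy] = mnist_softmax::eval(probs, y)
    print("Loss: " + loss + ", Accuracy: " + accuracy)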
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/affine.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/affine.dml b/scripts/staging/SystemML-NN/nn/layers/affine.dml
deleted file mode 100644
index c9a740b..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/affine.dml
+++ /dev/null
@@ -1,92 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Affine (fully-connected) layer.
- */
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b)
- return (matrix[double] out) {
- /*
- * Computes the forward pass for an affine (fully-connected) layer
- * with M neurons. The input data has N examples, each with D
- * features.
- *
- * Inputs:
- * - X: Inputs, of shape (N, D).
- * - W: Weights, of shape (D, M).
- * - b: Biases, of shape (1, M).
- *
- * Outputs:
- * - out: Outputs, of shape (N, M).
- */
- out = X %*% W + b
-}
-
-backward = function(matrix[double] dout, matrix[double] X,
- matrix[double] W, matrix[double] b)
- return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
- /*
- * Computes the backward pass for a fully-connected (affine) layer
- * with M neurons.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of shape (N, M).
- * - X: Inputs, of shape (N, D).
- * - W: Weights, of shape (D, M).
- * - b: Biases, of shape (1, M).
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, D).
- * - dW: Gradient wrt `W`, of shape (D, M).
- * - db: Gradient wrt `b`, of shape (1, M).
- */
- dX = dout %*% t(W)
- dW = t(X) %*% dout
- db = colSums(dout)
-}
-
-init = function(int D, int M)
- return (matrix[double] W, matrix[double] b) {
- /*
- * Initialize the parameters of this layer.
- *
- * Note: This is just a convenience function, and parameters
- * may be initialized manually if needed.
- *
- * We use the heuristic by He et al., which limits the magnification
- * of inputs/gradients during forward/backward passes by scaling
- * unit-Gaussian weights by a factor of sqrt(2/n), under the
- * assumption of relu neurons.
- * - http://arxiv.org/abs/1502.01852
- *
- * Inputs:
- * - D: Dimensionality of the input features (number of features).
- * - M: Number of neurons in this layer.
- *
- * Outputs:
- * - W: Weights, of shape (D, M).
- * - b: Biases, of shape (1, M).
- */
- W = rand(rows=D, cols=M, pdf="normal") * sqrt(2.0/D)
- b = matrix(0, rows=1, cols=M)
-}
-
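A minimal sketch of the affine layer's forward/backward round trip, with illustrative shapes (N examples, D features, M neurons); `dout` stands in for a gradient arriving from upstream:

    source("nn/layers/affine.dml") as affine

    N = 4
    D = 8
    M = 3
    X = rand(rows=N, cols=D, pdf="normal")
    [W, b] = affine::init(D, M)
    out = affine::forward(X, W, b)             # shape (N, M)
    dout = rand(rows=N, cols=M, pdf="normal")  # stand-in for upstream gradient
    [dX, dW, db] = affine::backward(dout, X, W, b)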
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml
deleted file mode 100644
index 2ccffdb..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml
+++ /dev/null
@@ -1,210 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 1D Batch Normalization layer.
- */
-
-forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
- string mode, matrix[double] ema_mean, matrix[double] ema_var,
- double mu, double epsilon)
- return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
- matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
- /*
- * Computes the forward pass for a 1D batch normalization layer.
- * The input data has N examples, each with D features.
- *
- * A batch normalization layer uses the per-feature sample mean and
- * per-feature uncorrected sample variance during training to
- * normalize each feature of the input data. Additionally, it
- * introduces learnable parameters (gamma, beta) to control the
- * amount of normalization.
- *
- * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
- *
- * This implementation maintains exponential moving averages of the
- * mean and variance during training for use during testing.
- *
- * Reference:
- * - Batch Normalization: Accelerating Deep Network Training by
- * Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
- * - https://arxiv.org/abs/1502.03167
- *
- * Inputs:
- * - X: Inputs, of shape (N, D).
- * - gamma: Scale parameters, of shape (1, D).
- * - beta: Shift parameters, of shape (1, D).
- * - mode: 'train' or 'test' to indicate if the model is currently
- * being trained or tested. During training, the current batch
- * mean and variance will be used to normalize the inputs, while
- * during testing, the exponential average of the mean and
- * variance over all previous batches will be used.
- * - ema_mean: Exponential moving average of the mean, of
- * shape (1, D).
- * - ema_var: Exponential moving average of the variance, of
- * shape (1, D).
- * - mu: Momentum value for moving averages.
- * Typical values are in the range of [0.9, 0.999].
- * - epsilon: Smoothing term to avoid divide by zero errors.
- * Typical values are in the range of [1e-5, 1e-3].
- *
- * Outputs:
- * - out: Outputs, of shape (N, D).
- * - ema_mean_upd: Updated exponential moving average of the mean,
- * of shape (1, D).
- * - ema_var_upd: Updated exponential moving average of the variance,
- * of shape (1, D).
- * - cache_mean: Cache of the batch mean, of shape (1, D).
- * Note: This is used for performance during training.
- * - cache_var: Cache of the batch variance, of shape (1, D).
- * Note: This is used for performance during training.
- * - cache_norm: Cache of the normalized inputs, of shape (N, D).
- * Note: This is used for performance during training.
- */
- N = nrow(X)
-
- if (mode == 'train') {
- # Compute feature-wise mean and variance
- mean = colMeans(X) # shape (1, D)
- # var = (1/N) * colSums((X-mean)^2)
- var = colVars(X) * ((N-1)/N) # compute uncorrected variance, of shape (1, D)
- # Update moving averages
- ema_mean_upd = mu*ema_mean + (1-mu)*mean
- ema_var_upd = mu*ema_var + (1-mu)*var
- }
- else {
- # Use moving averages of mean and variance during testing
- mean = ema_mean
- var = ema_var
- ema_mean_upd = ema_mean
- ema_var_upd = ema_var
- }
-
- # Normalize, shift, and scale
- # norm = (X-mean)*(var+epsilon)^(-1/2)
- norm = (X-mean) / sqrt(var+epsilon) # shape (N, D)
- out = norm*gamma + beta # shape (N, D)
-
- # Save variables for backward pass
- cache_mean = mean
- cache_var = var
- cache_norm = norm
-}
-
-backward = function(matrix[double] dout, matrix[double] out,
- matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
- matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
- matrix[double] X, matrix[double] gamma, matrix[double] beta,
- string mode, matrix[double] ema_mean, matrix[double] ema_var,
- double mu, double epsilon)
- return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
- /*
- * Computes the backward pass for a 1D batch normalization layer.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of shape (N, D).
- * - out: Outputs from the forward pass, of shape (N, D).
- * - ema_mean_upd: Updated exponential moving average of the mean
- * from the forward pass, of shape (1, D).
- * - ema_var_upd: Updated exponential moving average of the variance
- * from the forward pass, of shape (1, D).
- * - cache_mean: Cache of the batch mean from the forward pass, of
- * shape (1, D). Note: This is used for performance during
- * training.
- * - cache_var: Cache of the batch variance from the forward pass,
- * of shape (1, D). Note: This is used for performance during
- * training.
- * - cache_norm: Cache of the normalized inputs from the forward
- * pass, of shape (N, D). Note: This is used for performance
- * during training.
- * - X: Inputs, of shape (N, D).
- * - gamma: Scale parameters, of shape (1, D).
- * - beta: Shift parameters, of shape (1, D).
- * - mode: 'train' or 'test' to indicate if the model is currently
- * being trained or tested. During training, the current batch
- * mean and variance will be used to normalize the inputs, while
- * during testing, the exponential average of the mean and
- * variance over all previous batches will be used.
- * - ema_mean: Exponential moving average of the mean, of
- * shape (1, D).
- * - ema_var: Exponential moving average of the variance, of
- * shape (1, D).
- * - mu: Momentum value for moving averages.
- * Typical values are in the range of [0.9, 0.999].
- * - epsilon: Smoothing term to avoid divide by zero errors.
- * Typical values are in the range of [1e-5, 1e-3].
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, D).
- * - dgamma: Gradient wrt `gamma`, of shape (1, D).
- * - dbeta: Gradient wrt `beta`, of shape (1, D).
- *
- */
- N = nrow(X)
- mean = cache_mean
- var = cache_var
- norm = cache_norm
- centered = X-mean
-
- if (mode == 'train') {
- # Compute gradients during training
- dgamma = colSums(dout*norm) # shape (1, D)
- dbeta = colSums(dout) # shape (1, D)
- dnorm = dout * gamma # shape (N, D)
- dvar = (-1/2) * colSums(centered * (var+epsilon)^(-3/2) * dnorm) # shape (1, D)
- dmean = colSums((-dnorm/sqrt(var+epsilon)) + ((-2/N)*centered*dvar)) # shape (1, D)
- dX = (dnorm/sqrt(var+epsilon)) + ((2/N)*centered*dvar) + ((1/N)*dmean) # shape (N, D)
- }
- else {
- # Compute gradients during testing
- dgamma = colSums(dout*norm) # shape (1, D)
- dbeta = colSums(dout) # shape (1, D)
- dnorm = dout * gamma # shape (N, D)
- dX = dnorm / sqrt(var+epsilon) # shape (N, D)
- }
-}
-
-init = function(int D)
- return (matrix[double] gamma, matrix[double] beta,
- matrix[double] ema_mean, matrix[double] ema_var) {
- /*
- * Initialize the parameters of this layer.
- *
- * Note: This is just a convenience function, and parameters
- * may be initialized manually if needed.
- *
- * Inputs:
- * - D: Dimensionality of the input features (number of features).
- *
- * Outputs:
- * - gamma: Scale parameters, of shape (1, D).
- * - beta: Shift parameters, of shape (1, D).
- * - ema_mean: Exponential moving average of the mean, of
- * shape (1, D).
- * - ema_var: Exponential moving average of the variance, of
- * shape (1, D).
- */
- gamma = matrix(1, rows=1, cols=D)
- beta = matrix(0, rows=1, cols=D)
- ema_mean = matrix(0, rows=1, cols=D)
- ema_var = matrix(1, rows=1, cols=D)
-}
-
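A minimal sketch of the train/test distinction in the API above; `mu = 0.9` and `epsilon = 1e-5` are the typical values suggested in the docstring, and the dimensions are illustrative:

    source("nn/layers/batch_norm1d.dml") as batch_norm1d

    N = 4
    D = 8
    X = rand(rows=N, cols=D, pdf="normal")
    [gamma, beta, ema_mean, ema_var] = batch_norm1d::init(D)
    # Training: batch statistics are used and the moving averages are updated
    [out, ema_mean, ema_var, c_mean, c_var, c_norm] = batch_norm1d::forward(X, gamma, beta,
        "train", ema_mean, ema_var, 0.9, 1e-5)
    # Testing: the moving averages are used in place of batch statistics
    [out_test, ema_mean, ema_var, c_mean, c_var, c_norm] = batch_norm1d::forward(X, gamma, beta,
        "test", ema_mean, ema_var, 0.9, 1e-5)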
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml
deleted file mode 100644
index 49c6746..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml
+++ /dev/null
@@ -1,238 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D (Spatial) Batch Normalization layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
- int C, int Hin, int Win, string mode,
- matrix[double] ema_mean, matrix[double] ema_var,
- double mu, double epsilon)
- return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
- matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
- /*
- * Computes the forward pass for a 2D (spatial) batch normalization
- * layer. The input data has N examples, each represented as a 3D
- * volume unrolled into a single vector.
- *
- * A spatial batch normalization layer uses the per-channel sample
- * mean and per-channel uncorrected sample variance during training
- * to normalize each channel of the input data. Additionally, it
- * introduces learnable parameters (gamma, beta) to control the
- * amount of normalization.
- *
- * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
- *
- * This implementation maintains exponential moving averages of the
- * mean and variance during training for use during testing.
- *
- * Reference:
- * - Batch Normalization: Accelerating Deep Network Training by
- * Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
- * - https://arxiv.org/abs/1502.03167
- *
- * Inputs:
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - gamma: Scale parameters, of shape (C, 1).
- * - beta: Shift parameters, of shape (C, 1).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - mode: 'train' or 'test' to indicate if the model is currently
- * being trained or tested. During training, the current batch
- * mean and variance will be used to normalize the inputs, while
- * during testing, the exponential average of the mean and
- * variance over all previous batches will be used.
- * - ema_mean: Exponential moving average of the mean, of
- * shape (C, 1).
- * - ema_var: Exponential moving average of the variance, of
- * shape (C, 1).
- * - mu: Momentum value for moving averages.
- * Typical values are in the range of [0.9, 0.999].
- * - epsilon: Smoothing term to avoid divide by zero errors.
- * Typical values are in the range of [1e-5, 1e-3].
- *
- * Outputs:
- * - out: Outputs, of shape (N, C*Hin*Win).
- * - ema_mean_upd: Updated exponential moving average of the mean,
- * of shape (C, 1).
- * - ema_var_upd: Updated exponential moving average of the variance,
- * of shape (C, 1).
- * - cache_mean: Cache of the batch mean, of shape (C, 1).
- * Note: This is used for performance during training.
- * - cache_var: Cache of the batch variance, of shape (C, 1).
- * Note: This is used for performance during training.
- * - cache_norm: Cache of the normalized inputs, of
- * shape (C, N*Hin*Win). Note: This is used for performance
- * during training.
- */
- N = nrow(X)
-
- if (mode == 'train') {
- # Compute channel-wise mean and variance
- # Since we don't have tensors, we will compute the means and variances in a piece-wise fashion.
- # - mean of total group is mean of subgroup means
- # - variance is the mean of the subgroup variances + the variance of the subgroup means
- subgrp_means = matrix(colMeans(X), rows=C, cols=Hin*Win)
- subgrp_vars = matrix(colVars(X) * ((N-1)/N), rows=C, cols=Hin*Win) # uncorrected variances
- mean = rowMeans(subgrp_means) # shape (C, 1)
- var = rowMeans(subgrp_vars) + rowVars(subgrp_means)*(((Hin*Win)-1)/(Hin*Win)) # shape (C, 1)
- # Update moving averages
- ema_mean_upd = mu*ema_mean + (1-mu)*mean
- ema_var_upd = mu*ema_var + (1-mu)*var
- }
- else {
- # Use moving averages of mean and variance during testing
- mean = ema_mean
- var = ema_var
- ema_mean_upd = ema_mean
- ema_var_upd = ema_var
- }
-
- # Normalize, shift, and scale
- # norm = (X-mean)*(var+epsilon)^(-1/2)
- # = (X-mean) / sqrt(var+epsilon)
- centered = bias_add(X, -mean) # shape (N, C*Hin*Win)
- norm = bias_multiply(centered, 1/sqrt(var+epsilon)) # shape (N, C*Hin*Win)
- # out = norm*gamma + beta
- scaled = bias_multiply(norm, gamma) # shape (N, C*Hin*Win)
- out = bias_add(scaled, beta) # shape (N, C*Hin*Win)
-
- # Save variables for backward pass
- cache_mean = mean
- cache_var = var
- cache_norm = norm
-}
-
-backward = function(matrix[double] dout, matrix[double] out,
- matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
- matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
- matrix[double] X, matrix[double] gamma, matrix[double] beta,
- int C, int Hin, int Win, string mode,
- matrix[double] ema_mean, matrix[double] ema_var,
- double mu, double epsilon)
- return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
- /*
- * Computes the backward pass for a 2D (spatial) batch normalization
- * layer.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
- * - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
- * - ema_mean_upd: Updated exponential moving average of the mean
- * from the forward pass, of shape (C, 1).
- * - ema_var_upd: Updated exponential moving average of the variance
- * from the forward pass, of shape (C, 1).
- * - cache_mean: Cache of the batch mean from the forward pass, of
- * shape (C, 1). Note: This is used for performance during
- * training.
- * - cache_var: Cache of the batch variance from the forward pass,
- * of shape (C, 1). Note: This is used for performance during
- * training.
- * - cache_norm: Cache of the normalized inputs from the forward
- * pass, of shape (C, N*Hin*Win). Note: This is used for
- * performance during training.
- * - X: Input data matrix to the forward pass, of
- * shape (N, C*Hin*Win).
- * - gamma: Scale parameters, of shape (C, 1).
- * - beta: Shift parameters, of shape (C, 1).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - mode: 'train' or 'test' to indicate if the model is currently
- * being trained or tested. During training, the current batch
- * mean and variance will be used to normalize the inputs, while
- * during testing, the exponential average of the mean and
- * variance over all previous batches will be used.
- * - ema_mean: Exponential moving average of the mean, of
- * shape (C, 1).
- * - ema_var: Exponential moving average of the variance, of
- * shape (C, 1).
- * - mu: Momentum value for moving averages.
- * Typical values are in the range of [0.9, 0.999].
- * - epsilon: Smoothing term to avoid divide by zero errors.
- * Typical values are in the range of [1e-5, 1e-3].
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
- * - dgamma: Gradient wrt `gamma`, of shape (C, 1).
- * - dbeta: Gradient wrt `beta`, of shape (C, 1).
- *
- */
- N = nrow(X)
- mean = cache_mean
- var = cache_var
- norm = cache_norm
- centered = bias_add(X, -mean) # shape (N, C*Hin*Win)
-
- if (mode == 'train') {
- # Compute gradients during training
- dgamma = util::channel_sums(dout*norm, C, Hin, Win) # shape (C, 1)
- dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1)
- dnorm = bias_multiply(dout, gamma) # shape (N, C*Hin*Win)
- dvar = util::channel_sums((-1/2) * bias_multiply(centered, (var+epsilon)^(-3/2)) * dnorm,
- C, Hin, Win) # shape (C, 1)
- dmean_norm_branch = util::channel_sums(bias_multiply(dnorm, -1/sqrt(var+epsilon)), C, Hin, Win)
- dmean_var_branch = util::channel_sums((-2/(N*Hin*Win)) * centered, C, Hin, Win)
- dmean_var_branch = dmean_var_branch * dvar # we can't use a function within an expression yet
- dmean = dmean_norm_branch + dmean_var_branch # shape (C, 1)
- dX_norm_branch = bias_multiply(dnorm, 1/sqrt(var+epsilon))
- dX_mean_branch = (1/(N*Hin*Win)) * bias_add(matrix(0, rows=1, cols=C*Hin*Win), dmean)
- dX_var_branch = (2/(N*Hin*Win)) * bias_multiply(centered, dvar)
- dX = dX_norm_branch + dX_mean_branch + dX_var_branch # shape (N, C*Hin*Win)
- }
- else {
- # Compute gradients during testing
- dgamma = util::channel_sums(dout*norm, C, Hin, Win) # shape (C, 1)
- dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1)
- dnorm = bias_multiply(dout, gamma) # shape (N, C*Hin*Win)
- dX = bias_multiply(dnorm, 1/sqrt(var+epsilon)) # shape (N, C*Hin*Win)
- }
-}
-
-init = function(int C)
- return (matrix[double] gamma, matrix[double] beta,
- matrix[double] ema_mean, matrix[double] ema_var) {
- /*
- * Initialize the parameters of this layer.
- *
- * Note: This is just a convenience function, and parameters
- * may be initialized manually if needed.
- *
- * Inputs:
- * - C: Number of input channels (dimensionality of input depth).
- *
- * Outputs:
- * - gamma: Scale parameters, of shape (C, 1).
- * - beta: Shift parameters, of shape (C, 1).
- * - ema_mean: Exponential moving average of the mean, of
- * shape (C, 1).
- * - ema_var: Exponential moving average of the variance, of
- * shape (C, 1).
- */
- gamma = matrix(1, rows=C, cols=1)
- beta = matrix(0, rows=C, cols=1)
- ema_mean = matrix(0, rows=C, cols=1)
- ema_var = matrix(1, rows=C, cols=1)
-}
-
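As with the 1D variant, a minimal sketch with illustrative dimensions; each row of X is a C x Hin x Win volume unrolled into a single vector, and statistics are per channel:

    source("nn/layers/batch_norm2d.dml") as batch_norm2d

    N = 2
    C = 3
    Hin = 4
    Win = 4
    X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
    [gamma, beta, ema_mean, ema_var] = batch_norm2d::init(C)
    [out, ema_mean, ema_var, c_mean, c_var, c_norm] = batch_norm2d::forward(X, gamma, beta,
        C, Hin, Win, "train", ema_mean, ema_var, 0.9, 1e-5)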
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv2d.dml b/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
deleted file mode 100644
index 9d03568..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
+++ /dev/null
@@ -1,194 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Convolutional layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
- int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] out, int Hout, int Wout) {
- /*
- * Computes the forward pass for a 2D spatial convolutional layer with
- * F filters. The input data has N examples, each represented as a 3D
- * volume unrolled into a single vector.
- *
- * This implementation uses `im2col` internally for each image to
- * extract local image regions (patches) into columns, and then
- * performs a matrix multiplication with the filters to compute the
- * output maps.
- *
- * Inputs:
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - W: Weights, of shape (F, C*Hf*Wf).
- * - b: Biases, of shape (F, 1).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * For same output height as input, set `padh = (Hf - 1) / 2`,
- * assuming `strideh = 1`.
- * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
- * preserves the spatial dimensions of the input.
- * - padw: Padding for left and right sides.
- * For same output width as input, set `padw = (Wf - 1) / 2`,
- * assuming `stridew = 1`.
- * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
- * preserves the spatial dimensions of the input.
- *
- * Outputs:
- * - out: Outputs, of shape (N, F*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- */
- N = nrow(X)
- F = nrow(W)
- Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
- Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
- # Create output volume
- out = matrix(0, rows=N, cols=F*Hout*Wout)
-
- # Convolution - im2col implementation
- parfor (n in 1:N) { # all examples
- Xn = matrix(X[n,], rows=C, cols=Hin*Win) # reshape
-
- # Pad image
- Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0) # shape (C, (Hin+2*padh)*(Win+2*padw))
-
- # Extract local image patches into columns with im2col, of shape (C*Hf*Wf, Hout*Wout)
- Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
-
- # Convolve patches with filters
- outn = W %*% Xn_padded_cols + b # shape (F, Hout*Wout)
- out[n,] = matrix(outn, rows=1, cols=F*Hout*Wout) # reshape
- }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout,
- matrix[double] X, matrix[double] W, matrix[double] b,
- int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
- /*
- * Computes the backward pass for a 2D spatial convolutional layer
- * with F filters.
- *
- * This implementation uses `im2col` and `col2im` internally.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of
- * shape (N, F*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - W: Weights, of shape (F, C*Hf*Wf).
- * - b: Biases, of shape (F, 1).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * - padw: Padding for left and right sides.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
- * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
- * - db: Gradient wrt `b`, of shape (F, 1).
- */
- N = nrow(X)
- F = nrow(W)
-
- # Create gradient volumes
- # Note: Create convenience gradient volumes for dW and db that will
- # allow for one gradient to be stored per example, allowing for
- # parallel computation at the expense of memory. We will reduce at
- # the end.
- dX = matrix(0, rows=N, cols=C*Hin*Win)
- dWN = matrix(0, rows=N, cols=F*C*Hf*Wf) # dW = matrix(0, rows=F, cols=C*Hf*Wf)
- dbN = matrix(0, rows=N, cols=F) # db = matrix(0, rows=F, cols=1)
-
- # Partial derivatives for convolution - im2col implementation
- parfor (n in 1:N) { # all examples
- doutn = matrix(dout[n,], rows=F, cols=Hout*Wout)
-
- # Compute dW
- Xn = matrix(X[n,], rows=C, cols=Hin*Win) # reshape
- Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0) # shape (C, (Hin+2*padh)*(Win+2*padw))
- Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
- # dW = dW + doutn %*% t(Xn_padded_cols)
- dWN[n,] = matrix(doutn %*% t(Xn_padded_cols), rows=1, cols=F*C*Hf*Wf)
-
- # Compute db
- # db = db + rowSums(doutn)
- dbN[n,] = matrix(rowSums(doutn), rows=1, cols=F)
-
- # Compute dX
- dXn_padded_cols = t(W) %*% doutn # shape (C*Hf*Wf, Hout*Wout)
- dXn_padded = util::col2im(dXn_padded_cols, C, Hin+2*padh, Win+2*padw, Hf, Wf,
- strideh, stridew, "add")
- dXn = util::unpad_image(dXn_padded, Hin, Win, padh, padw)
- dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win) # reshape
- }
-
- # Reduce convenience gradient volumes with one gradient per example
- # into single gradients for W and b.
- dW = matrix(colSums(dWN), rows=F, cols=C*Hf*Wf)
- db = matrix(colSums(dbN), rows=F, cols=1)
-}
-
-init = function(int F, int C, int Hf, int Wf)
- return (matrix[double] W, matrix[double] b) {
- /*
- * Initialize the parameters of this layer.
- *
- * Note: This is just a convenience function, and parameters
- * may be initialized manually if needed.
- *
- * We use the heuristic by He et al., which limits the magnification
- * of inputs/gradients during forward/backward passes by scaling
- * unit-Gaussian weights by a factor of sqrt(2/n), under the
- * assumption of relu neurons.
- * - http://arxiv.org/abs/1502.01852
- *
- * Inputs:
- * - F: Number of filters.
- * - C: Number of input channels (dimensionality of depth).
- * - Hf: Filter height.
- * - Wf: Filter width.
- *
- * Outputs:
- * - W: Weights, of shape (F, C*Hf*Wf).
- * - b: Biases, of shape (F, 1).
- */
- W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
- b = matrix(0, rows=F, cols=1)
-}
-
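A minimal sketch of a "same" convolution with the im2col implementation above (3x3 filters, stride 1, padding 1, so Hout = Hin and Wout = Win); all dimensions are illustrative:

    source("nn/layers/conv2d.dml") as conv2d

    N = 2
    C = 3
    Hin = 8
    Win = 8
    F = 4
    Hf = 3
    Wf = 3
    X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
    [W, b] = conv2d::init(F, C, Hf, Wf)
    # 3x3 filters, stride 1, padding 1 -> Hout = Hin, Wout = Win
    [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)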
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
deleted file mode 100644
index bda7a9c..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
+++ /dev/null
@@ -1,160 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Convolutional layer.
- *
- * This implementation uses a built-in operator for higher performance.
- */
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
- int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] out, int Hout, int Wout) {
- /*
- * Computes the forward pass for a 2D spatial convolutional layer with
- * F filters. The input data has N examples, each represented as a 3D
- * volume unrolled into a single vector.
- *
- * This implementation uses a built-in operator for higher
- * performance.
- *
- * Inputs:
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - W: Weights, of shape (F, C*Hf*Wf).
- * - b: Biases, of shape (F, 1).
- * - C: Number of input channels (dimensionality of depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * For same output height as input, set `padh = (Hf - 1) / 2`,
- * assuming `strideh = 1`.
- * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
- * preserves the spatial dimensions of the input.
- * - padw: Padding for left and right sides.
- * For same output width as input, set `padw = (Wf - 1) / 2`,
- * assuming `stridew = 1`.
- * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
- * preserves the spatial dimensions of the input.
- *
- * Outputs:
- * - out: Outputs, of shape (N, F*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- */
- N = nrow(X)
- F = nrow(W)
- Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
- Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
- # Convolution - built-in implementation
- out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
- stride=[strideh,stridew], padding=[padh,padw])
-
- # Add bias term to each output filter
- out = bias_add(out, b)
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout,
- matrix[double] X, matrix[double] W, matrix[double] b,
- int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
- /*
- * Computes the backward pass for a 2D spatial convolutional layer
- * with F filters.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of
- * shape (N, F*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - W: Weights, of shape (F, C*Hf*Wf).
- * - b: Biases, of shape (F, 1).
- * - C: Number of input channels (dimensionality of depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * For same output height as input, set `padh = (Hf - 1) / 2`,
- * assuming `strideh = 1`.
- * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
- * preserves the spatial dimensions of the input.
- * - padw: Padding for left and right sides.
- * For same output width as input, set `padw = (Wf - 1) / 2`,
- * assuming `stridew = 1`.
- * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
- * preserves the spatial dimensions of the input.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
- * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
- * - db: Gradient wrt `b`, of shape (F, 1).
- */
- N = nrow(X)
- F = nrow(W)
-
- # Partial derivatives for convolution - built-in implementation
- dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw],
- input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
- dX = conv2d_backward_data(W, dout, stride=[strideh, stridew], padding=[padh,padw],
- input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
-
- # Partial derivatives for bias vector
- db = rowSums(matrix(colSums(dout), rows=F, cols=Hout*Wout))
-}
-
-init = function(int F, int C, int Hf, int Wf)
- return (matrix[double] W, matrix[double] b) {
- /*
- * Initialize the parameters of this layer.
- *
- * Note: This is just a convenience function, and parameters
- * may be initialized manually if needed.
- *
- * We use the heuristic by He et al., which limits the magnification
- * of inputs/gradients during forward/backward passes by scaling
- * unit-Gaussian weights by a factor of sqrt(2/n), under the
- * assumption of relu neurons.
- * - http://arxiv.org/abs/1502.01852
- *
- * Inputs:
- * - F: Number of filters.
- * - C: Number of input channels (dimensionality of depth).
- * - Hf: Filter height.
- * - Wf: Filter width.
- *
- * Outputs:
- * - W: Weights, of shape (F, C*Hf*Wf).
- * - b: Biases, of shape (F, 1).
- */
- W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
- b = matrix(0, rows=F, cols=1)
-}
-
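This layer is interface-compatible with `nn/layers/conv2d.dml` (identical forward/backward/init signatures in the diffs above), so switching implementations only requires changing the `source` statement. A minimal sketch including the backward pass, where `dout` stands in for an upstream gradient:

    source("nn/layers/conv2d_builtin.dml") as conv2d

    N = 2
    C = 3
    Hin = 8
    Win = 8
    F = 4
    Hf = 3
    Wf = 3
    X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
    [W, b] = conv2d::init(F, C, Hf, Wf)
    [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)
    dout = rand(rows=N, cols=F*Hout*Wout, pdf="normal")  # stand-in for upstream gradient
    [dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)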
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml b/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
deleted file mode 100644
index 63db502..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
+++ /dev/null
@@ -1,78 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Cross-Entropy loss function.
- */
-
-forward = function(matrix[double] pred, matrix[double] y)
- return (double loss) {
- /*
- * Computes the forward pass for a cross-entropy loss function. The
- * inputs consist of N examples, each with K dimensions corresponding
- * to normalized probabilities of K classes.
- *
- * ```
- * L_i = -y_i^T * log(pred_i)
- * L = (1/N) sum(L_i) for i=1 to N
- * ```
- *
- * In these equations, `L` is the total loss, `L_i` is the loss for
- * example `i`, `y_i` is the K-dimensional vector of target class
- * probabilities, `pred_i` is the K-dimensional vector of predicted
- * class probabilities, and `N` is the number of examples.
- *
- * This can be interpreted as the negative log-likelihood assuming
- * a Bernoulli distribution generalized to K dimensions, or a
- * Multinomial with one observation.
- *
- * Inputs:
- * - pred: Predictions, of shape (N, K).
- * - y: Targets, of shape (N, K).
- *
- * Outputs:
- * - loss: Average loss.
- */
- N = nrow(y)
- eps = 1e-10 # numerical stability to avoid log(0)
- losses = rowSums(-y * log(pred+eps))
- loss = sum(losses) / N
-}
-
-backward = function(matrix[double] pred, matrix[double] y)
- return (matrix[double] dpred) {
- /*
- * Computes the backward pass of a cross-entropy loss function. The
- * inputs consist of N examples, each with K dimensions corresponding
- * to normalized probabilities of K classes.
- *
- * Inputs:
- * - pred: Predictions, of shape (N, K).
- * - y: Targets, of shape (N, K).
- *
- * Outputs:
- * - dpred: Gradient wrt `pred`, of shape (N, K).
- */
- N = nrow(y)
- eps = 1e-10 # numerical stability to avoid divide-by-zero
- dpred = (1/N) * -y * (1/(pred+eps))
-}
-
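A small worked example of the loss above: each row contributes the negative log-probability assigned to its true class, so with the hand-picked probabilities below the loss is (-log(0.7) - log(0.1)) / 2:

    source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss

    # Two examples over three classes; rows of `pred` are normalized probabilities
    pred = matrix("0.7 0.2 0.1  0.1 0.8 0.1", rows=2, cols=3)
    y = matrix("1 0 0  0 0 1", rows=2, cols=3)   # one-hot targets
    loss = cross_entropy_loss::forward(pred, y)  # (-log(0.7) - log(0.1)) / 2, approx. 1.33
    dpred = cross_entropy_loss::backward(pred, y)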
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/dropout.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/dropout.dml b/scripts/staging/SystemML-NN/nn/layers/dropout.dml
deleted file mode 100644
index a36878b..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/dropout.dml
+++ /dev/null
@@ -1,76 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Dropout layer.
- */
-
-forward = function(matrix[double] X, double p, int seed)
- return (matrix[double] out, matrix[double] mask) {
- /*
- * Computes the forward pass for an inverted dropout layer.
- *
- * Drops each input element with probability 1-p (i.e., keeps it with
- * probability p), and divides the kept values by p to maintain the
- * expected values of those inputs (which are the outputs of neurons)
- * at test time.
- *
- * Inputs:
- * - X: Inputs, of shape (any, any).
- * - p: Probability of keeping a neuron output.
- * - seed: [Optional: -1] Random number generator seed to allow for
- * deterministic evaluation. Set to -1 for a random seed.
- *
- * Outputs:
- * - out: Outputs, of same shape as `X`.
- * - mask: Dropout mask used to compute the output.
- */
- # Normally, we might use something like
- # `mask = rand(rows=nrow(X), cols=ncol(X), min=0, max=1, seed=seed) <= p`
- # to create a dropout mask. Fortunately, SystemML has a `sparsity` parameter on
- # the `rand` function that allows us to create a mask directly.
- if (seed == -1) {
- mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p)
- } else {
- mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p, seed=seed)
- }
- out = X * mask / p
-}
-
-backward = function(matrix[double] dout, matrix[double] X, double p, matrix[double] mask)
- return (matrix[double] dX) {
- /*
- * Computes the backward pass for an inverted dropout layer.
- *
- * Applies the mask to the upstream gradient, and divides by p to
- * maintain the expected values at test time.
- *
- * Inputs:
- * - dout: Gradient wrt `out`, of same shape as `X`.
- * - X: Inputs, of shape (any, any).
- * - p: Probability of keeping a neuron output.
- * - mask: Dropout mask used to compute the output.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of same shape as `X`.
- */
- dX = mask / p * dout
-}
-
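A minimal sketch; note that `p` is the keep probability, and a fixed seed can be passed instead of -1 for deterministic masks (shapes are illustrative):

    source("nn/layers/dropout.dml") as dropout

    X = rand(rows=4, cols=6, pdf="normal")
    p = 0.5  # probability of keeping each input element
    [out, mask] = dropout::forward(X, p, -1)   # -1 -> random seed
    dout = rand(rows=4, cols=6, pdf="normal")  # stand-in for upstream gradient
    dX = dropout::backward(dout, X, p, mask)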
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
deleted file mode 100644
index b74566d..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
+++ /dev/null
@@ -1,72 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * L1 loss function.
- */
-
-forward = function(matrix[double] pred, matrix[double] y)
- return (double loss) {
- /*
- * Computes the forward pass for an L1 loss function. The inputs
- * consist of N examples, each with M dimensions to predict.
- *
- * ```
- * L_i = sum_j(abs((pred_i)_j - (y_i)_j))
- * L = (1/N) sum(L_i) for i=1 to N
- * ```
- *
- * In these equations, `L` is the total loss, `L_i` is the loss for
- * example `i`, `y_i` is the M-dimensional vector of targets, `pred_i`
- * is the M-dimensional vector of predictions, and `N` is the number
- * of examples.
- *
- * This can be interpreted as the negative log-likelihood assuming
- * a Laplace distribution.
- *
- * Inputs:
- * - pred: Predictions, of shape (N, M).
- * - y: Targets, of shape (N, M).
- *
- * Outputs:
- * - loss: Average loss.
- */
- N = nrow(y)
- losses = rowSums(abs(pred-y))
- loss = sum(losses) / N
-}
-
-backward = function(matrix[double] pred, matrix[double] y)
- return (matrix[double] dpred) {
- /*
- * Computes the backward pass for an L1 loss function. The inputs
- * consist of N examples, each with M dimensions to predict.
- *
- * Inputs:
- * - pred: Predictions, of shape (N, M).
- * - y: Targets, of shape (N, M).
- *
- * Outputs:
- * - dpred: Gradient wrt `pred`, of shape (N, M).
- */
- N = nrow(y)
- dpred = sign(pred-y) / N
-}
-
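A small worked example: with the values below, the per-example losses are 0.5 and 3, so the average loss is 1.75:

    source("nn/layers/l1_loss.dml") as l1_loss

    pred = matrix("1 2 3 4", rows=2, cols=2)
    y = matrix("1.5 2 2 6", rows=2, cols=2)
    loss = l1_loss::forward(pred, y)    # ((0.5+0) + (1+2)) / 2 = 1.75
    dpred = l1_loss::backward(pred, y)  # sign(pred-y) / N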
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
deleted file mode 100644
index 2b81c0b..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
+++ /dev/null
@@ -1,56 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * L1 regularization.
- */
-
-forward = function(matrix[double] X, double lambda)
- return (double reg_loss) {
- /*
- * Computes the forward pass for an L1 regularization function.
- *
- * Inputs:
- * - X: Inputs, of shape (any, any).
- * - lambda: Regularization strength.
- * A typical value is 0.01.
- *
- * Outputs:
- * - reg_loss: Total regularization loss.
- */
- reg_loss = lambda * sum(abs(X))
-}
-
-backward = function(matrix[double] X, double lambda)
- return (matrix[double] dX) {
- /*
- * Computes the backward pass for an L1 regularization function.
- *
- * Inputs:
- * - X: Inputs, of shape (any, any).
- * - lambda: Regularization strength.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of same shape as `X`.
- */
- dX = lambda * sign(X)
-}
-
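A minimal sketch of how a regularization function is typically used: its forward output is added to the data loss, and its backward output is added to the data-loss gradient of the same parameter matrix (the usage pattern below is illustrative, not prescribed by this file):

    source("nn/layers/l1_reg.dml") as l1_reg

    W = rand(rows=8, cols=3, pdf="normal")  # some parameter matrix
    lambda = 0.01
    reg_loss = l1_reg::forward(W, lambda)  # lambda * sum(abs(W)); add to the data loss
    dW_reg = l1_reg::backward(W, lambda)   # lambda * sign(W); add to the gradient wrt W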
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
deleted file mode 100644
index 0482f25..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
+++ /dev/null
@@ -1,72 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * L2 loss function.
- */
-
-forward = function(matrix[double] pred, matrix[double] y)
- return (double loss) {
- /*
- * Computes the forward pass for an L2 loss function. The inputs
- * consist of N examples, each with M dimensions to predict.
- *
- * ```
- * L_i = (1/2) norm(pred_i - y_i)^2
- * L = (1/N) sum(L_i) for i=1 to N
- * ```
- *
- * In these equations, `L` is the total loss, `L_i` is the loss for
- * example `i`, `y_i` is the M-dimensional vector of targets, `pred_i`
- * is the M-dimensional vector of predictions, and `N` is the number
- * of examples.
- *
- * This can be interpreted as the negative log-likelihood assuming
- * a Gaussian distribution.
- *
- * Inputs:
- * - pred: Predictions, of shape (N, M).
- * - y: Targets, of shape (N, M).
- *
- * Outputs:
- * - loss: Average loss.
- */
- N = nrow(y)
- losses = 0.5 * rowSums((pred-y)^2)
- loss = sum(losses) / N
-}
-
-backward = function(matrix[double] pred, matrix[double] y)
- return (matrix[double] dpred) {
- /*
- * Computes the backward pass for an L2 loss function. The inputs
- * consist of N examples, each with M dimensions to predict.
- *
- * Inputs:
- * - pred: Predictions, of shape (N, M).
- * - y: Targets, of shape (N, M).
- *
- * Outputs:
- * - dpred: Gradient wrt `pred`, of shape (N, M).
- */
- N = nrow(y)
- dpred = (pred-y) / N
-}
-
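A small worked example: with the values below, the per-example losses are 0.5*(0.25+0) = 0.125 and 0.5*(1+4) = 2.5, so the average loss is 1.3125:

    source("nn/layers/l2_loss.dml") as l2_loss

    pred = matrix("1 2 3 4", rows=2, cols=2)
    y = matrix("1.5 2 2 6", rows=2, cols=2)
    loss = l2_loss::forward(pred, y)    # (0.125 + 2.5) / 2 = 1.3125
    dpred = l2_loss::backward(pred, y)  # (pred-y) / N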
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
deleted file mode 100644
index 7255efe..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
+++ /dev/null
@@ -1,56 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * L2 regularization.
- */
-
-forward = function(matrix[double] X, double lambda)
- return (double reg_loss) {
- /*
- * Computes the forward pass for an L2 regularization function.
- *
- * Inputs:
- * - X: Inputs, of shape (any, any).
- * - lambda: Regularization strength.
- * A typical value is 0.01.
- *
- * Outputs:
- * - reg_loss: Total regularization loss.
- */
- reg_loss = 0.5 * lambda * sum(X^2)
-}
-
-backward = function(matrix[double] X, double lambda)
- return (matrix[double] dX) {
- /*
- * Computes the backward pass for an L2 regularization function.
- *
- * Inputs:
- * - X: Inputs, of shape (any, any).
- * - lambda: Regularization strength.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of same shape as `X`.
- */
- dX = lambda * X
-}
-
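As with the L1 version above, a minimal usage sketch (graduated path assumed; sizes hypothetical). The returned gradient is simply added to the data-loss gradient of the regularized parameters:

```
source("nn/layers/l2_reg.dml") as l2_reg

W = rand(rows=10, cols=4, min=-1, max=1)  # toy parameter matrix
lambda = 0.01

reg_loss = l2_reg::forward(W, lambda)  # 0.5 * lambda * sum(W^2)
dW_reg = l2_reg::backward(W, lambda)   # lambda * W
```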
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml b/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
deleted file mode 100644
index 15914f7..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml
+++ /dev/null
@@ -1,76 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Log loss function.
- */
-
-forward = function(matrix[double] pred, matrix[double] y)
- return (double loss) {
- /*
- * Computes the forward pass for a log loss function.
- *
- * ```
- * L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i)
- * L = (1/N) sum(L_i) for i=1 to N
- * ```
- *
- * In these equations, `L` is the total loss, `L_i` is the loss for
- * example `i`, `y_i` is the binary target, `pred_i` is the predicted
- * probability of the true class (i.e. `y=1`), and `N` is the number
- * of examples.
- *
- * This can be interpreted as the negative log-likelihood assuming
- * a Bernoulli distribution.
- *
- * Inputs:
- * - pred: Predictions, of shape (N, 1).
- * Predictions should be probabilities of the true
- * class (i.e. probability of `y=1`).
- * - y: Targets, of shape (N, 1).
- * Targets should be binary in the set {0, 1}.
- *
- * Outputs:
- * - loss: Average loss.
- */
- N = nrow(y)
- losses = -y*log(pred) - (1-y)*log(1-pred)
- loss = sum(losses) / N
-}
-
-backward = function(matrix[double] pred, matrix[double] y)
- return (matrix[double] dpred) {
- /*
- * Computes the backward pass for a log loss function.
- *
- * Inputs:
- * - pred: Predictions, of shape (N, 1).
- * Predictions should be probabilities of the true
- * class (i.e. probability of `y=1`).
- * - y: Targets, of shape (N, 1).
- * Targets should be binary in the set {0, 1}.
- *
- * Outputs:
- * - dpred: Gradient wrt `pred`, of shape (N, 1).
- */
- N = nrow(y)
- dpred = (1/N) * (pred-y) / (pred*(1-pred))
-}
-
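A minimal sketch of the forward/backward calls (graduated path assumed). Since the loss takes logs, the toy predictions below are kept strictly inside (0, 1):

```
source("nn/layers/log_loss.dml") as log_loss

N = 6  # hypothetical number of examples
pred = rand(rows=N, cols=1, min=0.05, max=0.95)  # predicted probabilities of y=1
y = round(rand(rows=N, cols=1))                  # binary targets in {0, 1}

loss = log_loss::forward(pred, y)    # average loss
dpred = log_loss::backward(pred, y)  # gradient wrt pred, of shape (N, 1)
```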
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/lstm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/lstm.dml b/scripts/staging/SystemML-NN/nn/layers/lstm.dml
deleted file mode 100644
index a75add4..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/lstm.dml
+++ /dev/null
@@ -1,260 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * LSTM layer.
- */
-source("nn/layers/sigmoid.dml") as sigmoid
-source("nn/layers/tanh.dml") as tanh
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
- boolean return_sequences, matrix[double] out0, matrix[double] c0)
- return (matrix[double] out, matrix[double] c,
- matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog) {
- /*
- * Computes the forward pass for an LSTM layer with M neurons.
- * The input data has N examples, each a sequence of T timesteps
- * with D features per timestep.
- *
- * In an LSTM, an internal cell state is maintained, additive
- * interactions operate over the cell state at each timestep, and
- * some amount of this cell state is exposed as output at each
- * timestep. Additionally, the output of the previous timestep is fed
- * back in as an additional input at the current timestep.
- *
- * Reference:
- * - Long Short-Term Memory, Hochreiter & Schmidhuber, 1997
- * - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
- *
- * Inputs:
- * - X: Inputs, of shape (N, T*D).
- * - W: Weights, of shape (D+M, 4M).
- * - b: Biases, of shape (1, 4M).
- * - T: Length of example sequences (number of timesteps).
- * - D: Dimensionality of the input features (number of features).
- * - return_sequences: Whether to return `out` at all timesteps,
- * or just for the final timestep.
- * - out0: Outputs from previous timestep, of shape (N, M).
- * Note: This is *optional* and could just be a matrix of zeros.
- * - c0: Initial cell state, of shape (N, M).
- * Note: This is *optional* and could just be a matrix of zeros.
- *
- * Outputs:
- * - out: If `return_sequences` is True, outputs for all timesteps,
- * of shape (N, T*M). Else, outputs for the final timestep, of
- * shape (N, M).
- * - c: Cell state for final timestep, of shape (N, M).
- * - cache_out: Cache of outputs, of shape (T, N*M).
- * Note: This is used for performance during training.
- * - cache_c: Cache of cell state, of shape (T, N*M).
- * Note: This is used for performance during training.
- * - cache_ifog: Cache of intermediate values, of shape (T, N*4*M).
- * Note: This is used for performance during training.
- */
- N = nrow(X)
- M = as.integer(ncol(W)/4)
- out_prev = out0
- c_prev = c0
- c = c_prev
- if (return_sequences) {
- out = matrix(0, rows=N, cols=T*M)
- }
- else {
- out = matrix(0, rows=N, cols=M)
- }
- # caches to be used during the backward pass for performance
- cache_out = matrix(0, rows=T, cols=N*M)
- cache_c = matrix(0, rows=T, cols=N*M)
- cache_ifog = matrix(0, rows=T, cols=N*4*M)
-
- for (t in 1:T) { # each timestep
- X_t = X[,(t-1)*D+1:t*D] # shape (N, D)
- input = cbind(X_t, out_prev) # shape (N, D+M)
- ifog = input %*% W + b # input, forget, output, and g gates; shape (N, 4M)
- tmp = sigmoid::forward(ifog[,1:3*M]) # i,f,o gates squashed with sigmoid
- ifog[,1:3*M] = tmp
- tmp = tanh::forward(ifog[,3*M+1:4*M]) # g gate squashed with tanh
- ifog[,3*M+1:4*M] = tmp
- # c_t = f*prev_c + i*g
- c = ifog[,M+1:2*M]*c_prev + ifog[,1:M]*ifog[,3*M+1:4*M] # shape (N, M)
- # out_t = o*tanh(c)
- tmp = tanh::forward(c)
- out_t = ifog[,2*M+1:3*M] * tmp # shape (N, M)
-
- # store
- if (return_sequences) {
- out[,(t-1)*M+1:t*M] = out_t
- }
- else {
- out = out_t
- }
- out_prev = out_t
- c_prev = c
- cache_out[t,] = matrix(out_t, rows=1, cols=N*M) # reshape
- cache_c[t,] = matrix(c, rows=1, cols=N*M) # reshape
- cache_ifog[t,] = matrix(ifog, rows=1, cols=N*4*M) # reshape
- }
-}
-
-backward = function(matrix[double] dout, matrix[double] dc,
- matrix[double] X, matrix[double] W, matrix[double] b, int T, int D,
- boolean given_sequences, matrix[double] out0, matrix[double] c0,
- matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog)
- return (matrix[double] dX, matrix[double] dW, matrix[double] db,
- matrix[double] dout0, matrix[double] dc0) {
- /*
- * Computes the backward pass for an LSTM layer with M neurons.
- *
- * Inputs:
- * - dout: Gradient wrt `out`. If `given_sequences` is `True`,
- * contains gradients on outputs for all timesteps, of
- * shape (N, T*M). Else, contains the gradient on the output
- * for the final timestep, of shape (N, M).
- * - dc: Gradient wrt `c` (from later in time), of shape (N, M).
- * This would come from later in time if the cell state was used
- * downstream as the initial cell state for another LSTM layer.
- * Typically, this would be used when a sequence was cut at
- * timestep `T` and then continued in the next batch. If `c`
- * was not used downstream, then `dc` would be an empty matrix.
- * - X: Inputs, of shape (N, T*D).
- * - W: Weights, of shape (D+M, 4M).
- * - b: Biases, of shape (1, 4M).
- * - T: Length of example sequences (number of timesteps).
- * - D: Dimensionality of the input features.
- * - given_sequences: Whether `dout` is for all timesteps,
- * or just for the final timestep. This is based on whether
- * `return_sequences` was true in the forward pass.
- * - out0: Outputs from previous timestep, of shape (N, M).
- * Note: This is *optional* and could just be a matrix of zeros.
- * - c0: Initial cell state, of shape (N, M).
- * Note: This is *optional* and could just be a matrix of zeros.
- * - cache_out: Cache of outputs, of shape (T, N*M).
- * Note: This is used for performance during training.
- * - cache_c: Cache of cell state, of shape (T, N*M).
- * Note: This is used for performance during training.
- * - cache_ifog: Cache of intermediate values, of shape (T, N*4*M).
- * Note: This is used for performance during training.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, T*D).
- * - dW: Gradient wrt `W`, of shape (D+M, 4M).
- * - db: Gradient wrt `b`, of shape (1, 4M).
- * - dout0: Gradient wrt `out0`, of shape (N, M).
- * - dc0: Gradient wrt `c0`, of shape (N, M).
- */
- N = nrow(X)
- M = as.integer(ncol(W)/4)
- dX = matrix(0, rows=N, cols=T*D)
- dW = matrix(0, rows=D+M, cols=4*M)
- db = matrix(0, rows=1, cols=4*M)
- dout0 = matrix(0, rows=N, cols=M)
- dc0 = matrix(0, rows=N, cols=M)
- dct = dc
- if (!given_sequences) {
- # only given dout for output at final timestep, so prepend empty douts for all other timesteps
- dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout) # shape (N, T*M)
- }
-
- t = T
- for (iter in 1:T) { # each timestep in reverse order
- X_t = X[,(t-1)*D+1:t*D] # shape (N, D)
- dout_t = dout[,(t-1)*M+1:t*M] # shape (N, M)
- out_t = matrix(cache_out[t,], rows=N, cols=M) # shape (N, M)
- ct = matrix(cache_c[t,], rows=N, cols=M) # shape (N, M)
- if (t == 1) {
- out_prev = out0 # shape (N, M)
- c_prev = c0 # shape (N, M)
- }
- else {
- out_prev = matrix(cache_out[t-1,], rows=N, cols=M) # shape (N, M)
- c_prev = matrix(cache_c[t-1,], rows=N, cols=M) # shape (N, M)
- }
- input = cbind(X_t, out_prev) # shape (N, D+M)
- ifog = matrix(cache_ifog[t,], rows=N, cols=4*M)
- i = ifog[,1:M] # input gate, shape (N, M)
- f = ifog[,M+1:2*M] # forget gate, shape (N, M)
- o = ifog[,2*M+1:3*M] # output gate, shape (N, M)
- g = ifog[,3*M+1:4*M] # g gate, shape (N, M)
-
- tmp = tanh::backward(dout_t, ct)
- dct = dct + o*tmp # shape (N, M)
- tmp = tanh::forward(ct)
- do = tmp * dout_t # output gate, shape (N, M)
- df = c_prev * dct # forget gate, shape (N, M)
- dc_prev = f * dct # shape (N, M)
- di = g * dct # input gate, shape (N, M)
- dg = i * dct # g gate, shape (N, M)
-
- di_raw = i * (1-i) * di
- df_raw = f * (1-f) * df
- do_raw = o * (1-o) * do
- dg_raw = (1-g^2) * dg
- difog_raw = cbind(di_raw, cbind(df_raw, cbind(do_raw, dg_raw))) # shape (N, 4M)
-
- dW = dW + t(input) %*% difog_raw # shape (D+M, 4M)
- db = db + colSums(difog_raw) # shape (1, 4M)
- dinput = difog_raw %*% t(W) # shape (N, D+M)
- dX[,(t-1)*D+1:t*D] = dinput[,1:D]
- dout_prev = dinput[,D+1:D+M] # shape (N, M)
- if (t == 1) {
- dout0 = dout_prev # shape (N, M)
- dc0 = dc_prev # shape (N, M)
- }
- else {
- dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev # shape (N, M)
- dct = dc_prev # shape (N, M)
- }
- t = t - 1
- }
-}
-
-init = function(int N, int D, int M)
- return (matrix[double] W, matrix[double] b, matrix[double] out0, matrix[double] c0) {
- /*
- * Initialize the parameters of this layer.
- *
- * Note: This is just a convenience function, and parameters
- * may be initialized manually if needed.
- *
- * We use the Glorot uniform heuristic which limits the magnification
- * of inputs/gradients during forward/backward passes by scaling
- * uniform weights by a factor of sqrt(6/(fan_in + fan_out)).
- * - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
- *
- * Inputs:
- * - N: Number of examples in batch.
- * - D: Dimensionality of the input features (number of features).
- * - M: Number of neurons in this layer.
- *
- * Outputs:
- * - W: Weights, of shape (D+M, 4M).
- * - b: Biases, of shape (1, 4M).
- * - out0: All-zeros previous timestep output matrix, of shape (N, M).
- * - c0: All-zeros initial cell state matrix, of shape (N, M).
- */
- fan_in = D+M
- fan_out = 4*M
- scale = sqrt(6/(fan_in+fan_out))
- W = rand(rows=D+M, cols=4*M, min=-scale, max=scale, pdf="uniform")
- b = matrix(0, rows=1, cols=4*M)
- out0 = matrix(0, rows=N, cols=M)
- c0 = matrix(0, rows=N, cols=M)
-}
-
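Tying `init`, `forward`, and `backward` together, a minimal sketch of one forward/backward cycle through the layer; `N`, `T`, `D`, and `M` are hypothetical sizes, and the all-zeros `dc` reflects a cell state that is not used downstream:

```
source("nn/layers/lstm.dml") as lstm

N = 8   # hypothetical batch size
T = 5   # hypothetical sequence length
D = 10  # hypothetical feature dimensionality
M = 16  # hypothetical number of neurons

X = rand(rows=N, cols=T*D)
[W, b, out0, c0] = lstm::init(N, D, M)

# return_sequences=FALSE: `out` holds only the final timestep, shape (N, M)
[out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, FALSE, out0, c0)

dout = rand(rows=N, cols=M)     # toy upstream gradient for the final timestep
dc = matrix(0, rows=N, cols=M)  # cell state not used downstream
[dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, FALSE,
                                          out0, c0, cache_out, cache_c, cache_ifog)
```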
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
deleted file mode 100644
index fba1a4c..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
+++ /dev/null
@@ -1,159 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Max Pooling layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] out, int Hout, int Wout) {
- /*
- * Computes the forward pass for a 2D spatial max pooling layer.
- * The input data has N examples, each represented as a 3D volume
- * unrolled into a single vector.
- *
- * This implementation uses `im2col` internally for each image to
- * extract local image regions (patches) of each channel slice into
- * columns, and then performs max pooling over the patches to compute
- * the output maps.
- *
- * Inputs:
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * A typical value is 0.
- * - padw: Padding for left and right sides.
- * A typical value is 0.
- *
- * Outputs:
- * - out: Outputs, of shape (N, C*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- */
- N = nrow(X)
- Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
- Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
- pad_value = -1/0 # in max pooling we pad with -infinity
-
- # Create output volume
- out = matrix(0, rows=N, cols=C*Hout*Wout)
-
- # Max pooling - im2col implementation
- parfor (n in 1:N) { # all examples
- img = matrix(X[n,], rows=C, cols=Hin*Win) # reshape
-
- if (padh > 0 | padw > 0) {
- # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
- img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
- }
-
- img_maxes = matrix(0, rows=C, cols=Hout*Wout) # zeros
- parfor (c in 1:C) { # all channels
- # Extract local image slice patches into columns with im2col, of shape (Hf*Wf, Hout*Wout)
- img_slice_cols = util::im2col(img[c,], Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
-
- # Max pooling on patches
- img_maxes[c,] = colMaxs(img_slice_cols)
- }
-
- out[n,] = matrix(img_maxes, rows=1, cols=C*Hout*Wout)
- }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
- int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] dX) {
- /*
- * Computes the backward pass for a 2D spatial max pooling layer.
- * The input data has N examples, each represented as a 3D volume
- * unrolled into a single vector.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of
- * shape (N, C*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- * - X: Input data matrix, of shape (N, C*Hin*Win).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * A typical value is 0.
- * - padw: Padding for left and right sides.
- * A typical value is 0.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
- */
- N = nrow(X)
- pad_value = -1/0 # in max pooling we pad with -infinity
-
- # Create gradient volume
- dX = matrix(0, rows=N, cols=C*Hin*Win)
-
- # Gradient of max pooling
- parfor (n in 1:N, check=0) { # all examples
- img = matrix(X[n,], rows=C, cols=Hin*Win)
- if (padh > 0 | padw > 0) {
- # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
- img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
- }
-
- dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
- parfor (c in 1:C, check=0) { # all channels
- img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
- dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
- for (hout in 1:Hout) { # all output rows
- hin = (hout-1)*strideh + 1
- for (wout in 1:Wout) { # all output columns
- win = (wout-1)*stridew + 1
- img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
- max_val_ind = img_slice_patch == max(img_slice_patch) # max value indicator matrix
- # gradient passes through only for the max value(s) in this patch
- dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
- dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
- + dimg_slice_patch
- }
- }
- dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
- }
-
- if (padh > 0 | padw > 0) {
- # Unpad image gradient
- dimg = util::unpad_image(dimg, Hin, Win, padh, padw) # shape (C, Hin*Win)
- }
- dX[n,] = matrix(dimg, rows=1, cols=C*Hin*Win)
- }
-}
-
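A minimal sketch of calling this implementation (graduated path assumed; sizes hypothetical). With a 2x2 pool, stride 2, and no padding, each spatial dimension is halved:

```
source("nn/layers/max_pool2d.dml") as max_pool2d

N = 2  # hypothetical batch size
C = 3  # hypothetical number of channels
Hin = 8
Win = 8
X = rand(rows=N, cols=C*Hin*Win)

[out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)  # Hout = Wout = 4

dout = rand(rows=N, cols=C*Hout*Wout)  # toy upstream gradient
dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
```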
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
deleted file mode 100644
index 880f818..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
+++ /dev/null
@@ -1,103 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Max Pooling layer.
- *
- * This implementation uses a built-in operator for higher performance.
- */
-
-forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] out, int Hout, int Wout) {
- /*
- * Computes the forward pass for a 2D spatial max pooling layer.
- * The input data has N examples, each represented as a 3D volume
- * unrolled into a single vector.
- *
- * This implementation uses a built-in operator for higher
- * performance.
- *
- * Inputs:
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * A typical value is 0.
- * - padw: Padding for left and right sides.
- * A typical value is 0.
- *
- * Outputs:
- * - out: Outputs, of shape (N, C*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- */
- N = nrow(X)
- Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
- Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
-
- # Max pooling - built-in implementation
- out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
- stride=[strideh,stridew], padding=[padh,padw])
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
- int C, int Hin, int Win, int Hf, int Wf,
- int strideh, int stridew, int padh, int padw)
- return (matrix[double] dX) {
- /*
- * Computes the backward pass for a 2D spatial max pooling layer.
- * The input data has N examples, each represented as a 3D volume
- * unrolled into a single vector.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of
- * shape (N, C*Hout*Wout).
- * - Hout: Output height.
- * - Wout: Output width.
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - Hf: Filter height.
- * - Wf: Filter width.
- * - strideh: Stride over height.
- * - stridew: Stride over width.
- * - padh: Padding for top and bottom sides.
- * A typical value is 0.
- * - padw: Padding for left and right sides.
- * A typical value is 0.
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
- */
- N = nrow(X)
-
- # Gradient of max pooling
- dX = max_pool_backward(X, dout, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
- stride=[strideh,stridew], padding=[padh,padw])
-}
-
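Because this file exposes the same interface as `max_pool2d.dml`, it can be swapped in directly; a sketch under the same hypothetical sizes as above:

```
source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin

N = 2
C = 3
Hin = 8
Win = 8
X = rand(rows=N, cols=C*Hin*Win)

# Identical call signature to the im2col-based implementation above.
[out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
dout = rand(rows=N, cols=C*Hout*Wout)
dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
```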
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/relu.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/relu.dml b/scripts/staging/SystemML-NN/nn/layers/relu.dml
deleted file mode 100644
index 93a6e90..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/relu.dml
+++ /dev/null
@@ -1,59 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Rectified Linear Unit (ReLU) nonlinearity layer.
- */
-
-forward = function(matrix[double] X)
- return (matrix[double] out) {
- /*
- * Computes the forward pass for a ReLU nonlinearity layer.
- *
- * Performs an element-wise evaluation of `f(input) = max(0, input)`.
- *
- * Inputs:
- * - X: Inputs, of shape (any, any).
- *
- * Outputs:
- * - out: Outputs, of same shape as `X`.
- */
- out = max(X, 0)
-}
-
-backward = function(matrix[double] dout, matrix[double] X)
- return (matrix[double] dX) {
- /*
- * Computes the backward pass for a ReLU nonlinearity layer.
- *
- * Essentially performs a pass-through of the upstream gradient
- * for cells > 0.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of same shape as `X`.
- * - X: Previous input data matrix, of shape (any, any).
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of same shape as `X`.
- */
- dX = (X > 0) * dout
-}
-
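Finally, a minimal sketch for the ReLU layer (graduated path assumed; the input and upstream gradient are hypothetical toy matrices):

```
source("nn/layers/relu.dml") as relu

X = rand(rows=4, cols=5, min=-1, max=1)  # toy input containing both signs
out = relu::forward(X)                   # elementwise max(0, X)

dout = rand(rows=4, cols=5)   # toy upstream gradient
dX = relu::backward(dout, X)  # passes dout through only where X > 0
```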