Posted to commits@systemml.apache.org by du...@apache.org on 2017/04/11 00:23:04 UTC
[1/2] incubator-systemml git commit: [SYSTEMML-1463] Rename
`batch_norm.dml` and `spatial_batch_norm.dml`
Repository: incubator-systemml
Updated Branches:
refs/heads/master fb55a74d1 -> 651725651
[SYSTEMML-1463] Rename `batch_norm.dml` and `spatial_batch_norm.dml`
Rename `batch_norm.dml` and `spatial_batch_norm.dml` to
`batch_norm1d.dml` and `batch_norm2d.dml`.
Closes #453.
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/f5ef628c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/f5ef628c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/f5ef628c
Branch: refs/heads/master
Commit: f5ef628c0dbe4e5ce8dec61f5e05c5597e341c95
Parents: fb55a74
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Mon Apr 10 17:20:13 2017 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Mon Apr 10 17:20:13 2017 -0700
----------------------------------------------------------------------
.../SystemML-NN/nn/layers/batch_norm.dml | 209 ----------------
.../SystemML-NN/nn/layers/batch_norm1d.dml | 210 ++++++++++++++++
.../SystemML-NN/nn/layers/batch_norm2d.dml | 238 +++++++++++++++++++
.../nn/layers/spatial_batch_norm.dml | 235 ------------------
.../staging/SystemML-NN/nn/test/grad_check.dml | 68 +++---
.../staging/SystemML-NN/nn/test/run_tests.dml | 8 +-
scripts/staging/SystemML-NN/nn/test/test.dml | 24 +-
7 files changed, 495 insertions(+), 497 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f5ef628c/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
deleted file mode 100644
index caad100..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml
+++ /dev/null
@@ -1,209 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Batch Normalization layer.
- */
-
-forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
- string mode, matrix[double] ema_mean, matrix[double] ema_var,
- double mu, double epsilon)
- return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
- matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
- /*
- * Computes the forward pass for a batch normalization layer.
- *
- * A batch normalization layer uses the per-feature sample mean and
- * per-feature uncorrected sample variance during training to
- * normalize each feature of the input data. Additionally, it
- * introduces learnable parameters (gamma, beta) to control the
- * amount of normalization.
- *
- * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
- *
- * This implementation maintains exponential moving averages of the
- * mean and variance during training for use during testing.
- *
- * Reference:
- * - Batch Normalization: Accelerating Deep Network Training by
- * Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
- * - https://arxiv.org/abs/1502.03167
- *
- * Inputs:
- * - X: Inputs, of shape (N, D).
- * - gamma: Scale parameters, of shape (1, D).
- * - beta: Shift parameters, of shape (1, D).
- * - mode: 'train' or 'test' to indicate if the model is currently
- * being trained or tested. During training, the current batch
- * mean and variance will be used to normalize the inputs, while
- * during testing, the exponential average of the mean and
- * variance over all previous batches will be used.
- * - ema_mean: Exponential moving average of the mean, of
- * shape (1, D).
- * - ema_var: Exponential moving average of the variance, of
- * shape (1, D).
- * - mu: Momentum value for moving averages.
- * Typical values are in the range of [0.9, 0.999].
- * - epsilon: Smoothing term to avoid divide by zero errors.
- * Typical values are in the range of [1e-5, 1e-3].
- *
- * Outputs:
- * - out: Outputs, of shape (N, D).
- * - ema_mean_upd: Updated exponential moving average of the mean,
- * of shape (1, D).
- * - ema_var_upd: Updated exponential moving average of the variance,
- * of shape (1, D).
- * - cache_mean: Cache of the batch mean, of shape (1, D).
- * Note: This is used for performance during training.
- * - cache_var: Cache of the batch variance, of shape (1, D).
- * Note: This is used for performance during training.
- * - cache_norm: Cache of the normalized inputs, of shape (N, D).
- * Note: This is used for performance during training.
- */
- N = nrow(X)
-
- if(mode == 'train') {
- # Compute feature-wise mean and variance
- mean = colMeans(X) # shape (1, D)
- # var = (1/N) * colSums((X-mean)^2)
- var = colVars(X) * ((N-1)/N) # compute uncorrected variance, of shape (1, D)
- # Update moving averages
- ema_mean_upd = mu*ema_mean + (1-mu)*mean
- ema_var_upd = mu*ema_var + (1-mu)*var
- }
- else {
- # Use moving averages of mean and variance during testing
- mean = ema_mean
- var = ema_var
- ema_mean_upd = ema_mean
- ema_var_upd = ema_var
- }
-
- # Normalize, shift, and scale
- # norm = (X-mean)*(var+epsilon)^(-1/2)
- norm = (X-mean) / sqrt(var+epsilon) # shape (N, D)
- out = norm*gamma + beta # shape (N, D)
-
- # Save variable for backward pass
- cache_mean = mean
- cache_var = var
- cache_norm = norm
-}
-
-backward = function(matrix[double] dout, matrix[double] out,
- matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
- matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
- matrix[double] X, matrix[double] gamma, matrix[double] beta,
- string mode, matrix[double] ema_mean, matrix[double] ema_var,
- double mu, double epsilon)
- return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
- /*
- * Computes the backward pass for a batch normalization layer.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of shape (N, D).
- * - out: Outputs from the forward pass, of shape (N, D).
- * - ema_mean_upd: Updated exponential moving average of the mean
- * from the forward pass, of shape (1, D).
- * - ema_var_upd: Updated exponential moving average of the variance
- * from the forward pass, of shape (1, D).
- * - cache_mean: Cache of the batch mean from the forward pass, of
- * shape (1, D). Note: This is used for performance during
- * training.
- * - cache_var: Cache of the batch variance from the forward pass,
- * of shape (1, D). Note: This is used for performance during
- * training.
- * - cache_norm: Cache of the normalized inputs from the forward
- * pass, of shape (N, D). Note: This is used for performance
- * during training.
- * - X: Inputs, of shape (N, D).
- * - gamma: Scale parameters, of shape (1, D).
- * - beta: Shift parameters, of shape (1, D).
- * - mode: 'train' or 'test' to indicate if the model is currently
- * being trained or tested. During training, the current batch
- * mean and variance will be used to normalize the inputs, while
- * during testing, the exponential average of the mean and
- * variance over all previous batches will be used.
- * - ema_mean: Exponential moving average of the mean, of
- * shape (1, D).
- * - ema_var: Exponential moving average of the variance, of
- * shape (1, D).
- * - mu: Momentum value for moving averages.
- * Typical values are in the range of [0.9, 0.999].
- * - epsilon: Smoothing term to avoid divide by zero errors.
- * Typical values are in the range of [1e-5, 1e-3].
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, D).
- * - dgamma: Gradient wrt `W`, of shape (1, D).
- * - dbeta: Gradient wrt `b`, of shape (1, D).
- *
- */
- N = nrow(X)
- mean = cache_mean
- var = cache_var
- norm = cache_norm
- centered = X-mean
-
- if (mode == 'train') {
- # Compute gradients during training
- dgamma = colSums(norm*dout) # shape (1, D)
- dbeta = colSums(dout) # shape (1, D)
- dnorm = dout * gamma # shape (N, D)
- dvar = (-1/2) * colSums(centered * (var+epsilon)^(-3/2) * dnorm) # shape (1, D)
- dmean = colSums((-dnorm/sqrt(var+epsilon)) + ((-2/N)*centered*dvar)) # shape (1, D)
- dX = (dnorm/sqrt(var+epsilon)) + ((2/N)*centered*dvar) + ((1/N)*dmean) # shape (N, D)
- }
- else {
- # Compute gradients during testing
- dgamma = colSums(norm*dout) # shape (1, D)
- dbeta = colSums(dout) # shape (1, D)
- dnorm = dout * gamma # shape (N, D)
- dX = dnorm / sqrt(var+epsilon) # shape (N, D)
- }
-}
-
-init = function(int D)
- return (matrix[double] gamma, matrix[double] beta,
- matrix[double] ema_mean, matrix[double] ema_var) {
- /*
- * Initialize the parameters of this layer.
- *
- * Note: This is just a convenience function, and parameters
- * may be initialized manually if needed.
- *
- * Inputs:
- * - D: Dimensionality of the input features (number of features).
- *
- * Outputs:
- * - gamma: Scale parameters, of shape (1, D).
- * - beta: Shift parameters, of shape (1, D).
- * - ema_mean: Exponential moving average of the mean, of
- * shape (1, D).
- * - ema_var: Exponential moving average of the variance, of
- * shape (1, D).
- */
- gamma = matrix(1, rows=1, cols=D)
- beta = matrix(0, rows=1, cols=D)
- ema_mean = matrix(0, rows=1, cols=D)
- ema_var = matrix(1, rows=1, cols=D)
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f5ef628c/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml
new file mode 100644
index 0000000..9ecbd77
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml
@@ -0,0 +1,210 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 1D Batch Normalization layer.
+ */
+
+forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
+ string mode, matrix[double] ema_mean, matrix[double] ema_var,
+ double mu, double epsilon)
+ return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
+ matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
+ /*
+ * Computes the forward pass for a 1D batch normalization layer.
+ * The input data has N examples, each with D features.
+ *
+ * A batch normalization layer uses the per-feature sample mean and
+ * per-feature uncorrected sample variance during training to
+ * normalize each feature of the input data. Additionally, it
+ * introduces learnable parameters (gamma, beta) to control the
+ * amount of normalization.
+ *
+ * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
+ *
+ * This implementation maintains exponential moving averages of the
+ * mean and variance during training for use during testing.
+ *
+ * Reference:
+ * - Batch Normalization: Accelerating Deep Network Training by
+ * Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
+ * - https://arxiv.org/abs/1502.03167
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, D).
+ * - gamma: Scale parameters, of shape (1, D).
+ * - beta: Shift parameters, of shape (1, D).
+ * - mode: 'train' or 'test' to indicate if the model is currently
+ * being trained or tested. During training, the current batch
+ * mean and variance will be used to normalize the inputs, while
+ * during testing, the exponential average of the mean and
+ * variance over all previous batches will be used.
+ * - ema_mean: Exponential moving average of the mean, of
+ * shape (1, D).
+ * - ema_var: Exponential moving average of the variance, of
+ * shape (1, D).
+ * - mu: Momentum value for moving averages.
+ * Typical values are in the range of [0.9, 0.999].
+ * - epsilon: Smoothing term to avoid divide by zero errors.
+ * Typical values are in the range of [1e-5, 1e-3].
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, D).
+ * - ema_mean_upd: Updated exponential moving average of the mean,
+ * of shape (1, D).
+ * - ema_var_upd: Updated exponential moving average of the variance,
+ * of shape (1, D).
+ * - cache_mean: Cache of the batch mean, of shape (1, D).
+ * Note: This is used for performance during training.
+ * - cache_var: Cache of the batch variance, of shape (1, D).
+ * Note: This is used for performance during training.
+ * - cache_norm: Cache of the normalized inputs, of shape (N, D).
+ * Note: This is used for performance during training.
+ */
+ N = nrow(X)
+
+ if(mode == 'train') {
+ # Compute feature-wise mean and variance
+ mean = colMeans(X) # shape (1, D)
+ # var = (1/N) * colSums((X-mean)^2)
+ var = colVars(X) * ((N-1)/N) # compute uncorrected variance, of shape (1, D)
+ # Update moving averages
+ ema_mean_upd = mu*ema_mean + (1-mu)*mean
+ ema_var_upd = mu*ema_var + (1-mu)*var
+ }
+ else {
+ # Use moving averages of mean and variance during testing
+ mean = ema_mean
+ var = ema_var
+ ema_mean_upd = ema_mean
+ ema_var_upd = ema_var
+ }
+
+ # Normalize, shift, and scale
+ # norm = (X-mean)*(var+epsilon)^(-1/2)
+ norm = (X-mean) / sqrt(var+epsilon) # shape (N, D)
+ out = norm*gamma + beta # shape (N, D)
+
+ # Save variables for backward pass
+ cache_mean = mean
+ cache_var = var
+ cache_norm = norm
+}
+
+backward = function(matrix[double] dout, matrix[double] out,
+ matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
+ matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
+ matrix[double] X, matrix[double] gamma, matrix[double] beta,
+ string mode, matrix[double] ema_mean, matrix[double] ema_var,
+ double mu, double epsilon)
+ return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
+ /*
+ * Computes the backward pass for a 1D batch normalization layer.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of shape (N, D).
+ * - out: Outputs from the forward pass, of shape (N, D).
+ * - ema_mean_upd: Updated exponential moving average of the mean
+ * from the forward pass, of shape (1, D).
+ * - ema_var_upd: Updated exponential moving average of the variance
+ * from the forward pass, of shape (1, D).
+ * - cache_mean: Cache of the batch mean from the forward pass, of
+ * shape (1, D). Note: This is used for performance during
+ * training.
+ * - cache_var: Cache of the batch variance from the forward pass,
+ * of shape (1, D). Note: This is used for performance during
+ * training.
+ * - cache_norm: Cache of the normalized inputs from the forward
+ * pass, of shape (N, D). Note: This is used for performance
+ * during training.
+ * - X: Inputs, of shape (N, D).
+ * - gamma: Scale parameters, of shape (1, D).
+ * - beta: Shift parameters, of shape (1, D).
+ * - mode: 'train' or 'test' to indicate if the model is currently
+ * being trained or tested. During training, the current batch
+ * mean and variance will be used to normalize the inputs, while
+ * during testing, the exponential average of the mean and
+ * variance over all previous batches will be used.
+ * - ema_mean: Exponential moving average of the mean, of
+ * shape (1, D).
+ * - ema_var: Exponential moving average of the variance, of
+ * shape (1, D).
+ * - mu: Momentum value for moving averages.
+ * Typical values are in the range of [0.9, 0.999].
+ * - epsilon: Smoothing term to avoid divide by zero errors.
+ * Typical values are in the range of [1e-5, 1e-3].
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, D).
+ * - dgamma: Gradient wrt `gamma`, of shape (1, D).
+ * - dbeta: Gradient wrt `beta`, of shape (1, D).
+ *
+ */
+ N = nrow(X)
+ mean = cache_mean
+ var = cache_var
+ norm = cache_norm
+ centered = X-mean
+
+ if (mode == 'train') {
+ # Compute gradients during training
+ dgamma = colSums(dout*norm) # shape (1, D)
+ dbeta = colSums(dout) # shape (1, D)
+ dnorm = dout * gamma # shape (N, D)
+ dvar = (-1/2) * colSums(centered * (var+epsilon)^(-3/2) * dnorm) # shape (1, D)
+ dmean = colSums((-dnorm/sqrt(var+epsilon)) + ((-2/N)*centered*dvar)) # shape (1, D)
+ dX = (dnorm/sqrt(var+epsilon)) + ((2/N)*centered*dvar) + ((1/N)*dmean) # shape (N, D)
+ }
+ else {
+ # Compute gradients during testing
+ dgamma = colSums(dout*norm) # shape (1, D)
+ dbeta = colSums(dout) # shape (1, D)
+ dnorm = dout * gamma # shape (N, D)
+ dX = dnorm / sqrt(var+epsilon) # shape (N, D)
+ }
+}
+
+init = function(int D)
+ return (matrix[double] gamma, matrix[double] beta,
+ matrix[double] ema_mean, matrix[double] ema_var) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - D: Dimensionality of the input features (number of features).
+ *
+ * Outputs:
+ * - gamma: Scale parameters, of shape (1, D).
+ * - beta: Shift parameters, of shape (1, D).
+ * - ema_mean: Exponential moving average of the mean, of
+ * shape (1, D).
+ * - ema_var: Exponential moving average of the variance, of
+ * shape (1, D).
+ */
+ gamma = matrix(1, rows=1, cols=D)
+ beta = matrix(0, rows=1, cols=D)
+ ema_mean = matrix(0, rows=1, cols=D)
+ ema_var = matrix(1, rows=1, cols=D)
+}
+
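For reference, a minimal usage sketch of the renamed 1D layer, assuming illustrative sizes, random data, and typical hyperparameter values; it reuses only the init/forward signatures defined in the file above:

source("nn/layers/batch_norm1d.dml") as batch_norm1d

N = 4  # number of examples (illustrative)
D = 3  # number of features (illustrative)
X = rand(rows=N, cols=D)  # dummy input batch
mu = 0.9  # momentum for the moving averages
eps = 1e-5  # smoothing term
[gamma, beta, ema_mean, ema_var] = batch_norm1d::init(D)
# training mode: normalize with the batch statistics and update the moving averages
[out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
    batch_norm1d::forward(X, gamma, beta, 'train', ema_mean, ema_var, mu, eps)
# test mode would instead pass 'test' and reuse the tracked ema_mean/ema_var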
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f5ef628c/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml
new file mode 100644
index 0000000..fb25b2c
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml
@@ -0,0 +1,238 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D (Spatial) Batch Normalization layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
+ int C, int Hin, int Win, string mode,
+ matrix[double] ema_mean, matrix[double] ema_var,
+ double mu, double epsilon)
+ return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
+ matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
+ /*
+ * Computes the forward pass for a 2D (spatial) batch normalization
+ * layer. The input data has N examples, each represented as a 3D
+ * volume unrolled into a single vector.
+ *
+ * A spatial batch normalization layer uses the per-channel sample
+ * mean and per-channel uncorrected sample variance during training
+ * to normalize each channel of the input data. Additionally, it
+ * introduces learnable parameters (gamma, beta) to control the
+ * amount of normalization.
+ *
+ * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
+ *
+ * This implementation maintains exponential moving averages of the
+ * mean and variance during training for use during testing.
+ *
+ * Reference:
+ * - Batch Normalization: Accelerating Deep Network Training by
+ * Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
+ * - https://arxiv.org/abs/1502.03167
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - mode: 'train' or 'test' to indicate if the model is currently
+ * being trained or tested. During training, the current batch
+ * mean and variance will be used to normalize the inputs, while
+ * during testing, the exponential average of the mean and
+ * variance over all previous batches will be used.
+ * - ema_mean: Exponential moving average of the mean, of
+ * shape (C, 1).
+ * - ema_var: Exponential moving average of the variance, of
+ * shape (C, 1).
+ * - mu: Momentum value for moving averages.
+ * Typical values are in the range of [0.9, 0.999].
+ * - epsilon: Smoothing term to avoid divide by zero errors.
+ * Typical values are in the range of [1e-5, 1e-3].
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, C*Hin*Win).
+ * - ema_mean_upd: Updated exponential moving average of the mean,
+ * of shape (C, 1).
+ * - ema_var_upd: Updated exponential moving average of the variance,
+ * of shape (C, 1).
+ * - cache_mean: Cache of the batch mean, of shape (C, 1).
+ * Note: This is used for performance during training.
+ * - cache_var: Cache of the batch variance, of shape (C, 1).
+ * Note: This is used for performance during training.
+ * - cache_norm: Cache of the normalized inputs, of
+ * shape (C, N*Hin*Win). Note: This is used for performance
+ * during training.
+ */
+ N = nrow(X)
+
+ if(mode == 'train') {
+ # Compute channel-wise mean and variance
+ # Since we don't have tensors, we will compute the means and variances in a piece-wise fashion.
+ # - mean of total group is mean of subgroup means
+ # - variance is the mean of the subgroup variances + the variance of the subgroup means
+ subgrp_means = matrix(colMeans(X), rows=C, cols=Hin*Win)
+ subgrp_vars = matrix(colVars(X) * ((N-1)/N), rows=C, cols=Hin*Win) # uncorrected variances
+ mean = rowMeans(subgrp_means) # shape (C, 1)
+ var = rowMeans(subgrp_vars) + rowVars(subgrp_means)*(((Hin*Win)-1)/(Hin*Win)) # shape (C, 1)
+ # Update moving averages
+ ema_mean_upd = mu*ema_mean + (1-mu)*mean
+ ema_var_upd = mu*ema_var + (1-mu)*var
+ }
+ else {
+ # Use moving averages of mean and variance during testing
+ mean = ema_mean
+ var = ema_var
+ ema_mean_upd = ema_mean
+ ema_var_upd = ema_var
+ }
+
+ # Normalize, shift, and scale
+ # norm = (X-mean)*(var+epsilon)^(-1/2)
+ # = (X-mean) / sqrt(var+epsilon)
+ centered = bias_add(X, -mean) # shape (N, C*Hin*Win)
+ norm = bias_multiply(centered, 1/sqrt(var+epsilon)) # shape (N, C*Hin*Win)
+ # out = norm*gamma + beta
+ scaled = bias_multiply(norm, gamma) # shape (N, C*Hin*Win)
+ out = bias_add(scaled, beta) # shape (N, C*Hin*Win)
+
+ # Save variables for backward pass
+ cache_mean = mean
+ cache_var = var
+ cache_norm = norm
+}
+
+backward = function(matrix[double] dout, matrix[double] out,
+ matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
+ matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
+ matrix[double] X, matrix[double] gamma, matrix[double] beta,
+ int C, int Hin, int Win, string mode,
+ matrix[double] ema_mean, matrix[double] ema_var,
+ double mu, double epsilon)
+ return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
+ /*
+ * Computes the backward pass for a 2D (spatial) batch normalization
+ * layer.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
+ * - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
+ * - ema_mean_upd: Updated exponential moving average of the mean
+ * from the forward pass, of shape (C, 1).
+ * - ema_var_upd: Updated exponential moving average of the variance
+ * from the forward pass, of shape (C, 1).
+ * - cache_mean: Cache of the batch mean from the forward pass, of
+ * shape (C, 1). Note: This is used for performance during
+ * training.
+ * - cache_var: Cache of the batch variance from the forward pass,
+ * of shape (C, 1). Note: This is used for performance during
+ * training.
+ * - cache_norm: Cache of the normalized inputs from the forward
+ * pass, of shape (C, N*Hin*Win). Note: This is used for
+ * performance during training.
+ * - X: Input data matrix to the forward pass, of
+ * shape (N, C*Hin*Win).
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - mode: 'train' or 'test' to indicate if the model is currently
+ * being trained or tested. During training, the current batch
+ * mean and variance will be used to normalize the inputs, while
+ * during testing, the exponential average of the mean and
+ * variance over all previous batches will be used.
+ * - ema_mean: Exponential moving average of the mean, of
+ * shape (C, 1).
+ * - ema_var: Exponential moving average of the variance, of
+ * shape (C, 1).
+ * - mu: Momentum value for moving averages.
+ * Typical values are in the range of [0.9, 0.999].
+ * - epsilon: Smoothing term to avoid divide by zero errors.
+ * Typical values are in the range of [1e-5, 1e-3].
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ * - dgamma: Gradient wrt `gamma`, of shape (C, 1).
+ * - dbeta: Gradient wrt `beta`, of shape (C, 1).
+ *
+ */
+ N = nrow(X)
+ mean = cache_mean
+ var = cache_var
+ norm = cache_norm
+ centered = bias_add(X, -mean) # shape (N, C*Hin*Win)
+
+ if (mode == 'train') {
+ # Compute gradients during training
+ dgamma = util::channel_sums(dout*norm, C, Hin, Win) # shape (C, 1)
+ dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1)
+ dnorm = bias_multiply(dout, gamma) # shape (N, C*Hin*Win)
+ dvar = util::channel_sums((-1/2) * bias_multiply(centered, (var+epsilon)^(-3/2)) * dnorm,
+ C, Hin, Win) # shape (C, 1)
+ dmean_norm_branch = util::channel_sums(bias_multiply(dnorm, -1/sqrt(var+epsilon)), C, Hin, Win)
+ dmean_var_branch = util::channel_sums((-2/(N*Hin*Win)) * centered, C, Hin, Win)
+ dmean_var_branch = dmean_var_branch * dvar # we can't use a function within an expression yet
+ dmean = dmean_norm_branch + dmean_var_branch # shape (C, 1)
+ dX_norm_branch = bias_multiply(dnorm, 1/sqrt(var+epsilon))
+ dX_mean_branch = (1/(N*Hin*Win)) * bias_add(matrix(0, rows=1, cols=C*Hin*Win), dmean)
+ dX_var_branch = (2/(N*Hin*Win)) * bias_multiply(centered, dvar)
+ dX = dX_norm_branch + dX_mean_branch + dX_var_branch # shape (N, C*Hin*Win)
+ }
+ else {
+ # Compute gradients during testing
+ dgamma = util::channel_sums(dout*norm, C, Hin, Win) # shape (C, 1)
+ dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1)
+ dnorm = bias_multiply(dout, gamma) # shape (N, C*Hin*Win)
+ dX = bias_multiply(dnorm, 1/sqrt(var+epsilon)) # shape (N, C*Hin*Win)
+ }
+}
+
+init = function(int C)
+ return (matrix[double] gamma, matrix[double] beta,
+ matrix[double] ema_mean, matrix[double] ema_var) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - C: Number of input channels (dimensionality of input depth).
+ *
+ * Outputs:
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ * - ema_mean: Exponential moving average of the mean, of
+ * shape (C, 1).
+ * - ema_var: Exponential moving average of the variance, of
+ * shape (C, 1).
+ */
+ gamma = matrix(1, rows=C, cols=1)
+ beta = matrix(0, rows=C, cols=1)
+ ema_mean = matrix(0, rows=C, cols=1)
+ ema_var = matrix(1, rows=C, cols=1)
+}
+
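Similarly, a minimal usage sketch of the renamed 2D (spatial) layer with made-up sizes and random data; each row of X holds one example's (C, Hin, Win) volume unrolled into a vector, and only the init/forward signatures defined above are used:

source("nn/layers/batch_norm2d.dml") as batch_norm2d

N = 2    # number of examples (illustrative)
C = 3    # channels
Hin = 4  # input height
Win = 4  # input width
X = rand(rows=N, cols=C*Hin*Win)  # each row: one unrolled (C, Hin, Win) volume
mu = 0.9
eps = 1e-5
[gamma, beta, ema_mean, ema_var] = batch_norm2d::init(C)
[out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
    batch_norm2d::forward(X, gamma, beta, C, Hin, Win, 'train', ema_mean, ema_var, mu, eps)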
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f5ef628c/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml b/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
deleted file mode 100644
index 6e57b05..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml
+++ /dev/null
@@ -1,235 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Spatial Batch Normalization layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
- int C, int Hin, int Win, string mode,
- matrix[double] ema_mean, matrix[double] ema_var,
- double mu, double epsilon)
- return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
- matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) {
- /*
- * Computes the forward pass for a spatial batch normalization layer.
- *
- * A spatial batch normalization layer uses the per-channel sample
- * mean and per-channel uncorrected sample variance during training
- * to normalize each channel of the input data. Additionally, it
- * introduces learnable parameters (gamma, beta) to control the
- * amount of normalization.
- *
- * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta`
- *
- * This implementation maintains exponential moving averages of the
- * mean and variance during training for use during testing.
- *
- * Reference:
- * - Batch Normalization: Accelerating Deep Network Training by
- * Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015
- * - https://arxiv.org/abs/1502.03167
- *
- * Inputs:
- * - X: Inputs, of shape (N, C*Hin*Win).
- * - gamma: Scale parameters, of shape (C, 1).
- * - beta: Shift parameters, of shape (C, 1).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - mode: 'train' or 'test' to indicate if the model is currently
- * being trained or tested. During training, the current batch
- * mean and variance will be used to normalize the inputs, while
- * during testing, the exponential average of the mean and
- * variance over all previous batches will be used.
- * - ema_mean: Exponential moving average of the mean, of
- * shape (C, 1).
- * - ema_var: Exponential moving average of the variance, of
- * shape (C, 1).
- * - mu: Momentum value for moving averages.
- * Typical values are in the range of [0.9, 0.999].
- * - epsilon: Smoothing term to avoid divide by zero errors.
- * Typical values are in the range of [1e-5, 1e-3].
- *
- * Outputs:
- * - out: Outputs, of shape (N, C*Hin*Win).
- * - ema_mean_upd: Updated exponential moving average of the mean,
- * of shape (C, 1).
- * - ema_var_upd: Updated exponential moving average of the variance,
- * of shape (C, 1).
- * - cache_mean: Cache of the batch mean, of shape (C, 1).
- * Note: This is used for performance during training.
- * - cache_var: Cache of the batch variance, of shape (C, 1).
- * Note: This is used for performance during training.
- * - cache_norm: Cache of the normalized inputs, of
- * shape (C, N*Hin*Win). Note: This is used for performance
- * during training.
- */
- N = nrow(X)
-
- if(mode == 'train') {
- # Compute channel-wise mean and variance
- # Since we don't have tensors, we will compute the means and variances in a piece-wise fashion.
- # - mean of total group is mean of subgroup means
- # - variance is the mean of the subgroup variances + the variance of the subgroup means
- subgrp_means = matrix(colMeans(X), rows=C, cols=Hin*Win)
- subgrp_vars = matrix(colVars(X) * ((N-1)/N), rows=C, cols=Hin*Win) # uncorrected variances
- mean = rowMeans(subgrp_means) # shape (C, 1)
- var = rowMeans(subgrp_vars) + rowVars(subgrp_means)*(((Hin*Win)-1)/(Hin*Win)) # shape (C, 1)
- # Update moving averages
- ema_mean_upd = mu*ema_mean + (1-mu)*mean
- ema_var_upd = mu*ema_var + (1-mu)*var
- }
- else {
- # Use moving averages of mean and variance during testing
- mean = ema_mean
- var = ema_var
- ema_mean_upd = ema_mean
- ema_var_upd = ema_var
- }
-
- # Normalize, shift, and scale
- # norm = (X-mean)*(var+epsilon)^(-1/2)
- # = (X-mean) / sqrt(var+epsilon)
- centered = bias_add(X, -mean) # shape (N, C*Hin*Win)
- norm = bias_multiply(centered, 1/sqrt(var+epsilon)) # shape (N, C*Hin*Win)
- # out = norm*gamma + beta
- scaled = bias_multiply(norm, gamma) # shape (N, C*Hin*Win)
- out = bias_add(scaled, beta) # shape (N, C*Hin*Win)
-
- # Save variable for backward pass
- cache_mean = mean
- cache_var = var
- cache_norm = norm
-}
-
-backward = function(matrix[double] dout, matrix[double] out,
- matrix[double] ema_mean_upd, matrix[double] ema_var_upd,
- matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm,
- matrix[double] X, matrix[double] gamma, matrix[double] beta,
- int C, int Hin, int Win, string mode,
- matrix[double] ema_mean, matrix[double] ema_var,
- double mu, double epsilon)
- return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
- /*
- * Computes the backward pass for a spatial batch normalization layer.
- *
- * Inputs:
- * - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
- * - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
- * - ema_mean_upd: Updated exponential moving average of the mean
- * from the forward pass, of shape (C, 1).
- * - ema_var_upd: Updated exponential moving average of the variance
- * from the forward pass, of shape (C, 1).
- * - cache_mean: Cache of the batch mean from the forward pass, of
- * shape (C, 1). Note: This is used for performance during
- * training.
- * - cache_var: Cache of the batch variance from the forward pass,
- * of shape (C, 1). Note: This is used for performance during
- * training.
- * - cache_norm: Cache of the normalized inputs from the forward
- * pass, of shape (C, N*Hin*Win). Note: This is used for
- * performance during training.
- * - X: Input data matrix to the forward pass, of
- * shape (N, C*Hin*Win).
- * - gamma: Scale parameters, of shape (C, 1).
- * - beta: Shift parameters, of shape (C, 1).
- * - C: Number of input channels (dimensionality of input depth).
- * - Hin: Input height.
- * - Win: Input width.
- * - mode: 'train' or 'test' to indicate if the model is currently
- * being trained or tested. During training, the current batch
- * mean and variance will be used to normalize the inputs, while
- * during testing, the exponential average of the mean and
- * variance over all previous batches will be used.
- * - ema_mean: Exponential moving average of the mean, of
- * shape (C, 1).
- * - ema_var: Exponential moving average of the variance, of
- * shape (C, 1).
- * - mu: Momentum value for moving averages.
- * Typical values are in the range of [0.9, 0.999].
- * - epsilon: Smoothing term to avoid divide by zero errors.
- * Typical values are in the range of [1e-5, 1e-3].
- *
- * Outputs:
- * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
- * - dgamma: Gradient wrt `W`, of shape (C, 1).
- * - dbeta: Gradient wrt `b`, of shape (C, 1).
- *
- */
- N = nrow(X)
- mean = cache_mean
- var = cache_var
- norm = cache_norm
- centered = bias_add(X, -mean) # shape (N, C*Hin*Win)
-
- if (mode == 'train') {
- # Compute gradients during training
- dgamma = util::channel_sums(norm*dout, C, Hin, Win) # shape (C, 1)
- dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1)
- dnorm = bias_multiply(dout, gamma) # shape (N, C*Hin*Win)
- dvar = util::channel_sums((-1/2) * bias_multiply(centered, (var+epsilon)^(-3/2)) * dnorm,
- C, Hin, Win) # shape (C, 1)
- dmean_norm_branch = util::channel_sums(bias_multiply(dnorm, -1/sqrt(var+epsilon)), C, Hin, Win)
- dmean_var_branch = util::channel_sums((-2/(N*Hin*Win)) * centered, C, Hin, Win)
- dmean_var_branch = dmean_var_branch * dvar # we can't use a function within an expression yet
- dmean = dmean_norm_branch + dmean_var_branch # shape (C, 1)
- dX_norm_branch = bias_multiply(dnorm, 1/sqrt(var+epsilon))
- dX_mean_branch = (1/(N*Hin*Win)) * bias_add(matrix(0, rows=1, cols=C*Hin*Win), dmean)
- dX_var_branch = (2/(N*Hin*Win)) * bias_multiply(centered, dvar)
- dX = dX_norm_branch + dX_mean_branch + dX_var_branch # shape (N, C*Hin*Win)
- }
- else {
- # Compute gradients during testing
- dgamma = util::channel_sums(norm*dout, C, Hin, Win) # shape (C, 1)
- dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1)
- dnorm = bias_multiply(dout, gamma) # shape (N, C*Hin*Win)
- dX = bias_multiply(dnorm, 1/sqrt(var+epsilon)) # shape (N, C*Hin*Win)
- }
-}
-
-init = function(int C)
- return (matrix[double] gamma, matrix[double] beta,
- matrix[double] ema_mean, matrix[double] ema_var) {
- /*
- * Initialize the parameters of this layer.
- *
- * Note: This is just a convenience function, and parameters
- * may be initialized manually if needed.
- *
- * Inputs:
- * - C: Number of input channels (dimensionality of input depth).
- *
- * Outputs:
- * - gamma: Scale parameters, of shape (C, 1).
- * - beta: Shift parameters, of shape (C, 1).
- * - ema_mean: Exponential moving average of the mean, of
- * shape (C, 1).
- * - ema_var: Exponential moving average of the variance, of
- * shape (C, 1).
- */
- gamma = matrix(1, rows=C, cols=1)
- beta = matrix(0, rows=C, cols=1)
- ema_mean = matrix(0, rows=C, cols=1)
- ema_var = matrix(1, rows=C, cols=1)
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f5ef628c/scripts/staging/SystemML-NN/nn/test/grad_check.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/grad_check.dml b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
index 1b42b67..f21811c 100644
--- a/scripts/staging/SystemML-NN/nn/test/grad_check.dml
+++ b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
@@ -23,7 +23,8 @@
* Gradient checks for various architectures.
*/
source("nn/layers/affine.dml") as affine
-source("nn/layers/batch_norm.dml") as batch_norm
+source("nn/layers/batch_norm1d.dml") as batch_norm1d
+source("nn/layers/batch_norm2d.dml") as batch_norm2d
source("nn/layers/conv2d.dml") as conv2d
source("nn/layers/conv2d_builtin.dml") as conv2d_builtin
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
@@ -40,7 +41,6 @@ source("nn/layers/relu.dml") as relu
source("nn/layers/rnn.dml") as rnn
source("nn/layers/sigmoid.dml") as sigmoid
source("nn/layers/softmax.dml") as softmax
-source("nn/layers/spatial_batch_norm.dml") as spatial_batch_norm
source("nn/layers/tanh.dml") as tanh
source("nn/test/conv2d_simple.dml") as conv2d_simple
source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
@@ -125,11 +125,11 @@ affine = function() {
}
}
-batch_norm = function() {
+batch_norm1d = function() {
/*
- * Gradient check for the batch normalization layer.
+ * Gradient check for the 1D batch normalization layer.
*/
- print("Grad checking the batch normalization layer with L2 loss.")
+ print("Grad checking the 1D batch normalization layer with L2 loss.")
# Generate data
N = 3 # num examples
@@ -142,7 +142,7 @@ batch_norm = function() {
beta = rand(rows=1, cols=D)
ema_mean = rand(rows=1, cols=D)
ema_var = rand(rows=1, cols=D)
- #[dummy, dummy, ema_mean, ema_var] = batch_norm::init(D)
+ #[dummy, dummy, ema_mean, ema_var] = batch_norm1d::init(D)
# Check training & testing modes
for (i in 1:2) {
@@ -154,11 +154,11 @@ batch_norm = function() {
# Compute analytical gradients of loss wrt parameters
[out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
dout = l2_loss::backward(out, y)
- [dX, dgamma, dbeta] = batch_norm::backward(dout, out, ema_mean_upd, ema_var_upd,
- cache_mean, cache_var, cache_norm,
- X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ [dX, dgamma, dbeta] = batch_norm1d::backward(dout, out, ema_mean_upd, ema_var_upd,
+ cache_mean, cache_var, cache_norm,
+ X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
# Grad check
h = 1e-5
@@ -169,11 +169,11 @@ batch_norm = function() {
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
@@ -190,11 +190,11 @@ batch_norm = function() {
old = as.scalar(gamma[i,j])
gamma[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
gamma[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
gamma[i,j] = old # reset
dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
@@ -212,11 +212,11 @@ batch_norm = function() {
old = as.scalar(beta[i,j])
beta[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
beta[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
beta[i,j] = old # reset
dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
@@ -1276,11 +1276,11 @@ softmax = function() {
}
}
-spatial_batch_norm = function() {
+batch_norm2d = function() {
/*
- * Gradient check for the spatial batch normalization layer.
+ * Gradient check for the 2D (spatial) batch normalization layer.
*/
- print("Grad checking the spatial batch normalization layer with L2 loss.")
+ print("Grad checking the 2D (spatial) batch normalization layer with L2 loss.")
# Generate data
N = 3 # num examples
@@ -1296,7 +1296,7 @@ spatial_batch_norm = function() {
beta = rand(rows=C, cols=1)
ema_mean = rand(rows=C, cols=1)
ema_var = rand(rows=C, cols=1)
- #[dummy, dummy, ema_mean, ema_var] = spatial_batch_norm::init(C)
+ #[dummy, dummy, ema_mean, ema_var] = batch_norm2d::init(C)
# Check training & testing modes
for (i in 1:2) {
@@ -1308,12 +1308,12 @@ spatial_batch_norm = function() {
# Compute analytical gradients of loss wrt parameters
[out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- spatial_batch_norm::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
dout = l2_loss::backward(out, y)
- [dX, dgamma, dbeta] = spatial_batch_norm::backward(dout, out, ema_mean_upd, ema_var_upd,
- cache_mean, cache_var, cache_norm,
- X, gamma, beta, C, Hin, Win, mode,
- ema_mean, ema_var, mu, eps)
+ [dX, dgamma, dbeta] = batch_norm2d::backward(dout, out, ema_mean_upd, ema_var_upd,
+ cache_mean, cache_var, cache_norm,
+ X, gamma, beta, C, Hin, Win, mode,
+ ema_mean, ema_var, mu, eps)
# Grad check
h = 1e-5
@@ -1324,13 +1324,11 @@ spatial_batch_norm = function() {
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- spatial_batch_norm::forward(X, gamma, beta, C, Hin, Win, mode,
- ema_mean, ema_var, mu, eps)
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- spatial_batch_norm::forward(X, gamma, beta, C, Hin, Win, mode,
- ema_mean, ema_var, mu, eps)
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
@@ -1347,13 +1345,11 @@ spatial_batch_norm = function() {
old = as.scalar(gamma[i,j])
gamma[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- spatial_batch_norm::forward(X, gamma, beta, C, Hin, Win, mode,
- ema_mean, ema_var, mu, eps)
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
gamma[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- spatial_batch_norm::forward(X, gamma, beta, C, Hin, Win, mode,
- ema_mean, ema_var, mu, eps)
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
gamma[i,j] = old # reset
dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
@@ -1371,13 +1367,11 @@ spatial_batch_norm = function() {
old = as.scalar(beta[i,j])
beta[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- spatial_batch_norm::forward(X, gamma, beta, C, Hin, Win, mode,
- ema_mean, ema_var, mu, eps)
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
beta[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- spatial_batch_norm::forward(X, gamma, beta, C, Hin, Win, mode,
- ema_mean, ema_var, mu, eps)
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
beta[i,j] = old # reset
dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
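The renamed gradient checks above all share the same central-difference pattern: perturb one input entry by -h and +h, re-run the forward pass and the L2 loss, and compare (loss(x+h) - loss(x-h)) / (2*h) against the corresponding analytical gradient entry. A condensed sketch of that inner step for one entry of X, using the batch_norm1d signatures (loop indices and the relative-error helper are omitted here):

h = 1e-5
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
    batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
    batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
X[i,j] = old  # reset the perturbed entry
dX_num = (lossph-lossmh) / (2*h)  # numerical derivative, compared against dX[i,j]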
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f5ef628c/scripts/staging/SystemML-NN/nn/test/run_tests.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/run_tests.dml b/scripts/staging/SystemML-NN/nn/test/run_tests.dml
index dc53cb9..0279363 100644
--- a/scripts/staging/SystemML-NN/nn/test/run_tests.dml
+++ b/scripts/staging/SystemML-NN/nn/test/run_tests.dml
@@ -37,7 +37,8 @@ tmp = grad_check::log_loss()
# Other layers
tmp = grad_check::affine()
-tmp = grad_check::batch_norm()
+tmp = grad_check::batch_norm1d()
+tmp = grad_check::batch_norm2d()
tmp = grad_check::conv2d_simple()
tmp = grad_check::conv2d()
tmp = grad_check::conv2d_builtin()
@@ -52,7 +53,6 @@ tmp = grad_check::relu()
tmp = grad_check::rnn()
tmp = grad_check::sigmoid()
tmp = grad_check::softmax()
-tmp = grad_check::spatial_batch_norm()
tmp = grad_check::tanh()
# Example model
@@ -69,13 +69,13 @@ print("")
print("Starting other tests.")
print("---")
-tmp = test::batch_norm()
+tmp = test::batch_norm1d()
+tmp = test::batch_norm2d()
tmp = test::im2col()
tmp = test::padding()
tmp = test::conv2d()
tmp = test::cross_entropy_loss()
tmp = test::max_pool2d()
-tmp = test::spatial_batch_norm()
tmp = test::tanh()
print("---")
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f5ef628c/scripts/staging/SystemML-NN/nn/test/test.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/test.dml b/scripts/staging/SystemML-NN/nn/test/test.dml
index 958c2c5..3928fac 100644
--- a/scripts/staging/SystemML-NN/nn/test/test.dml
+++ b/scripts/staging/SystemML-NN/nn/test/test.dml
@@ -22,24 +22,24 @@
/*
* Various tests, not including gradient checks.
*/
-source("nn/layers/batch_norm.dml") as batch_norm
+source("nn/layers/batch_norm1d.dml") as batch_norm1d
+source("nn/layers/batch_norm2d.dml") as batch_norm2d
source("nn/layers/conv2d.dml") as conv2d
source("nn/layers/conv2d_builtin.dml") as conv2d_builtin
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("nn/layers/max_pool2d.dml") as max_pool2d
source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
-source("nn/layers/spatial_batch_norm.dml") as spatial_batch_norm
source("nn/layers/tanh.dml") as tanh
source("nn/test/conv2d_simple.dml") as conv2d_simple
source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
source("nn/test/util.dml") as test_util
source("nn/util.dml") as util
-batch_norm = function() {
+batch_norm1d = function() {
/*
- * Test for the batch normalization function.
+ * Test for the 1D batch normalization function.
*/
- print("Testing the batch normalization function.")
+ print("Testing the 1D batch normalization function.")
# Generate data
N = 4 # Number of examples
@@ -50,11 +50,11 @@ batch_norm = function() {
X = matrix(seq(1,16), rows=N, cols=D)
# Create layer
- [gamma, beta, ema_mean, ema_var] = batch_norm::init(D)
+ [gamma, beta, ema_mean, ema_var] = batch_norm1d::init(D)
# Forward
[out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
+ batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
# Equivalency check
target = matrix("-1.34160721 -1.34160721 -1.34160733 -1.34160709
@@ -428,11 +428,11 @@ max_pool2d = function() {
tmp = test_util::check_all_equal(out_builtin, target)
}
-spatial_batch_norm = function() {
+batch_norm2d = function() {
/*
- * Test for the spatial batch normalization function.
+ * Test for the 2D (spatial) batch normalization function.
*/
- print("Testing the spatial batch normalization function.")
+ print("Testing the 2D (spatial) batch normalization function.")
# Generate data
N = 2 # Number of examples
@@ -474,11 +474,11 @@ spatial_batch_norm = function() {
55 58 52 0 99", rows=N, cols=C*Hin*Win)
# Create layer
- [gamma, beta, ema_mean, ema_var] = spatial_batch_norm::init(C)
+ [gamma, beta, ema_mean, ema_var] = batch_norm2d::init(C)
# Forward
[out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- spatial_batch_norm::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
# Equivalency check
target = matrix("0.86215019 -0.76679718 -1.00517964 0.26619387 0.94161105
[2/2] incubator-systemml git commit: [SYSTEMML-1468] Add new 1D/2D
"Scale & Shift" layers
Posted by du...@apache.org.
[SYSTEMML-1468] Add new 1D/2D "Scale & Shift" layers
A "Scale & Shift" layer introduces learnable parameters
(`gamma`, `beta`) to scale and shift the input on either
a per-feature basis (1D) or a per-channel basis (2D).
`y = x*gamma + beta`
Closes #453.
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/65172565
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/65172565
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/65172565
Branch: refs/heads/master
Commit: 6517256511b5953b4efea97600164261243a8402
Parents: f5ef628
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Mon Apr 10 17:20:55 2017 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Mon Apr 10 17:20:55 2017 -0700
----------------------------------------------------------------------
.../SystemML-NN/nn/layers/scale_shift1d.dml | 95 +++++
.../SystemML-NN/nn/layers/scale_shift2d.dml | 107 ++++++
.../staging/SystemML-NN/nn/test/grad_check.dml | 379 +++++++++++++------
.../staging/SystemML-NN/nn/test/run_tests.dml | 21 +-
4 files changed, 486 insertions(+), 116 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/65172565/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml b/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml
new file mode 100644
index 0000000..7e162a3
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml
@@ -0,0 +1,95 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 1D Scale & Shift layer.
+ */
+
+forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta)
+ return (matrix[double] out) {
+ /*
+ * Computes the forward pass for a 1D scale & shift layer. The input
+ * data has N examples, each with D features.
+ *
+ * A 1D scale & shift layer introduces learnable parameters
+ * (gamma, beta) to scale and shift the input on a per-feature basis.
+ *
+ * `y = x*gamma + beta`
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, D).
+ * - gamma: Scale parameters, of shape (1, D).
+ * - beta: Shift parameters, of shape (1, D).
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, D).
+ */
+ # Scale and shift
+ out = X*gamma + beta # shape (N, D)
+}
+
+backward = function(matrix[double] dout, matrix[double] out,
+ matrix[double] X, matrix[double] gamma, matrix[double] beta)
+ return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
+ /*
+ * Computes the backward pass for a 1D scale & shift layer.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of shape (N, D).
+ * - out: Outputs from the forward pass, of shape (N, D).
+ * - X: Inputs, of shape (N, D).
+ * - gamma: Scale parameters, of shape (1, D).
+ * - beta: Shift parameters, of shape (1, D).
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, D).
+ * - dgamma: Gradient wrt `gamma`, of shape (1, D).
+ * - dbeta: Gradient wrt `beta`, of shape (1, D).
+ *
+ */
+ # Compute gradients during training
+ dgamma = colSums(dout*X) # shape (1, D)
+ dbeta = colSums(dout) # shape (1, D)
+ dX = dout * gamma # shape (N, D)
+}
+
+init = function(int D)
+ return (matrix[double] gamma, matrix[double] beta) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * By default, we initialize to an identity function, with a scale
+ * filler of `1`, and a shift filler of `0`.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - D: Dimensionality of the input features (number of features).
+ *
+ * Outputs:
+ * - gamma: Scale parameters, of shape (1, D).
+ * - beta: Shift parameters, of shape (1, D).
+ */
+ gamma = matrix(1, rows=1, cols=D)
+ beta = matrix(0, rows=1, cols=D)
+}
+
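For reference, a minimal usage sketch of the new 1D scale & shift layer. The `forward`, `backward`, and `init` signatures come from the diff above; the data sizes and variable names below are hypothetical:

  source("nn/layers/scale_shift1d.dml") as scale_shift1d

  # Hypothetical data: N=4 examples, each with D=10 features.
  X = rand(rows=4, cols=10)
  dout = rand(rows=4, cols=10)  # stand-in for an upstream gradient

  # init() returns gamma=1, beta=0, i.e. an identity transform.
  [gamma, beta] = scale_shift1d::init(10)

  # Forward pass: out = X*gamma + beta, of shape (4, 10).
  out = scale_shift1d::forward(X, gamma, beta)

  # Backward pass: gradients wrt X, gamma, and beta.
  [dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta)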
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/65172565/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml b/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml
new file mode 100644
index 0000000..79c884a
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml
@@ -0,0 +1,107 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Scale & Shift layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta,
+ int C, int Hin, int Win)
+ return (matrix[double] out) {
+ /*
+ * Computes the forward pass for a 2D scale & shift layer. The input
+ * data has N examples, each represented as a 3D volume unrolled into
+ * a single vector.
+ *
+ * A 2D scale & shift layer introduces learnable parameters
+ * (gamma, beta) to scale and shift the input on a per-channel basis.
+ *
+ * `y = x*gamma + beta`
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, C*Hin*Win).
+ */
+ # Scale and shift
+ scaled = bias_multiply(X, gamma) # shape (N, C*Hin*Win)
+ out = bias_add(scaled, beta) # shape (N, C*Hin*Win)
+}
+
+backward = function(matrix[double] dout, matrix[double] out,
+ matrix[double] X, matrix[double] gamma, matrix[double] beta,
+ int C, int Hin, int Win)
+ return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) {
+ /*
+ * Computes the backward pass for a 2D scale & shift layer.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win).
+ * - out: Outputs from the forward pass, of shape (N, C*Hin*Win).
+ * - X: Input data matrix to the forward pass, of
+ * shape (N, C*Hin*Win).
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ * - dgamma: Gradient wrt `gamma`, of shape (C, 1).
+ * - dbeta: Gradient wrt `beta`, of shape (C, 1).
+ *
+ */
+ # Compute gradients during training
+ dgamma = util::channel_sums(dout*X, C, Hin, Win) # shape (C, 1)
+ dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1)
+ dX = bias_multiply(dout, gamma) # shape (N, C*Hin*Win)
+}
+
+init = function(int C)
+ return (matrix[double] gamma, matrix[double] beta) {
+ /*
+ * Initialize the parameters of this layer.
+ *
+ * By default, we initialize to an identity function, with a scale
+ * filler of `1`, and a shift filler of `0`.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - C: Number of input channels (dimensionality of input depth).
+ *
+ * Outputs:
+ * - gamma: Scale parameters, of shape (C, 1).
+ * - beta: Shift parameters, of shape (C, 1).
+ */
+ gamma = matrix(1, rows=C, cols=1)
+ beta = matrix(0, rows=C, cols=1)
+}
+
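Similarly, a minimal usage sketch of the 2D scale & shift layer, where the per-channel parameters of shape (C, 1) are broadcast across each channel's Hin*Win values. Signatures are from the diff above; the sizes and variable names are hypothetical:

  source("nn/layers/scale_shift2d.dml") as scale_shift2d

  # Hypothetical data: N=2 examples, C=3 channels of 4x4 images,
  # each example flattened into one row of length C*Hin*Win.
  N = 2
  C = 3
  Hin = 4
  Win = 4
  X = rand(rows=N, cols=C*Hin*Win)
  dout = rand(rows=N, cols=C*Hin*Win)  # stand-in for an upstream gradient

  # init() returns gamma=1, beta=0, each of shape (C, 1).
  [gamma, beta] = scale_shift2d::init(C)

  out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
  [dX, dgamma, dbeta] = scale_shift2d::backward(dout, out, X, gamma, beta, C, Hin, Win)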
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/65172565/scripts/staging/SystemML-NN/nn/test/grad_check.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/grad_check.dml b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
index f21811c..516fe2a 100644
--- a/scripts/staging/SystemML-NN/nn/test/grad_check.dml
+++ b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
@@ -39,6 +39,8 @@ source("nn/layers/max_pool2d.dml") as max_pool2d
source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
source("nn/layers/relu.dml") as relu
source("nn/layers/rnn.dml") as rnn
+source("nn/layers/scale_shift1d.dml") as scale_shift1d
+source("nn/layers/scale_shift2d.dml") as scale_shift2d
source("nn/layers/sigmoid.dml") as sigmoid
source("nn/layers/softmax.dml") as softmax
source("nn/layers/tanh.dml") as tanh
@@ -229,6 +231,113 @@ batch_norm1d = function() {
}
}
+batch_norm2d = function() {
+ /*
+ * Gradient check for the 2D (spatial) batch normalization layer.
+ */
+ print("Grad checking the 2D (spatial) batch normalization layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ C = 2 # num channels
+ Hin = 5 # input height
+ Win = 5 # input width
+ mu = 0.9 # momentum
+ eps = 1e-5 # epsilon
+ X = rand(rows=N, cols=C*Hin*Win)
+ y = rand(rows=N, cols=C*Hin*Win)
+ gamma = rand(rows=C, cols=1)
+ beta = rand(rows=C, cols=1)
+ ema_mean = rand(rows=C, cols=1)
+ ema_var = rand(rows=C, cols=1)
+ #[dummy, dummy, ema_mean, ema_var] = batch_norm2d::init(C)
+
+ # Check training & testing modes
+ for (i in 1:2) {
+ if (i == 1)
+ mode = 'train'
+ else
+ mode = 'test'
+ print(" - Grad checking the '"+mode+"' mode.")
+
+ # Compute analytical gradients of loss wrt parameters
+ [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ dout = l2_loss::backward(out, y)
+ [dX, dgamma, dbeta] = batch_norm2d::backward(dout, out, ema_mean_upd, ema_var_upd,
+ cache_mean, cache_var, cache_norm,
+ X, gamma, beta, C, Hin, Win, mode,
+ ema_mean, ema_var, mu, eps)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking gamma.")
+ for (i in 1:nrow(gamma)) {
+ for (j in 1:ncol(gamma)) {
+ # Compute numerical derivative
+ old = as.scalar(gamma[i,j])
+ gamma[i,j] = old - h
+ [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossmh = l2_loss::forward(outmh, y)
+ gamma[i,j] = old + h
+ [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossph = l2_loss::forward(outph, y)
+ gamma[i,j] = old # reset
+ dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+ lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking beta.")
+ for (i in 1:nrow(beta)) {
+ for (j in 1:ncol(beta)) {
+ # Compute numerical derivative
+ old = as.scalar(beta[i,j])
+ beta[i,j] = old - h
+ [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossmh = l2_loss::forward(outmh, y)
+ beta[i,j] = old + h
+ [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
+ batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
+ lossph = l2_loss::forward(outph, y)
+ beta[i,j] = old # reset
+ dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+ lossph, lossmh)
+ }
+ }
+ }
+}
+
conv2d = function() {
/*
* Gradient check for the 2D convolutional layer using `im2col`.
@@ -1199,6 +1308,168 @@ rnn = function() {
}
}
+scale_shift1d = function() {
+ /*
+ * Gradient check for the 1D scale & shift layer.
+ */
+ print("Grad checking the 1D scale & shift layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ D = 100 # num features
+ X = rand(rows=N, cols=D)
+ y = rand(rows=N, cols=D)
+ [gamma, beta] = scale_shift1d::init(D)
+
+ # Compute analytical gradients of loss wrt parameters
+ out = scale_shift1d::forward(X, gamma, beta)
+ dout = l2_loss::backward(out, y)
+ [dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ outmh = scale_shift1d::forward(X, gamma, beta)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ outph = scale_shift1d::forward(X, gamma, beta)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking gamma.")
+ for (i in 1:nrow(gamma)) {
+ for (j in 1:ncol(gamma)) {
+ # Compute numerical derivative
+ old = as.scalar(gamma[i,j])
+ gamma[i,j] = old - h
+ outmh = scale_shift1d::forward(X, gamma, beta)
+ lossmh = l2_loss::forward(outmh, y)
+ gamma[i,j] = old + h
+ outph = scale_shift1d::forward(X, gamma, beta)
+ lossph = l2_loss::forward(outph, y)
+ gamma[i,j] = old # reset
+ dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+ lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking beta.")
+ for (i in 1:nrow(beta)) {
+ for (j in 1:ncol(beta)) {
+ # Compute numerical derivative
+ old = as.scalar(beta[i,j])
+ beta[i,j] = old - h
+ outmh = scale_shift1d::forward(X, gamma, beta)
+ lossmh = l2_loss::forward(outmh, y)
+ beta[i,j] = old + h
+ outph = scale_shift1d::forward(X, gamma, beta)
+ lossph = l2_loss::forward(outph, y)
+ beta[i,j] = old # reset
+ dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+ lossph, lossmh)
+ }
+ }
+}
+
+scale_shift2d = function() {
+ /*
+ * Gradient check for the 2D scale & shift layer.
+ */
+ print("Grad checking the 2D scale & shift layer with L2 loss.")
+
+ # Generate data
+ N = 3 # num examples
+ C = 2 # num channels
+ Hin = 5 # input height
+ Win = 5 # input width
+ X = rand(rows=N, cols=C*Hin*Win)
+ y = rand(rows=N, cols=C*Hin*Win)
+ [gamma, beta] = scale_shift2d::init(C)
+
+ # Compute analytical gradients of loss wrt parameters
+ out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ dout = l2_loss::backward(out, y)
+ [dX, dgamma, dbeta] = scale_shift2d::backward(dout, out, X, gamma, beta, C, Hin, Win)
+
+ # Grad check
+ h = 1e-5
+ print(" - Grad checking X.")
+ for (i in 1:nrow(X)) {
+ for (j in 1:ncol(X)) {
+ # Compute numerical derivative
+ old = as.scalar(X[i,j])
+ X[i,j] = old - h
+ outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossmh = l2_loss::forward(outmh, y)
+ X[i,j] = old + h
+ outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossph = l2_loss::forward(outph, y)
+ X[i,j] = old # reset
+ dX_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking gamma.")
+ for (i in 1:nrow(gamma)) {
+ for (j in 1:ncol(gamma)) {
+ # Compute numerical derivative
+ old = as.scalar(gamma[i,j])
+ gamma[i,j] = old - h
+ outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossmh = l2_loss::forward(outmh, y)
+ gamma[i,j] = old + h
+ outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossph = l2_loss::forward(outph, y)
+ gamma[i,j] = old # reset
+ dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
+ lossph, lossmh)
+ }
+ }
+
+ print(" - Grad checking beta.")
+ for (i in 1:nrow(beta)) {
+ for (j in 1:ncol(beta)) {
+ # Compute numerical derivative
+ old = as.scalar(beta[i,j])
+ beta[i,j] = old - h
+ outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossmh = l2_loss::forward(outmh, y)
+ beta[i,j] = old + h
+ outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
+ lossph = l2_loss::forward(outph, y)
+ beta[i,j] = old # reset
+ dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
+
+ # Check error
+ rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
+ lossph, lossmh)
+ }
+ }
+}
+
sigmoid = function() {
/*
* Gradient check for the sigmoid nonlinearity layer.
@@ -1276,114 +1547,6 @@ softmax = function() {
}
}
-batch_norm2d = function() {
- /*
- * Gradient check for the 2D (spatial) batch normalization layer.
- */
- print("Grad checking the 2D (spatial) batch normalization layer with L2 loss.")
-
- # Generate data
- N = 3 # num examples
- N = 2 # num examples
- C = 2 # num channels
- Hin = 5 # input height
- Win = 5 # input width
- mu = 0.9 # momentum
- eps = 1e-5 # epsilon
- X = rand(rows=N, cols=C*Hin*Win)
- y = rand(rows=N, cols=C*Hin*Win)
- gamma = rand(rows=C, cols=1)
- beta = rand(rows=C, cols=1)
- ema_mean = rand(rows=C, cols=1)
- ema_var = rand(rows=C, cols=1)
- #[dummy, dummy, ema_mean, ema_var] = batch_norm2d::init(C)
-
- # Check training & testing modes
- for (i in 1:2) {
- if (i == 1)
- mode = 'train'
- else
- mode = 'test'
- print(" - Grad checking the '"+mode+"' mode.")
-
- # Compute analytical gradients of loss wrt parameters
- [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- dout = l2_loss::backward(out, y)
- [dX, dgamma, dbeta] = batch_norm2d::backward(dout, out, ema_mean_upd, ema_var_upd,
- cache_mean, cache_var, cache_norm,
- X, gamma, beta, C, Hin, Win, mode,
- ema_mean, ema_var, mu, eps)
-
- # Grad check
- h = 1e-5
- print(" - Grad checking X.")
- for (i in 1:nrow(X)) {
- for (j in 1:ncol(X)) {
- # Compute numerical derivative
- old = as.scalar(X[i,j])
- X[i,j] = old - h
- [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossmh = l2_loss::forward(outmh, y)
- X[i,j] = old + h
- [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossph = l2_loss::forward(outph, y)
- X[i,j] = old # reset
- dX_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
- }
- }
-
- print(" - Grad checking gamma.")
- for (i in 1:nrow(gamma)) {
- for (j in 1:ncol(gamma)) {
- # Compute numerical derivative
- old = as.scalar(gamma[i,j])
- gamma[i,j] = old - h
- [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossmh = l2_loss::forward(outmh, y)
- gamma[i,j] = old + h
- [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossph = l2_loss::forward(outph, y)
- gamma[i,j] = old # reset
- dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
- lossph, lossmh)
- }
- }
-
- print(" - Grad checking beta.")
- for (i in 1:nrow(beta)) {
- for (j in 1:ncol(beta)) {
- # Compute numerical derivative
- old = as.scalar(beta[i,j])
- beta[i,j] = old - h
- [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossmh = l2_loss::forward(outmh, y)
- beta[i,j] = old + h
- [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
- batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
- lossph = l2_loss::forward(outph, y)
- beta[i,j] = old # reset
- dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
-
- # Check error
- rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
- lossph, lossmh)
- }
- }
- }
-}
-
tanh = function() {
/*
* Gradient check for the hyperbolic tangent (tanh) nonlinearity
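All of the grad-check functions above follow the same pattern: perturb one entry by +/- h, recompute the loss, and compare the centered numerical derivative against the analytical gradient. A minimal sketch of that comparison with hypothetical scalar values follows; the actual relative-error formula and thresholds live in `test_util::check_rel_grad_error` (nn/test/util.dml), which is not part of this diff, so the last line is only an assumed, commonly used definition:

  h = 1e-5
  lossph = 0.500023   # hypothetical loss at w + h
  lossmh = 0.499977   # hypothetical loss at w - h
  dw = 2.3            # hypothetical analytical gradient entry, e.g. as.scalar(dX[i,j])
  dw_num = (lossph-lossmh) / (2*h)  # centered numerical derivative
  # assumed relative-error measure (see test_util::check_rel_grad_error for the real one):
  rel_error = abs(dw-dw_num) / max(abs(dw), abs(dw_num))
  print("relative error: " + rel_error)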
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/65172565/scripts/staging/SystemML-NN/nn/test/run_tests.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/run_tests.dml b/scripts/staging/SystemML-NN/nn/test/run_tests.dml
index 0279363..ee4da68 100644
--- a/scripts/staging/SystemML-NN/nn/test/run_tests.dml
+++ b/scripts/staging/SystemML-NN/nn/test/run_tests.dml
@@ -29,34 +29,39 @@ print("")
print("Starting grad checks.")
print("---")
-# Loss functions
+# Loss & loss-related functions
tmp = grad_check::cross_entropy_loss()
tmp = grad_check::l1_loss()
+tmp = grad_check::l1_reg()
tmp = grad_check::l2_loss()
+tmp = grad_check::l2_reg()
tmp = grad_check::log_loss()
+print("")
-# Other layers
+# Core layers
tmp = grad_check::affine()
tmp = grad_check::batch_norm1d()
tmp = grad_check::batch_norm2d()
-tmp = grad_check::conv2d_simple()
tmp = grad_check::conv2d()
tmp = grad_check::conv2d_builtin()
+tmp = grad_check::conv2d_simple()
tmp = grad_check::dropout()
-tmp = grad_check::l1_reg()
-tmp = grad_check::l2_reg()
tmp = grad_check::lstm()
-tmp = grad_check::max_pool2d_simple()
tmp = grad_check::max_pool2d()
tmp = grad_check::max_pool2d_builtin()
+tmp = grad_check::max_pool2d_simple()
tmp = grad_check::relu()
tmp = grad_check::rnn()
+tmp = grad_check::scale_shift1d()
+tmp = grad_check::scale_shift2d()
tmp = grad_check::sigmoid()
tmp = grad_check::softmax()
tmp = grad_check::tanh()
+print("")
# Example model
tmp = grad_check::two_layer_affine_l2_net()
+print("")
print("---")
print("Grad checks complete -- look for any ERRORs or WARNINGs.")
@@ -71,11 +76,11 @@ print("---")
tmp = test::batch_norm1d()
tmp = test::batch_norm2d()
-tmp = test::im2col()
-tmp = test::padding()
tmp = test::conv2d()
tmp = test::cross_entropy_loss()
+tmp = test::im2col()
tmp = test::max_pool2d()
+tmp = test::padding()
tmp = test::tanh()
print("---")