Posted to commits@systemml.apache.org by du...@apache.org on 2017/07/18 00:20:45 UTC
[2/5] systemml git commit: [SYSTEMML-1185][SYSTEMML-1766] Merge experimental breast cancer updates
http://git-wip-us.apache.org/repos/asf/systemml/blob/532da1bc/projects/breast_cancer/bin/clean_spark.sh
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/bin/clean_spark.sh b/projects/breast_cancer/bin/clean_spark.sh
new file mode 100755
index 0000000..d92ce87
--- /dev/null
+++ b/projects/breast_cancer/bin/clean_spark.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
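+# Clean up local run artifacts: the Derby metastore and log (left behind by
+# Spark's Hive support), the Spark SQL warehouse, and SystemML's scratch space.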
+rm -rf metastore_db/
+rm -rf derby.log
+rm -rf spark-warehouse/
+rm -rf scratch_space/
http://git-wip-us.apache.org/repos/asf/systemml/blob/532da1bc/projects/breast_cancer/bin/monitor_gpu.sh
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/bin/monitor_gpu.sh b/projects/breast_cancer/bin/monitor_gpu.sh
new file mode 100755
index 0000000..b432e3f
--- /dev/null
+++ b/projects/breast_cancer/bin/monitor_gpu.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
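+# Poll nvidia-smi every half second to monitor GPU utilization and memory.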
+watch -n 0.5 nvidia-smi
http://git-wip-us.apache.org/repos/asf/systemml/blob/532da1bc/projects/breast_cancer/bin/remove_old_processes.sh
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/bin/remove_old_processes.sh b/projects/breast_cancer/bin/remove_old_processes.sh
new file mode 100755
index 0000000..2a7e903
--- /dev/null
+++ b/projects/breast_cancer/bin/remove_old_processes.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
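+# Force-kill this user's lingering Python and Java (Spark/SystemML) processes.
+# Bracketing the first letter ("[p]ython", "[j]ava") keeps grep from matching itself.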
+ps -ef | grep `whoami` | grep "[p]ython" | awk '{print $2}' | xargs kill -9
+ps -ef | grep `whoami` | grep "[j]ava" | awk '{print $2}' | xargs kill -9
http://git-wip-us.apache.org/repos/asf/systemml/blob/532da1bc/projects/breast_cancer/bin/run_tensorboard.sh
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/bin/run_tensorboard.sh b/projects/breast_cancer/bin/run_tensorboard.sh
new file mode 100755
index 0000000..8445858
--- /dev/null
+++ b/projects/breast_cancer/bin/run_tensorboard.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
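+# Serve TensorBoard from the "experiments" log directory, polling for new event data every 5 seconds.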
+tensorboard --logdir=experiments --reload_interval 5
http://git-wip-us.apache.org/repos/asf/systemml/blob/532da1bc/projects/breast_cancer/breastcancer/convnet.dml
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/breastcancer/convnet.dml b/projects/breast_cancer/breastcancer/convnet.dml
new file mode 100644
index 0000000..6cbea39
--- /dev/null
+++ b/projects/breast_cancer/breastcancer/convnet.dml
@@ -0,0 +1,495 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Breast Cancer LeNet-like ConvNet Model
+ */
+# Imports
+source("nn/layers/affine.dml") as affine
+source("nn/layers/conv2d_builtin.dml") as conv2d
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/dropout.dml") as dropout
+source("nn/layers/l2_reg.dml") as l2_reg
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
+source("nn/layers/relu.dml") as relu
+source("nn/layers/softmax.dml") as softmax
+#source("nn/optim/adam.dml") as adam
+source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
+
+train = function(matrix[double] X, matrix[double] Y,
+ matrix[double] X_val, matrix[double] Y_val,
+ int C, int Hin, int Win,
+ double lr, double mu, double decay, double lambda,
+ int batch_size, int epochs, int log_interval,
+ string checkpoint_dir)
+ return (matrix[double] Wc1, matrix[double] bc1,
+ matrix[double] Wc2, matrix[double] bc2,
+ matrix[double] Wc3, matrix[double] bc3,
+ matrix[double] Wa1, matrix[double] ba1,
+ matrix[double] Wa2, matrix[double] ba2) {
+ /*
+ * Trains a convolutional net using a "LeNet"-like architecture.
+ *
+ * The input matrix, X, has N examples, each represented as a 3D
+ * volume unrolled into a single vector. The targets, Y, have K
+ * classes, and are one-hot encoded.
+ *
+ * Inputs:
+ * - X: Input data matrix, of shape (N, C*Hin*Win).
+ * - Y: Target matrix, of shape (N, K).
+ * - X_val: Input validation data matrix, of shape (N, C*Hin*Win).
+ * - Y_val: Target validation matrix, of shape (N, K).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - lr: Learning rate.
+ * - mu: Momentum value.
+ * Typical values are in the range of [0.5, 0.99], usually
+ * started at the lower end and annealed towards the higher end.
+ * - decay: Learning rate decay rate.
+ * - lambda: Regularization strength.
+ * - batch_size: Size of mini-batches to train on.
+ * - epochs: Total number of full training loops over the full data set.
+ * - log_interval: Interval, in iterations, between log outputs.
+ * - checkpoint_dir: Directory to store model checkpoints.
+ *
+ * Outputs:
+ * - Wc1: 1st conv layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
+ * - bc1: 1st conv layer biases vector, of shape (F1, 1).
+ * - Wc2: 2nd conv layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
+ * - bc2: 2nd conv layer biases vector, of shape (F2, 1).
+ * - Wc3: 3rd conv layer weights (parameters) matrix, of shape (F3, F2*Hf*Wf).
+ * - bc3: 3rd conv layer biases vector, of shape (F3, 1).
+ * - Wa1: 1st affine layer weights (parameters) matrix, of shape (F3*(Hin/2^3)*(Win/2^3), N1).
+ * - ba1: 1st affine layer biases vector, of shape (1, N1).
+ * - Wa2: 2nd affine layer weights (parameters) matrix, of shape (N1, K).
+ * - ba2: 2nd affine layer biases vector, of shape (1, K).
+ */
+ N = nrow(X)
+ K = ncol(Y)
+
+ # Create network:
+ # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> conv3 -> relu3 -> pool3
+ # -> affine1 -> relu1 -> dropout1 -> affine2 -> softmax
+ Hf = 3 # filter height
+ Wf = 3 # filter width
+ stride = 1
+ pad = 1 # For same dimensions, (Hf - stride) / 2
+ F1 = 32 # num conv filters in conv1
+ F2 = 32 # num conv filters in conv2
+ F3 = 32 # num conv filters in conv3
+ N1 = 512 # num nodes in affine1
+ # Note: affine2 has K nodes, which is equal to the number of target dimensions (num classes)
+ [Wc1, bc1] = conv2d::init(F1, C, Hf, Wf) # inputs: (N, C*Hin*Win)
+ [Wc2, bc2] = conv2d::init(F2, F1, Hf, Wf) # inputs: (N, F1*(Hin/2)*(Win/2))
+ [Wc3, bc3] = conv2d::init(F3, F2, Hf, Wf) # inputs: (N, F2*(Hin/2^2)*(Win/2^2))
+ [Wa1, ba1] = affine::init(F3*(Hin/2^3)*(Win/2^3), N1) # inputs: (N, F3*(Hin/2^3)*(Win/2^3))
+ [Wa2, ba2] = affine::init(N1, K) # inputs: (N, N1)
+ Wa2 = Wa2 / sqrt(2) # different initialization, since being fed into softmax, instead of relu
+
+ # TODO: Compare optimizers once training is faster.
+ # Initialize SGD w/ Nesterov momentum optimizer
+ vWc1 = sgd_nesterov::init(Wc1); vbc1 = sgd_nesterov::init(bc1)
+ vWc2 = sgd_nesterov::init(Wc2); vbc2 = sgd_nesterov::init(bc2)
+ vWc3 = sgd_nesterov::init(Wc3); vbc3 = sgd_nesterov::init(bc3)
+ vWa1 = sgd_nesterov::init(Wa1); vba1 = sgd_nesterov::init(ba1)
+ vWa2 = sgd_nesterov::init(Wa2); vba2 = sgd_nesterov::init(ba2)
+ #[mWc1, vWc1] = adam::init(Wc1) # optimizer 1st & 2nd moment state for Wc1
+ #[mbc1, vbc1] = adam::init(bc1) # optimizer 1st & 2nd moment state for bc1
+ #[mWc2, vWc2] = adam::init(Wc2) # optimizer 1st & 2nd moment state for Wc2
+ #[mbc2, vbc2] = adam::init(bc2) # optimizer 1st & 2nd moment state for bc2
+ #[mWc3, vWc3] = adam::init(Wc3) # optimizer 1st & 2nd moment state for Wc3
+ #[mbc3, vbc3] = adam::init(bc3) # optimizer 1st & 2nd moment state for bc3
+ #[mWa1, vWa1] = adam::init(Wa1) # optimizer 1st & 2nd moment state for Wa1
+ #[mba1, vba1] = adam::init(ba1) # optimizer 1st & 2nd moment state for ba1
+ #[mWa2, vWa2] = adam::init(Wa2) # optimizer 1st & 2nd moment state for Wa2
+ #[mba2, vba2] = adam::init(ba2) # optimizer 1st & 2nd moment state for ba2
+ #beta1 = 0.9
+ #beta2 = 0.999
+ #eps = 1e-8
+
+ # TODO: Enable starting val metrics once fast, distributed predictions are available.
+ # Starting validation loss & accuracy
+ #probs_val = predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
+ #loss_val = cross_entropy_loss::forward(probs_val, Y_val)
+ #accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
+ ## Output results
+ #print("Start: Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
+
+ # Optimize
+ print("Starting optimization")
+ iters = ceil(N / batch_size)
+ for (e in 1:epochs) {
+ for(i in 1:iters) {
+ # Get next batch
+ beg = ((i-1) * batch_size) %% N + 1
+ end = min(N, beg + batch_size - 1)
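+ # e.g., with batch_size=32: i=1 selects rows 1:32, i=2 rows 33:64, etc.; the final batch may be smaller.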
+ X_batch = X[beg:end,]
+ y_batch = Y[beg:end,]
+
+ # Compute forward pass
+ ## conv layer 1: conv1 -> relu1 -> pool1
+ [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+ outc1r = relu::forward(outc1)
+ [outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+ strideh=2, stridew=2, 0, 0)
+ ## conv layer 2: conv2 -> relu2 -> pool2
+ [outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
+ stride, stride, pad, pad)
+ outc2r = relu::forward(outc2)
+ [outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+ strideh=2, stridew=2, 0, 0)
+ ## conv layer 3: conv3 -> relu3 -> pool3
+ [outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
+ stride, stride, pad, pad)
+ outc3r = relu::forward(outc3)
+ [outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
+ strideh=2, stridew=2, 0, 0)
+ ## affine layer 1: affine1 -> relu1 -> dropout1
+ outa1 = affine::forward(outc3p, Wa1, ba1)
+ outa1r = relu::forward(outa1)
+ [outa1d, maskad1] = dropout::forward(outa1r, 0.5, -1)
+ ## affine layer 2: affine2 -> softmax
+ outa2 = affine::forward(outa1d, Wa2, ba2)
+ probs = softmax::forward(outa2)
+
+ # Compute data backward pass
+ ## loss:
+ dprobs = cross_entropy_loss::backward(probs, y_batch)
+ ## affine layer 2: affine2 -> softmax
+ douta2 = softmax::backward(dprobs, outa2)
+ [douta1d, dWa2, dba2] = affine::backward(douta2, outa1d, Wa2, ba2)
+ ## affine layer 1: affine1 -> relu1 -> dropout1
+ douta1r = dropout::backward(douta1d, outa1r, 0.5, maskad1)
+ douta1 = relu::backward(douta1r, outa1)
+ [doutc3p, dWa1, dba1] = affine::backward(douta1, outc3p, Wa1, ba1)
+ ## conv layer 3: conv3 -> relu3 -> pool3
+ doutc3r = max_pool2d::backward(doutc3p, Houtc3p, Woutc3p, outc3r, F3, Houtc3, Woutc3,
+ Hf=2, Wf=2, strideh=2, stridew=2, 0, 0)
+ doutc3 = relu::backward(doutc3r, outc3)
+ [doutc2p, dWc3, dbc3] = conv2d::backward(doutc3, Houtc3, Woutc3, outc2p, Wc3, bc3, F2,
+ Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad)
+ ## conv layer 2: conv2 -> relu2 -> pool2
+ doutc2r = max_pool2d::backward(doutc2p, Houtc2p, Woutc2p, outc2r, F2, Houtc2, Woutc2,
+ Hf=2, Wf=2, strideh=2, stridew=2, 0, 0)
+ doutc2 = relu::backward(doutc2r, outc2)
+ [doutc1p, dWc2, dbc2] = conv2d::backward(doutc2, Houtc2, Woutc2, outc1p, Wc2, bc2, F1,
+ Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad)
+ ## conv layer 1: conv1 -> relu1 -> pool1
+ doutc1r = max_pool2d::backward(doutc1p, Houtc1p, Woutc1p, outc1r, F1, Houtc1, Woutc1,
+ Hf=2, Wf=2, strideh=2, stridew=2, 0, 0)
+ doutc1 = relu::backward(doutc1r, outc1)
+ [dX_batch, dWc1, dbc1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, Wc1, bc1, C,
+ Hin, Win, Hf, Wf, stride, stride, pad, pad)
+
+ # Compute regularization backward pass
+ dWc1_reg = l2_reg::backward(Wc1, lambda)
+ dWc2_reg = l2_reg::backward(Wc2, lambda)
+ dWc3_reg = l2_reg::backward(Wc3, lambda)
+ dWa1_reg = l2_reg::backward(Wa1, lambda)
+ dWa2_reg = l2_reg::backward(Wa2, lambda)
+ dWc1 = dWc1 + dWc1_reg
+ dWc2 = dWc2 + dWc2_reg
+ dWc3 = dWc3 + dWc3_reg
+ dWa1 = dWa1 + dWa1_reg
+ dWa2 = dWa2 + dWa2_reg
+
+ # Optimize with SGD w/ Nesterov momentum
+ [Wc1, vWc1] = sgd_nesterov::update(Wc1, dWc1, lr, mu, vWc1)
+ [bc1, vbc1] = sgd_nesterov::update(bc1, dbc1, lr, mu, vbc1)
+ [Wc2, vWc2] = sgd_nesterov::update(Wc2, dWc2, lr, mu, vWc2)
+ [bc2, vbc2] = sgd_nesterov::update(bc2, dbc2, lr, mu, vbc2)
+ [Wc3, vWc3] = sgd_nesterov::update(Wc3, dWc3, lr, mu, vWc3)
+ [bc3, vbc3] = sgd_nesterov::update(bc3, dbc3, lr, mu, vbc3)
+ [Wa1, vWa1] = sgd_nesterov::update(Wa1, dWa1, lr, mu, vWa1)
+ [ba1, vba1] = sgd_nesterov::update(ba1, dba1, lr, mu, vba1)
+ [Wa2, vWa2] = sgd_nesterov::update(Wa2, dWa2, lr, mu, vWa2)
+ [ba2, vba2] = sgd_nesterov::update(ba2, dba2, lr, mu, vba2)
+ #t = e*i - 1
+ #[Wc1, mWc1, vWc1] = adam::update(Wc1, dWc1, lr, beta1, beta2, eps, t, mWc1, vWc1)
+ #[bc1, mbc1, vbc1] = adam::update(bc1, dbc1, lr, beta1, beta2, eps, t, mbc1, vbc1)
+ #[Wc2, mWc2, vWc2] = adam::update(Wc2, dWc2, lr, beta1, beta2, eps, t, mWc2, vWc2)
+ #[bc2, mbc2, vbc2] = adam::update(bc2, dbc2, lr, beta1, beta2, eps, t, mbc2, vbc2)
+ #[Wc3, mWc3, vWc3] = adam::update(Wc3, dWc3, lr, beta1, beta2, eps, t, mWc3, vWc3)
+ #[bc3, mbc3, vbc3] = adam::update(bc3, dbc3, lr, beta1, beta2, eps, t, mbc3, vbc3)
+ #[Wa1, mWa1, vWa1] = adam::update(Wa1, dWa1, lr, beta1, beta2, eps, t, mWa1, vWa1)
+ #[ba1, mba1, vba1] = adam::update(ba1, dba1, lr, beta1, beta2, eps, t, mba1, vba1)
+ #[Wa2, mWa2, vWa2] = adam::update(Wa2, dWa2, lr, beta1, beta2, eps, t, mWa2, vWa2)
+ #[ba2, mba2, vba2] = adam::update(ba2, dba2, lr, beta1, beta2, eps, t, mba2, vba2)
+
+ # Compute loss & accuracy for training data every `log_interval` iterations.
+ if (i %% log_interval == 0) {
+ # Compute training loss & accuracy
+ loss_data = cross_entropy_loss::forward(probs, y_batch)
+ loss_reg_Wc1 = l2_reg::forward(Wc1, lambda)
+ loss_reg_Wc2 = l2_reg::forward(Wc2, lambda)
+ loss_reg_Wc3 = l2_reg::forward(Wc3, lambda)
+ loss_reg_Wa1 = l2_reg::forward(Wa1, lambda)
+ loss_reg_Wa2 = l2_reg::forward(Wa2, lambda)
+ loss = loss_data + loss_reg_Wc1 + loss_reg_Wc2 + loss_reg_Wc3 + loss_reg_Wa1 + loss_reg_Wa2
+ accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
+
+ # TODO: Consider enabling val metrics here once fast, distributed predictions are available.
+ ## Compute validation loss & accuracy
+ #probs_val = predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
+ #loss_val = cross_entropy_loss::forward(probs_val, Y_val)
+ #accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
+
+ ## Output results
+ #print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
+ # + accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val
+ # + ", lr: " + lr + ", mu " + mu)
+ # Output results
+ print("Epoch: " + e + "/" + epochs + ", Iter: " + i + "/" + iters
+ + ", Train Loss: " + loss + ", Train Accuracy: " + accuracy)
+ }
+ }
+
+ # Compute loss & accuracy for the validation data every epoch
+ probs_val = predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
+ loss_val = cross_entropy_loss::forward(probs_val, Y_val)
+ accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
+
+ # Output results
+ print("Epoch: " + e + "/" + epochs + ", Val Loss: " + loss_val
+ + ", Val Accuracy: " + accuracy_val + ", lr: " + lr + ", mu " + mu)
+
+ # Checkpoint model
+ dir = checkpoint_dir + e + "/"
+ dummy = checkpoint(dir, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
+ str = "lr: " + lr + ", mu: " + mu + ", decay: " + decay + ", lambda: " + lambda
+ + ", batch_size: " + batch_size
+ name = dir + accuracy_val
+ write(str, name)
+
+ # Anneal momentum towards 0.999
+ mu = mu + (0.999 - mu)/(1+epochs-e)
+ # Decay learning rate
+ lr = lr * decay
+ }
+}
+
+checkpoint = function(string dir,
+ matrix[double] Wc1, matrix[double] bc1,
+ matrix[double] Wc2, matrix[double] bc2,
+ matrix[double] Wc3, matrix[double] bc3,
+ matrix[double] Wa1, matrix[double] ba1,
+ matrix[double] Wa2, matrix[double] ba2) {
+ /*
+ * Save the model parameters.
+ *
+ * Inputs:
+ * - dir: Directory in which to save model parameters.
+ * - Wc1: 1st conv layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
+ * - bc1: 1st conv layer biases vector, of shape (F1, 1).
+ * - Wc2: 2nd conv layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
+ * - bc2: 2nd conv layer biases vector, of shape (F2, 1).
+ * - Wc3: 3rd conv layer weights (parameters) matrix, of shape (F3, F2*Hf*Wf).
+ * - bc3: 3rd conv layer biases vector, of shape (F3, 1).
+ * - Wa1: 1st affine layer weights (parameters) matrix, of shape (F3*(Hin/2^3)*(Win/2^3), N1).
+ * - ba1: 1st affine layer biases vector, of shape (1, N1).
+ * - Wa2: 2nd affine layer weights (parameters) matrix, of shape (N1, K).
+ * - ba2: 2nd affine layer biases vector, of shape (1, K).
+ *
+ * Outputs:
+ * - None: the parameter matrices are written to files under `dir` in binary format.
+ */
+ write(Wc1, dir + "Wc1", format="binary")
+ write(bc1, dir + "bc1", format="binary")
+ write(Wc2, dir + "Wc2", format="binary")
+ write(bc2, dir + "bc2", format="binary")
+ write(Wc3, dir + "Wc3", format="binary")
+ write(bc3, dir + "bc3", format="binary")
+ write(Wa1, dir + "Wa1", format="binary")
+ write(ba1, dir + "ba1", format="binary")
+ write(Wa2, dir + "Wa2", format="binary")
+ write(ba2, dir + "ba2", format="binary")
+}
+
+predict = function(matrix[double] X, int C, int Hin, int Win,
+ matrix[double] Wc1, matrix[double] bc1,
+ matrix[double] Wc2, matrix[double] bc2,
+ matrix[double] Wc3, matrix[double] bc3,
+ matrix[double] Wa1, matrix[double] ba1,
+ matrix[double] Wa2, matrix[double] ba2)
+ return (matrix[double] probs) {
+ /*
+ * Computes the class probability predictions of a convolutional
+ * net using the "LeNet" architecture.
+ *
+ * The input matrix, X, has N examples, each represented as a 3D
+ * volume unrolled into a single vector.
+ *
+ * Inputs:
+ * - X: Input data matrix, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Wc1: 1st conv layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
+ * - bc1: 1st conv layer biases vector, of shape (F1, 1).
+ * - Wc2: 2nd conv layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
+ * - bc2: 2nd conv layer biases vector, of shape (F2, 1).
+ * - Wc3: 3rd conv layer weights (parameters) matrix, of shape (F3, F2*Hf*Wf).
+ * - bc3: 3rd conv layer biases vector, of shape (F3, 1).
+ * - Wa1: 1st affine layer weights (parameters) matrix, of shape (F3*(Hin/2^3)*(Win/2^3), N1).
+ * - ba1: 1st affine layer biases vector, of shape (1, N1).
+ * - Wa2: 2nd affine layer weights (parameters) matrix, of shape (N1, K).
+ * - ba2: 2nd affine layer biases vector, of shape (1, K).
+ *
+ * Outputs:
+ * - probs: Class probabilities, of shape (N, K).
+ */
+ N = nrow(X)
+
+ # Network:
+ # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> conv3 -> relu3 -> pool3
+ # -> affine1 -> relu1 -> affine2 -> softmax
+ Hf = 3 # filter height
+ Wf = 3 # filter width
+ stride = 1
+ pad = 1 # For same dimensions, (Hf - stride) / 2
+
+ F1 = nrow(Wc1) # num conv filters in conv1
+ F2 = nrow(Wc2) # num conv filters in conv2
+ F3 = nrow(Wc3) # num conv filters in conv3
+ N1 = ncol(Wa1) # num nodes in affine1
+ K = ncol(Wa2) # num nodes in affine2, equal to number of target dimensions (num classes)
+
+ # TODO: Implement fast, distributed conv & max pooling operators so that predictions
+ # can be computed in a full-batch, distributed manner. Alternatively, improve `parfor`
+ # so that it can be efficiently used for parallel predictions.
+ ## Compute forward pass
+ ### conv layer 1: conv1 -> relu1 -> pool1
+ #[outc1, Houtc1, Woutc1] = conv2d::forward(X, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride,
+ # pad, pad)
+ #outc1r = relu::forward(outc1)
+ #[outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+ # strideh=2, stridew=2, 0, 0)
+ ### conv layer 2: conv2 -> relu2 -> pool2
+ #[outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
+ # stride, stride, pad, pad)
+ #outc2r = relu::forward(outc2)
+ #[outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+ # strideh=2, stridew=2, 0, 0)
+ ### conv layer 3: conv3 -> relu3 -> pool3
+ #[outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
+ # stride, stride, pad, pad)
+ #outc3r = relu::forward(outc3)
+ #[outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
+ # strideh=2, stridew=2, 0, 0)
+ ### affine layer 1: affine1 -> relu1 -> dropout
+ #outa1 = affine::forward(outc3p, Wa1, ba1)
+ #outa1r = relu::forward(outa1)
+ ##[outa1d, maskad1] = dropout::forward(outa1r, 0.5, -1)
+ ### affine layer 2: affine2 -> softmax
+ #outa2 = affine::forward(outa1r, Wa2, ba2)
+ #probs = softmax::forward(outa2)
+
+ # Compute predictions over mini-batches
+ probs = matrix(0, rows=N, cols=K)
+ batch_size = 50
+ iters = ceil(N / batch_size)
+ for(i in 1:iters) {
+ # TODO: `parfor` should work here, possibly as an alternative to distributed predictions.
+ #parfor(i in 1:iters, check=0, mode=REMOTE_SPARK, resultmerge=REMOTE_SPARK) {
+ # Get next batch
+ beg = ((i-1) * batch_size) %% N + 1
+ end = min(N, beg + batch_size - 1)
+ X_batch = X[beg:end,]
+
+ # Compute forward pass
+ ## conv layer 1: conv1 -> relu1 -> pool1
+ [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+ outc1r = relu::forward(outc1)
+ [outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+ strideh=2, stridew=2, 0, 0)
+ ## conv layer 2: conv2 -> relu2 -> pool2
+ [outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
+ stride, stride, pad, pad)
+ outc2r = relu::forward(outc2)
+ [outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+ strideh=2, stridew=2, 0, 0)
+ ## conv layer 3: conv3 -> relu3 -> pool3
+ [outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
+ stride, stride, pad, pad)
+ outc3r = relu::forward(outc3)
+ [outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
+ strideh=2, stridew=2, 0, 0)
+ ## affine layer 1: affine1 -> relu1 -> dropout
+ outa1 = affine::forward(outc3p, Wa1, ba1)
+ outa1r = relu::forward(outa1)
+ #[outa1d, maskad1] = dropout::forward(outa1r, 0.5, -1)
+ ## affine layer 2: affine2 -> softmax
+ outa2 = affine::forward(outa1r, Wa2, ba2)
+ probs_batch = softmax::forward(outa2)
+
+ # Store predictions
+ probs[beg:end,] = probs_batch
+ }
+}
+
+eval = function(matrix[double] probs, matrix[double] Y)
+ return (double loss, double accuracy) {
+ /*
+ * Evaluates the predictions of the "LeNet"-style convolutional net.
+ *
+ * The probs matrix contains the class probability predictions
+ * of K classes over N examples. The targets, Y, have K classes,
+ * and are one-hot encoded.
+ *
+ * Inputs:
+ * - probs: Class probabilities, of shape (N, K).
+ * - Y: Target matrix, of shape (N, K).
+ *
+ * Outputs:
+ * - loss: Scalar loss, of shape (1).
+ * - accuracy: Scalar accuracy, of shape (1).
+ */
+ # Compute loss & accuracy
+ loss = cross_entropy_loss::forward(probs, Y)
+ correct_pred = rowIndexMax(probs) == rowIndexMax(Y)
+ accuracy = mean(correct_pred)
+}
+
+generate_dummy_data = function()
+ return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) {
+ /*
+ * Generate a dummy dataset similar to the breast cancer dataset.
+ *
+ * Outputs:
+ * - X: Input data matrix, of shape (N, D).
+ * - Y: Target matrix, of shape (N, K).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ */
+ # Generate dummy input data
+ N = 1024 # num examples
+ C = 3 # num input channels
+ Hin = 256 # input height
+ Win = 256 # input width
+ K = 3 # num target classes
+ X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
+ classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
+ Y = table(seq(1, N), classes, N, K) # one-hot encoding, padded to K columns
+}
+
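
(A minimal driver sketch for the file above; not part of this commit. It assumes the
`nn` library and `breastcancer/convnet.dml` are on the DML script path, and the
hyperparameter values and checkpoint directory shown are illustrative only.)

  source("breastcancer/convnet.dml") as convnet

  # dummy data: 1024 examples of shape (3, 256, 256) with 3 classes
  [X, Y, C, Hin, Win] = convnet::generate_dummy_data()
  X_train = X[1:896,];    Y_train = Y[1:896,]
  X_val = X[897:1024,];   Y_val = Y[897:1024,]

  # train for 1 epoch (args: lr, mu, decay, lambda, batch_size, epochs, log_interval, dir)
  [Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2] = convnet::train(X_train, Y_train,
      X_val, Y_val, C, Hin, Win, 0.001, 0.9, 0.99, 5e-04, 32, 1, 10, "checkpoints/")
  probs = convnet::predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3,
      Wa1, ba1, Wa2, ba2)
  [loss, accuracy] = convnet::eval(probs, Y_val)
  print("Val Loss: " + loss + ", Val Accuracy: " + accuracy)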
http://git-wip-us.apache.org/repos/asf/systemml/blob/532da1bc/projects/breast_cancer/breastcancer/convnet_distrib_sgd.dml
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/breastcancer/convnet_distrib_sgd.dml b/projects/breast_cancer/breastcancer/convnet_distrib_sgd.dml
new file mode 100644
index 0000000..0c5869e
--- /dev/null
+++ b/projects/breast_cancer/breastcancer/convnet_distrib_sgd.dml
@@ -0,0 +1,592 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Breast Cancer LeNet-like ConvNet Model
+ */
+# Imports
+source("nn/layers/affine.dml") as affine
+source("nn/layers/conv2d_builtin.dml") as conv2d
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/dropout.dml") as dropout
+source("nn/layers/l2_reg.dml") as l2_reg
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
+source("nn/layers/relu.dml") as relu
+source("nn/layers/softmax.dml") as softmax
+#source("nn/optim/adam.dml") as adam
+source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
+
+train = function(matrix[double] X, matrix[double] Y,
+ matrix[double] X_val, matrix[double] Y_val,
+ int C, int Hin, int Win,
+ double lr, double mu, double decay, double lambda,
+ int batch_size, int parallel_batches, int epochs,
+ int log_interval, string checkpoint_dir)
+ return (matrix[double] Wc1, matrix[double] bc1,
+ matrix[double] Wc2, matrix[double] bc2,
+ matrix[double] Wc3, matrix[double] bc3,
+ matrix[double] Wa1, matrix[double] ba1,
+ matrix[double] Wa2, matrix[double] ba2) {
+ /*
+ * Trains a convolutional net using a "LeNet"-like architecture.
+ *
+ * The input matrix, X, has N examples, each represented as a 3D
+ * volume unrolled into a single vector. The targets, Y, have K
+ * classes, and are one-hot encoded.
+ *
+ * Inputs:
+ * - X: Input data matrix, of shape (N, C*Hin*Win).
+ * - Y: Target matrix, of shape (N, K).
+ * - X_val: Input validation data matrix, of shape (N, C*Hin*Win).
+ * - Y_val: Target validation matrix, of shape (N, K).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - lr: Learning rate.
+ * - mu: Momentum value.
+ * Typical values are in the range of [0.5, 0.99], usually
+ * started at the lower end and annealed towards the higher end.
+ * - decay: Learning rate decay rate.
+ * - lambda: Regularization strength.
+ * - batch_size: Size of mini-batches to train on.
+ * - parallel_batches: Number of parallel batches to run for
+ * distributed SGD.
+ * - epochs: Total number of full training loops over the full data
+ * set.
+ * - log_interval: Interval, in iterations, between log outputs.
+ * - checkpoint_dir: Directory to store model checkpoints.
+ *
+ * Outputs:
+ * - Wc1: 1st conv layer weights (parameters) matrix, of
+ * shape (F1, C*Hf*Wf).
+ * - bc1: 1st conv layer biases vector, of shape (F1, 1).
+ * - Wc2: 2nd conv layer weights (parameters) matrix, of
+ * shape (F2, F1*Hf*Wf).
+ * - bc2: 2nd conv layer biases vector, of shape (F2, 1).
+ * - Wc3: 3rd conv layer weights (parameters) matrix, of
+ * shape (F3, F2*Hf*Wf).
+ * - bc3: 3rd conv layer biases vector, of shape (F3, 1).
+ * - Wa1: 1st affine layer weights (parameters) matrix, of
+ * shape (F3*(Hin/2^3)*(Win/2^3), N1).
+ * - ba1: 1st affine layer biases vector, of shape (1, N1).
+ * - Wa2: 2nd affine layer weights (parameters) matrix, of shape (N1, K).
+ * - ba2: 2nd affine layer biases vector, of shape (1, K).
+ */
+ N = nrow(X)
+ K = ncol(Y)
+
+ # Create network:
+ # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> conv3 -> relu3 -> pool3
+ # -> affine1 -> relu1 -> dropout1 -> affine2 -> softmax
+ Hf = 3 # filter height
+ Wf = 3 # filter width
+ stride = 1
+ pad = 1 # For same dimensions, (Hf - stride) / 2
+ F1 = 32 # num conv filters in conv1
+ F2 = 32 # num conv filters in conv2
+ F3 = 32 # num conv filters in conv3
+ N1 = 256 # num nodes in affine1
+ # Note: affine2 has K nodes, which is equal to the number of target dimensions (num classes)
+ [Wc1, bc1] = conv2d::init(F1, C, Hf, Wf) # inputs: (N, C*Hin*Win)
+ [Wc2, bc2] = conv2d::init(F2, F1, Hf, Wf) # inputs: (N, F1*(Hin/2)*(Win/2))
+ [Wc3, bc3] = conv2d::init(F3, F2, Hf, Wf) # inputs: (N, F2*(Hin/2^2)*(Win/2^2))
+ [Wa1, ba1] = affine::init(F3*(Hin/2^3)*(Win/2^3), N1) # inputs: (N, F3*(Hin/2^3)*(Win/2^3))
+ [Wa2, ba2] = affine::init(N1, K) # inputs: (N, N1)
+ Wa2 = Wa2 / sqrt(2) # different initialization, since being fed into softmax, instead of relu
+
+ # TODO: Compare optimizers once training is faster.
+ # Initialize SGD w/ Nesterov momentum optimizer
+ vWc1 = sgd_nesterov::init(Wc1); vbc1 = sgd_nesterov::init(bc1)
+ vWc2 = sgd_nesterov::init(Wc2); vbc2 = sgd_nesterov::init(bc2)
+ vWc3 = sgd_nesterov::init(Wc3); vbc3 = sgd_nesterov::init(bc3)
+ vWa1 = sgd_nesterov::init(Wa1); vba1 = sgd_nesterov::init(ba1)
+ vWa2 = sgd_nesterov::init(Wa2); vba2 = sgd_nesterov::init(ba2)
+ #[mWc1, vWc1] = adam::init(Wc1) # optimizer 1st & 2nd moment state for Wc1
+ #[mbc1, vbc1] = adam::init(bc1) # optimizer 1st & 2nd moment state for bc1
+ #[mWc2, vWc2] = adam::init(Wc2) # optimizer 1st & 2nd moment state for Wc2
+ #[mbc2, vbc2] = adam::init(bc2) # optimizer 1st & 2nd moment state for bc2
+ #[mWc3, vWc3] = adam::init(Wc3) # optimizer 1st & 2nd moment state for Wc3
+ #[mbc3, vbc3] = adam::init(bc3) # optimizer 1st & 2nd moment state for bc3
+ #[mWa1, vWa1] = adam::init(Wa1) # optimizer 1st & 2nd moment state for Wa1
+ #[mba1, vba1] = adam::init(ba1) # optimizer 1st & 2nd moment state for ba1
+ #[mWa2, vWa2] = adam::init(Wa2) # optimizer 1st & 2nd moment state for Wa2
+ #[mba2, vba2] = adam::init(ba2) # optimizer 1st & 2nd moment state for ba2
+ #beta1 = 0.9
+ #beta2 = 0.999
+ #eps = 1e-8
+
+ # TODO: Enable starting val metrics once fast, distributed predictions are available.
+ # Starting validation loss & accuracy
+ #probs_val = predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
+ #loss_val = cross_entropy_loss::forward(probs_val, Y_val)
+ #accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
+ ## Output results
+ #print("Start: Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
+
+ # Optimize
+ print("Starting optimization")
+ group_batch_size = parallel_batches*batch_size
+ groups = ceil(N/group_batch_size)
+ print("Total Epochs: "+epochs+", Batch size: "+batch_size+
+ ", Degree of parallelism: "+parallel_batches+", Group batch size: "+group_batch_size+
+ ", Num groups: "+groups)
+ # Loop over the dataset multiple times
+ for (e in 1:epochs) {
+ # Grab groups of mini-batches
+ for (g in 1:groups) {
+ # Get next group of mini-batches
+ # NOTE: At the end of the dataset, the last mini-batch in this group could be smaller than
+ # the other groups.
+ #group_beg = ((g-1) * group_batch_size) %% N + 1
+ #group_end = min(N, group_beg + group_batch_size - 1)
+ #X_group_batch = X[group_beg:group_end,]
+ #Y_group_batch = Y[group_beg:group_end,]
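+ # Experimental: the group is currently pinned to the first `group_batch_size` rows;
+ # the indexed group extraction above is left commented out for now.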
+ group_beg = 1
+ group_end = group_batch_size
+ X_group_batch = X[group_beg:group_end,]
+ Y_group_batch = Y[group_beg:group_end,]
+
+ # Create data structure to store gradients computed in parallel
+ dWc1_agg = matrix(0, rows=parallel_batches, cols=nrow(Wc1)*ncol(Wc1))
+ dWc2_agg = matrix(0, rows=parallel_batches, cols=nrow(Wc2)*ncol(Wc2))
+ dWc3_agg = matrix(0, rows=parallel_batches, cols=nrow(Wc3)*ncol(Wc3))
+ dWa1_agg = matrix(0, rows=parallel_batches, cols=nrow(Wa1)*ncol(Wa1))
+ dWa2_agg = matrix(0, rows=parallel_batches, cols=nrow(Wa2)*ncol(Wa2))
+ dbc1_agg = matrix(0, rows=parallel_batches, cols=nrow(bc1)*ncol(bc1))
+ dbc2_agg = matrix(0, rows=parallel_batches, cols=nrow(bc2)*ncol(bc2))
+ dbc3_agg = matrix(0, rows=parallel_batches, cols=nrow(bc3)*ncol(bc3))
+ dba1_agg = matrix(0, rows=parallel_batches, cols=nrow(ba1)*ncol(ba1))
+ dba2_agg = matrix(0, rows=parallel_batches, cols=nrow(ba2)*ncol(ba2))
+
+ # Run graph on each mini-batch in this group in parallel (ideally on multiple GPUs)
+ # NOTE: The parfor is causing the sizes to not be propagated into the body both here and
+ # in `predict`. It is not caused by the batch extraction below. It is the parfor.
+ parfor (j in 1:parallel_batches, mode=REMOTE_SPARK, opt=CONSTRAINED) {
+ #parfor (j in 1:parallel_batches) {
+ # Get a mini-batch in this group
+ beg = ((j-1) * batch_size) %% nrow(X_group_batch) + 1
+ end = min(nrow(X_group_batch), beg + batch_size - 1)
+ X_batch = X_group_batch[beg:end,]
+ Y_batch = Y_group_batch[beg:end,]
+ #beg = 1
+ #end = batch_size
+ #X_batch = X_group_batch[beg:end,]
+ #Y_batch = Y_group_batch[beg:end,]
+
+ # Compute forward pass
+ ## conv layer 1: conv1 -> relu1 -> pool1
+ [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+ outc1r = relu::forward(outc1)
+ [outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+ strideh=2, stridew=2, 0, 0)
+ ## conv layer 2: conv2 -> relu2 -> pool2
+ [outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
+ stride, stride, pad, pad)
+ outc2r = relu::forward(outc2)
+ [outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+ strideh=2, stridew=2, 0, 0)
+ ## conv layer 3: conv3 -> relu3 -> pool3
+ [outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
+ stride, stride, pad, pad)
+ outc3r = relu::forward(outc3)
+ [outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
+ strideh=2, stridew=2, 0, 0)
+ ## affine layer 1: affine1 -> relu1 -> dropout1
+ outa1 = affine::forward(outc3p, Wa1, ba1)
+ outa1r = relu::forward(outa1)
+ [outa1d, maskad1] = dropout::forward(outa1r, 0.5, -1)
+ ## affine layer 2: affine2 -> softmax
+ outa2 = affine::forward(outa1d, Wa2, ba2)
+ probs = softmax::forward(outa2)
+
+ # Compute data backward pass
+ ## loss:
+ dprobs = cross_entropy_loss::backward(probs, Y_batch)
+ ## affine layer 2: affine2 -> softmax
+ douta2 = softmax::backward(dprobs, outa2)
+ [douta1d, dWa2, dba2] = affine::backward(douta2, outa1d, Wa2, ba2)
+ ## affine layer 1: affine1 -> relu1 -> dropout1
+ douta1r = dropout::backward(douta1d, outa1r, 0.5, maskad1)
+ douta1 = relu::backward(douta1r, outa1)
+ [doutc3p, dWa1, dba1] = affine::backward(douta1, outc3p, Wa1, ba1)
+ ## conv layer 3: conv3 -> relu3 -> pool3
+ doutc3r = max_pool2d::backward(doutc3p, Houtc3p, Woutc3p, outc3r, F3, Houtc3, Woutc3,
+ Hf=2, Wf=2, strideh=2, stridew=2, 0, 0)
+ doutc3 = relu::backward(doutc3r, outc3)
+ [doutc2p, dWc3, dbc3] = conv2d::backward(doutc3, Houtc3, Woutc3, outc2p, Wc3, bc3, F2,
+ Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad)
+ ## conv layer 2: conv2 -> relu2 -> pool2
+ doutc2r = max_pool2d::backward(doutc2p, Houtc2p, Woutc2p, outc2r, F2, Houtc2, Woutc2,
+ Hf=2, Wf=2, strideh=2, stridew=2, 0, 0)
+ doutc2 = relu::backward(doutc2r, outc2)
+ [doutc1p, dWc2, dbc2] = conv2d::backward(doutc2, Houtc2, Woutc2, outc1p, Wc2, bc2, F1,
+ Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad)
+ ## conv layer 1: conv1 -> relu1 -> pool1
+ doutc1r = max_pool2d::backward(doutc1p, Houtc1p, Woutc1p, outc1r, F1, Houtc1, Woutc1,
+ Hf=2, Wf=2, strideh=2, stridew=2, 0, 0)
+ doutc1 = relu::backward(doutc1r, outc1)
+ [dX_batch, dWc1, dbc1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, Wc1, bc1, C,
+ Hin, Win, Hf, Wf, stride, stride, pad, pad)
+
+ # Compute regularization backward pass on weights
+ dWc1_reg = l2_reg::backward(Wc1, lambda)
+ dWc2_reg = l2_reg::backward(Wc2, lambda)
+ dWc3_reg = l2_reg::backward(Wc3, lambda)
+ dWa1_reg = l2_reg::backward(Wa1, lambda)
+ dWa2_reg = l2_reg::backward(Wa2, lambda)
+ dWc1 = dWc1 + dWc1_reg
+ dWc2 = dWc2 + dWc2_reg
+ dWc3 = dWc3 + dWc3_reg
+ dWa1 = dWa1 + dWa1_reg
+ dWa2 = dWa2 + dWa2_reg
+
+ # Flatten and store gradients for this parallel execution
+ # Note: We multiply by a weighting to allow for proper gradient averaging during the
+ # aggregation even with uneven batch sizes.
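+ # e.g., a group of 96 rows split into mini-batches of 50 and 46 rows gets weightings
+ # 50/96 and 46/96, so the colSums aggregation below yields the mean gradient over the group.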
+ weighting = nrow(X_batch) / nrow(X_group_batch)
+ dWc1_agg[j,] = matrix(dWc1, rows=1, cols=nrow(Wc1)*ncol(Wc1)) * weighting
+ dWc2_agg[j,] = matrix(dWc2, rows=1, cols=nrow(Wc2)*ncol(Wc2)) * weighting
+ dWc3_agg[j,] = matrix(dWc3, rows=1, cols=nrow(Wc3)*ncol(Wc3)) * weighting
+ dWa1_agg[j,] = matrix(dWa1, rows=1, cols=nrow(Wa1)*ncol(Wa1)) * weighting
+ dWa2_agg[j,] = matrix(dWa2, rows=1, cols=nrow(Wa2)*ncol(Wa2)) * weighting
+ dbc1_agg[j,] = matrix(dbc1, rows=1, cols=nrow(bc1)*ncol(bc1)) * weighting
+ dbc2_agg[j,] = matrix(dbc2, rows=1, cols=nrow(bc2)*ncol(bc2)) * weighting
+ dbc3_agg[j,] = matrix(dbc3, rows=1, cols=nrow(bc3)*ncol(bc3)) * weighting
+ dba1_agg[j,] = matrix(dba1, rows=1, cols=nrow(ba1)*ncol(ba1)) * weighting
+ dba2_agg[j,] = matrix(dba2, rows=1, cols=nrow(ba2)*ncol(ba2)) * weighting
+ }
+
+ # Aggregate gradients
+ # Note: The gradients are already pre-multiplied by a weight so that addition here
+ # results in gradient averaging even with different possible mini-batch sizes. I.e.,
+ # the final mini-batch at the end of the dataset could be smaller than the other mini-batches.
+ dWc1 = matrix(colSums(dWc1_agg), rows=nrow(Wc1), cols=ncol(Wc1))
+ dWc2 = matrix(colSums(dWc2_agg), rows=nrow(Wc2), cols=ncol(Wc2))
+ dWc3 = matrix(colSums(dWc3_agg), rows=nrow(Wc3), cols=ncol(Wc3))
+ dWa1 = matrix(colSums(dWa1_agg), rows=nrow(Wa1), cols=ncol(Wa1))
+ dWa2 = matrix(colSums(dWa2_agg), rows=nrow(Wa2), cols=ncol(Wa2))
+ dbc1 = matrix(colSums(dbc1_agg), rows=nrow(bc1), cols=ncol(bc1))
+ dbc2 = matrix(colSums(dbc2_agg), rows=nrow(bc2), cols=ncol(bc2))
+ dbc3 = matrix(colSums(dbc3_agg), rows=nrow(bc3), cols=ncol(bc3))
+ dba1 = matrix(colSums(dba1_agg), rows=nrow(ba1), cols=ncol(ba1))
+ dba2 = matrix(colSums(dba2_agg), rows=nrow(ba2), cols=ncol(ba2))
+
+ # Optimize with SGD w/ Nesterov momentum
+ [Wc1, vWc1] = sgd_nesterov::update(Wc1, dWc1, lr, mu, vWc1)
+ [Wc2, vWc2] = sgd_nesterov::update(Wc2, dWc2, lr, mu, vWc2)
+ [Wc3, vWc3] = sgd_nesterov::update(Wc3, dWc3, lr, mu, vWc3)
+ [Wa1, vWa1] = sgd_nesterov::update(Wa1, dWa1, lr, mu, vWa1)
+ [Wa2, vWa2] = sgd_nesterov::update(Wa2, dWa2, lr, mu, vWa2)
+ [bc1, vbc1] = sgd_nesterov::update(bc1, dbc1, lr, mu, vbc1)
+ [bc2, vbc2] = sgd_nesterov::update(bc2, dbc2, lr, mu, vbc2)
+ [bc3, vbc3] = sgd_nesterov::update(bc3, dbc3, lr, mu, vbc3)
+ [ba1, vba1] = sgd_nesterov::update(ba1, dba1, lr, mu, vba1)
+ [ba2, vba2] = sgd_nesterov::update(ba2, dba2, lr, mu, vba2)
+ #t = e*i - 1
+ #[Wc1, mWc1, vWc1] = adam::update(Wc1, dWc1, lr, beta1, beta2, eps, t, mWc1, vWc1)
+ #[bc1, mbc1, vbc1] = adam::update(bc1, dbc1, lr, beta1, beta2, eps, t, mbc1, vbc1)
+ #[Wc2, mWc2, vWc2] = adam::update(Wc2, dWc2, lr, beta1, beta2, eps, t, mWc2, vWc2)
+ #[bc2, mbc2, vbc2] = adam::update(bc2, dbc2, lr, beta1, beta2, eps, t, mbc2, vbc2)
+ #[Wc3, mWc3, vWc3] = adam::update(Wc3, dWc3, lr, beta1, beta2, eps, t, mWc3, vWc3)
+ #[bc3, mbc3, vbc3] = adam::update(bc3, dbc3, lr, beta1, beta2, eps, t, mbc3, vbc3)
+ #[Wa1, mWa1, vWa1] = adam::update(Wa1, dWa1, lr, beta1, beta2, eps, t, mWa1, vWa1)
+ #[ba1, mba1, vba1] = adam::update(ba1, dba1, lr, beta1, beta2, eps, t, mba1, vba1)
+ #[Wa2, mWa2, vWa2] = adam::update(Wa2, dWa2, lr, beta1, beta2, eps, t, mWa2, vWa2)
+ #[ba2, mba2, vba2] = adam::update(ba2, dba2, lr, beta1, beta2, eps, t, mba2, vba2)
+
+ # Compute loss & accuracy for training data every `log_interval` iterations.
+ if (g %% log_interval == 0) {
+ print("Logging training loss & accuracy for group " + g + ":")
+ # Get a mini-batch in this group
+ #j = 0
+ #beg = ((j-1) * batch_size) %% nrow(X_group_batch) + 1
+ #end = min(nrow(X_group_batch), beg + batch_size - 1)
+ #X_batch = X_group_batch[beg:end,]
+ #Y_batch = Y_group_batch[beg:end,]
+
+ # Compute training loss & accuracy using final
+ #probs = predict(X_batch, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
+ #loss_data = cross_entropy_loss::forward(probs, Y_batch)
+ probs = predict(X_group_batch, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1,
+ Wa2, ba2, batch_size)
+ loss_data = cross_entropy_loss::forward(probs, Y_group_batch)
+ loss_reg_Wc1 = l2_reg::forward(Wc1, lambda)
+ loss_reg_Wc2 = l2_reg::forward(Wc2, lambda)
+ loss_reg_Wc3 = l2_reg::forward(Wc3, lambda)
+ loss_reg_Wa1 = l2_reg::forward(Wa1, lambda)
+ loss_reg_Wa2 = l2_reg::forward(Wa2, lambda)
+ loss = loss_data + loss_reg_Wc1 + loss_reg_Wc2 + loss_reg_Wc3 + loss_reg_Wa1 + loss_reg_Wa2
+ #accuracy = mean(rowIndexMax(probs) == rowIndexMax(Y_batch))
+ accuracy = mean(rowIndexMax(probs) == rowIndexMax(Y_group_batch))
+
+ # TODO: Consider enabling val metrics here once fast, distributed predictions are available.
+ ## Compute validation loss & accuracy
+ #probs_val = predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
+ #loss_val = cross_entropy_loss::forward(probs_val, Y_val)
+ #accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
+
+ # Output results
+ print("Epoch: " + e + ", Group: " + g + ", Train Loss: " + loss + ", Train Accuracy: "
+ + accuracy) # + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
+ }
+ }
+
+ # Compute validation loss & accuracy for validation data every epoch
+ print("Logging validation loss & accuracy.")
+ probs_val = predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2,
+ batch_size)
+ loss_val = cross_entropy_loss::forward(probs_val, Y_val)
+ accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
+
+ # Output results
+ print("Epoch: " + e + "/" + epochs + ", Val Loss: " + loss_val
+ + ", Val Accuracy: " + accuracy_val + ", lr: " + lr + ", mu " + mu)
+
+ # Checkpoint model
+ dir = checkpoint_dir + e + "/"
+ dummy = checkpoint(dir, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
+ str = "lr: " + lr + ", mu: " + mu + ", decay: " + decay + ", lambda: " + lambda
+ + ", batch_size: " + batch_size
+ name = dir + accuracy_val
+ write(str, name)
+
+ # Anneal momentum towards 0.999
+ mu = mu + (0.999 - mu)/(1+epochs-e)
+ # Decay learning rate
+ lr = lr * decay
+ }
+}
+
+checkpoint = function(string dir,
+ matrix[double] Wc1, matrix[double] bc1,
+ matrix[double] Wc2, matrix[double] bc2,
+ matrix[double] Wc3, matrix[double] bc3,
+ matrix[double] Wa1, matrix[double] ba1,
+ matrix[double] Wa2, matrix[double] ba2) {
+ /*
+ * Save the model parameters.
+ *
+ * Inputs:
+ * - dir: Directory in which to save model parameters.
+ * - Wc1: 1st conv layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
+ * - bc1: 1st conv layer biases vector, of shape (F1, 1).
+ * - Wc2: 2nd conv layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
+ * - bc2: 2nd conv layer biases vector, of shape (F2, 1).
+ * - Wc3: 3rd conv layer weights (parameters) matrix, of shape (F3, F2*Hf*Wf).
+ * - bc3: 3rd conv layer biases vector, of shape (F3, 1).
+ * - Wa1: 1st affine layer weights (parameters) matrix, of shape (F3*(Hin/2^3)*(Win/2^3), N1).
+ * - ba1: 1st affine layer biases vector, of shape (1, N1).
+ * - Wa2: 2nd affine layer weights (parameters) matrix, of shape (N1, K).
+ * - ba2: 2nd affine layer biases vector, of shape (1, K).
+ *
+ * Outputs:
+ * - None: the parameter matrices are written to files under `dir` in binary format.
+ */
+ write(Wc1, dir + "Wc1", format="binary")
+ write(bc1, dir + "bc1", format="binary")
+ write(Wc2, dir + "Wc2", format="binary")
+ write(bc2, dir + "bc2", format="binary")
+ write(Wc3, dir + "Wc3", format="binary")
+ write(bc3, dir + "bc3", format="binary")
+ write(Wa1, dir + "Wa1", format="binary")
+ write(ba1, dir + "ba1", format="binary")
+ write(Wa2, dir + "Wa2", format="binary")
+ write(ba2, dir + "ba2", format="binary")
+}
+
+predict = function(matrix[double] X, int C, int Hin, int Win,
+ matrix[double] Wc1, matrix[double] bc1,
+ matrix[double] Wc2, matrix[double] bc2,
+ matrix[double] Wc3, matrix[double] bc3,
+ matrix[double] Wa1, matrix[double] ba1,
+ matrix[double] Wa2, matrix[double] ba2,
+ int batch_size)
+ return (matrix[double] probs) {
+ /*
+ * Computes the class probability predictions of a convolutional
+ * net using the "LeNet" architecture.
+ *
+ * The input matrix, X, has N examples, each represented as a 3D
+ * volume unrolled into a single vector.
+ *
+ * Inputs:
+ * - X: Input data matrix, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Wc1: 1st conv layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
+ * - bc1: 1st conv layer biases vector, of shape (F1, 1).
+ * - Wc2: 2nd conv layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
+ * - bc2: 2nd conv layer biases vector, of shape (F2, 1).
+ * - Wc3: 3rd conv layer weights (parameters) matrix, of shape (F3, F2*Hf*Wf).
+ * - bc3: 3rd conv layer biases vector, of shape (F3, 1).
+ * - Wa1: 1st affine layer weights (parameters) matrix, of shape (F3*(Hin/2^3)*(Win/2^3), N1).
+ * - ba1: 1st affine layer biases vector, of shape (1, N1).
+ * - Wa2: 2nd affine layer weights (parameters) matrix, of shape (N1, K).
+ * - ba2: 2nd affine layer biases vector, of shape (1, K).
+ * - batch_size: Size of prediction mini-batches (currently unused; the parfor below scores one example at a time).
+ *
+ * Outputs:
+ * - probs: Class probabilities, of shape (N, K).
+ */
+ N = nrow(X)
+
+ # Network:
+ # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> conv3 -> relu3 -> pool3
+ # -> affine1 -> relu1 -> affine2 -> softmax
+ Hf = 3 # filter height
+ Wf = 3 # filter width
+ stride = 1
+ pad = 1 # For same dimensions, (Hf - stride) / 2
+
+ F1 = nrow(Wc1) # num conv filters in conv1
+ F2 = nrow(Wc2) # num conv filters in conv2
+ F3 = nrow(Wc3) # num conv filters in conv3
+ N1 = ncol(Wa1) # num nodes in affine1
+ K = ncol(Wa2) # num nodes in affine2, equal to number of target dimensions (num classes)
+
+ # TODO: Implement fast, distributed conv & max pooling operators so that predictions
+ # can be computed in a full-batch, distributed manner. Alternatively, improve `parfor`
+ # so that it can be efficiently used for parallel predictions.
+ ## Compute forward pass
+ ### conv layer 1: conv1 -> relu1 -> pool1
+ #[outc1, Houtc1, Woutc1] = conv2d::forward(X, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride,
+ # pad, pad)
+ #outc1r = relu::forward(outc1)
+ #[outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+ # strideh=2, stridew=2, 0, 0)
+ ### conv layer 2: conv2 -> relu2 -> pool2
+ #[outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
+ # stride, stride, pad, pad)
+ #outc2r = relu::forward(outc2)
+ #[outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+ # strideh=2, stridew=2, 0, 0)
+ ### conv layer 3: conv3 -> relu3 -> pool3
+ #[outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
+ # stride, stride, pad, pad)
+ #outc3r = relu::forward(outc3)
+ #[outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
+ # strideh=2, stridew=2, 0, 0)
+ ### affine layer 1: affine1 -> relu1 -> dropout
+ #outa1 = affine::forward(outc3p, Wa1, ba1)
+ #outa1r = relu::forward(outa1)
+ ##[outa1d, maskad1] = dropout::forward(outa1r, 0.5, -1)
+ ### affine layer 2: affine2 -> softmax
+ #outa2 = affine::forward(outa1r, Wa2, ba2)
+ #probs = softmax::forward(outa2)
+
+ # Compute predictions over mini-batches
+ probs = matrix(0, rows=N, cols=K)
+ #batch_size = 32
+ #iters = ceil(N / batch_size)
+ # TODO: `parfor` should work here, possibly as an alternative to distributed predictions.
+ #for (i in 1:iters) {
+ #parfor (i in 1:iters, check=0, mode=REMOTE_SPARK, opt=CONSTRAINED) {
+ #parfor (i in 1:iters, check=0) { # complains about `probs` as an inter-loop dependency
+ parfor (i in 1:N, check=0, mode=REMOTE_SPARK, opt=CONSTRAINED) {
+ #parfor (i in 1:N) {
+ ## Get next batch
+ #beg = ((i-1) * batch_size) %% N + 1
+ #end = min(N, beg + batch_size - 1)
+ #X_batch = X[beg:end,]
+ #X_batch = X[i,]
+
+ # Compute forward pass
+ ## conv layer 1: conv1 -> relu1 -> pool1
+ [outc1, Houtc1, Woutc1] = conv2d::forward(X[i,], Wc1, bc1, C, Hin, Win, Hf, Wf,
+ stride, stride, pad, pad)
+ outc1r = relu::forward(outc1)
+ [outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+ strideh=2, stridew=2, 0, 0)
+ ## conv layer 2: conv2 -> relu2 -> pool2
+ [outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
+ stride, stride, pad, pad)
+ outc2r = relu::forward(outc2)
+ [outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+ strideh=2, stridew=2, 0, 0)
+ ## conv layer 3: conv3 -> relu3 -> pool3
+ [outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
+ stride, stride, pad, pad)
+ outc3r = relu::forward(outc3)
+ [outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
+ strideh=2, stridew=2, 0, 0)
+ ## affine layer 1: affine1 -> relu1 -> dropout
+ outa1 = affine::forward(outc3p, Wa1, ba1)
+ outa1r = relu::forward(outa1)
+ #[outa1d, maskad1] = dropout::forward(outa1r, 0.5, -1)
+ ## affine layer 2: affine2 -> softmax
+ outa2 = affine::forward(outa1r, Wa2, ba2)
+ probs_batch = softmax::forward(outa2)
+
+ # Store predictions
+ #probs[beg:end,] = probs_batch
+ probs[i,] = probs_batch
+ }
+}
+
+eval = function(matrix[double] probs, matrix[double] Y)
+ return (double loss, double accuracy) {
+ /*
+ * Evaluates the predictions of the "LeNet"-style convolutional net.
+ *
+ * The probs matrix contains the class probability predictions
+ * of K classes over N examples. The targets, Y, have K classes,
+ * and are one-hot encoded.
+ *
+ * Inputs:
+ * - probs: Class probabilities, of shape (N, K).
+ * - Y: Target matrix, of shape (N, K).
+ *
+ * Outputs:
+ * - loss: Scalar loss, of shape (1).
+ * - accuracy: Scalar accuracy, of shape (1).
+ */
+ # Compute loss & accuracy
+ loss = cross_entropy_loss::forward(probs, Y)
+ correct_pred = rowIndexMax(probs) == rowIndexMax(Y)
+ accuracy = mean(correct_pred)
+}
+
+generate_dummy_data = function(int N)
+ return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) {
+ /*
+ * Generate a dummy dataset similar to the breast cancer dataset.
+ *
+ * Inputs:
+ * - N: Number of examples to generate.
+ *
+ * Outputs:
+ * - X: Input data matrix, of shape (N, D).
+ * - Y: Target matrix, of shape (N, K).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ */
+ # Generate dummy input data
+ #N = 1024 # num examples
+ C = 3 # num input channels
+ Hin = 256 # input height
+ Win = 256 # input width
+ K = 3 # num target classes
+ X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
+ classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
+ Y = table(seq(1, N), classes, N, K) # one-hot encoding
+}
+
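
(Likewise, a hypothetical driver sketch for the distributed variant above; not part of
this commit, values illustrative. Note the extra `parallel_batches` argument to `train`
and the explicit `batch_size` argument to `predict`.)

  source("breastcancer/convnet_distrib_sgd.dml") as convnet

  [X, Y, C, Hin, Win] = convnet::generate_dummy_data(1024)
  # 4 mini-batches of 32 per parallel group, 1 epoch; dummy data reused for validation
  [Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2] = convnet::train(X, Y, X, Y,
      C, Hin, Win, 0.001, 0.9, 0.99, 5e-04, 32, 4, 1, 1, "checkpoints/")
  probs = convnet::predict(X, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3,
      Wa1, ba1, Wa2, ba2, 32)
  [loss, accuracy] = convnet::eval(probs, Y)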
http://git-wip-us.apache.org/repos/asf/systemml/blob/532da1bc/projects/breast_cancer/breastcancer/input_data.py
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/breastcancer/input_data.py b/projects/breast_cancer/breastcancer/input_data.py
new file mode 100644
index 0000000..6e65a3e
--- /dev/null
+++ b/projects/breast_cancer/breastcancer/input_data.py
@@ -0,0 +1,229 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+"""
+Data utilities for the TUPAC16 breast cancer project.
+
+This is early, experimental code.
+
+TODO: Cleanup & add proper comments to all functions.
+"""
+import multiprocessing as mp
+import os
+import threading
+
+import numpy as np
+import py4j
+
+
+# Utils for reading data
+
+def compute_channel_means(rdd, channels, size):
+ """Compute the means of each color channel across the dataset."""
+ # TODO: Replace this with pyspark.ml.feature.VectorSlicer
+ # to cut vector into separate channel vectors, then grab the mean
+ # of those new columns, all using DataFrame functions, rather than
+ # RDD functions.
+ # from pyspark.ml.linalg import VectorUDT
+ # from pyspark.sql.functions import udf
+ # from pyspark.sql import functions as F
+ # as_ml = udf(lambda v: v.asML() if v is not None else None, VectorUDT())
+ # slicers[0].transform(train_df.withColumn("sample", as_ml("sample"))).select(F.avg("ch0"))
+ # slicers = [VectorSlicer(inputCol="sample", outputCol="ch{}".format(c), indices=range(c*pixels, c*pixels + pixels)) for c in range(CHANNELS)]
+ def helper(x):
+ x = x.sample.values
+ x = np.array(x)
+ x = (x.reshape((-1,channels,size,size)) # shape (N,C,H,W)
+ .transpose((0,2,3,1)) # shape (N,H,W,C)
+ .astype(np.float32))
+ mu = np.mean(x, axis=(0,1,2))
+ return mu
+
+ means = rdd.map(helper).collect()
+ means = np.array(means)
+ means = np.mean(means, axis=0)
+ return means
+
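The reshape/transpose dance in helper is easiest to see on a small made-up array.
A self-contained NumPy illustration of the same per-channel mean computation (all
dimensions below are arbitrary):

    import numpy as np

    channels, size = 3, 64
    x = np.random.rand(8, channels * size * size)   # 8 flattened samples
    x = (x.reshape((-1, channels, size, size))      # shape (N, C, H, W)
           .transpose((0, 2, 3, 1))                 # shape (N, H, W, C)
           .astype(np.float32))
    mu = np.mean(x, axis=(0, 1, 2))                 # one mean per channel
    print(mu.shape)                                 # (3,)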
+
+def gen_class_weights(df):
+ """Generate class weights to even out the class distribution during training."""
+ class_counts_df = df.select("tumor_score").groupBy("tumor_score").count()
+ class_counts = {row["tumor_score"]:row["count"] for row in class_counts_df.collect()}
+ max_count = max(class_counts.values())
+ class_weights = {k-1:max_count/v for k,v in class_counts.items()}
+ return class_weights
+
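Concretely, gen_class_weights scales each class inversely to its frequency relative
to the largest class, shifting keys to be zero-based. With made-up counts:

    # Hypothetical counts for tumor_score classes 1..3.
    class_counts = {1: 100, 2: 50, 3: 25}
    max_count = max(class_counts.values())    # 100
    class_weights = {k - 1: max_count / v for k, v in class_counts.items()}
    print(class_weights)                      # {0: 1.0, 1: 2.0, 2: 4.0}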
+
+def read_data(spark_session, filename_template, sample_size, channels, sample_prob, normalize_class_distribution, seed):
+ """Read and return training & validation Spark DataFrames."""
+ # TODO: Clean this function up!!!
+ assert channels in (1, 3)
+ grayscale = (channels == 1)
+
+ # Sample (Optional)
+ if sample_prob < 1:
+ try:
+ # Ex: `train_0.01_sample_256.parquet`
+ sampled_filename_template = filename_template.format("{}_sample_".format(sample_prob), sample_size, "_grayscale" if grayscale else "")
+ filename = os.path.join("data", sampled_filename_template)
+ df = spark_session.read.load(filename)
+ except Exception: # Pre-sampled DataFrame not available
+ filename = os.path.join("data", filename_template.format("", sample_size, "_grayscale" if grayscale else ""))
+ df = spark_session.read.load(filename)
+ p = sample_prob # sample fraction
+ if normalize_class_distribution:
+ # stratified sample with even class proportions
+ n = df.count() # num examples
+ K = 3 # num classes
+ s = p * n # target num examples in the sample (may be fractional)
+ s_k = s / K # target num examples per class for an even distribution
+ class_counts_df = df.select("tumor_score").groupBy("tumor_score").count()
+ class_counts = {row["tumor_score"]:row["count"] for row in class_counts_df.collect()}
+ ps = {k:s_k/v for k,v in class_counts.items()}
+ df = df.sampleBy("tumor_score", fractions=ps, seed=seed)
+ else:
+ # stratified sample maintaining the original class proportions
+ df = df.sampleBy("tumor_score", fractions={1: p, 2: p, 3: p}, seed=seed)
+ # TODO: Determine if coalesce actually provides a perf benefit on Spark 2.x
+ #train_df.cache(), val_df.cache() # cache here, or coalesce will hang
+ # tc = train_df.count()
+ # vc = val_df.count()
+ #
+ # # Reduce num partitions to ideal size (~128 MB/partition, determined empirically)
+ # current_tr_parts = train_df.rdd.getNumPartitions()
+ # current_val_parts = val_df.rdd.getNumPartitions()
+ # ex_mb = sample_size * sample_size * channels * 8 / 1024 / 1024 # size of one example in MB
+ # ideal_part_size_mb = 128 # 128 MB partitions sizes are empirically ideal
+ # ideal_exs_per_part = round(ideal_part_size_mb / ex_mb)
+ # tr_parts = round(tc / ideal_exs_per_part)
+ # val_parts = round(vc / ideal_exs_per_part)
+ # if current_tr_parts > tr_parts:
+ # train_df = train_df.coalesce(tr_parts)
+ # if current_val_parts > val_parts:
+ # val_df = val_df.coalesce(val_parts)
+ # train_df.cache(), val_df.cache()
+ else:
+ # Read in data
+ filename = os.path.join("data", filename_template.format("", sample_size, "_grayscale" if grayscale else ""))
+ df = spark_session.read.load(filename)
+
+ return df
+
+
+def read_train_data(spark_session, sample_size, channels, sample_prob=1, normalize_class_distribution=False, seed=42):
+ """Read training Spark DataFrame."""
+ filename = "train_{}{}{}_updated.parquet"
+ train_df = read_data(spark_session, filename, sample_size, channels, sample_prob, normalize_class_distribution, seed)
+ return train_df
+
+
+def read_val_data(spark_session, sample_size, channels, sample_prob=1, normalize_class_distribution=False, seed=42):
+ """Read validation Spark DataFrame."""
+ filename = "val_{}{}{}_updated.parquet"
+ val_df = read_data(spark_session, filename, sample_size, channels, sample_prob, normalize_class_distribution, seed)
+ return val_df
+
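A minimal usage sketch for the readers above, assuming the breastcancer package is
on the Python path and a Parquet dataset exists under data/ (the templates expand
to, e.g., train_0.01_sample_256_updated.parquet):

    from pyspark.sql import SparkSession
    from breastcancer.input_data import read_train_data, read_val_data

    spark = SparkSession.builder.appName("breastcancer").getOrCreate()
    # 256x256 RGB samples; 1% stratified sample with even class proportions.
    train_df = read_train_data(spark, sample_size=256, channels=3,
                               sample_prob=0.01, normalize_class_distribution=True)
    val_df = read_val_data(spark, sample_size=256, channels=3)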
+
+# Utils for creating asynchronous queuing batch generators
+# TODO: Add comments to these functions
+
+def fill_partition_num_queue(partition_num_queue, num_partitions, stop_event):
+ partition_num_queue.cancel_join_thread()
+ while not stop_event.is_set():
+ for i in range(num_partitions):
+ partition_num_queue.put(i)
+
+
+def fill_partition_queue(partition_queue, partition_num_queue, rdd, stop_event):
+ partition_queue.cancel_join_thread()
+ while not stop_event.is_set():
+ # py4j can raise transient errors while the JVM gateway is first starting.
+ try:
+ partition_num = partition_num_queue.get()
+ partition = rdd.context.runJob(rdd, lambda x: x, [partition_num])
+ partition_queue.put(partition)
+ except Exception as err: # also covers AttributeError & py4j.protocol.Py4JError
+ print("error: {}".format(err))
+
+
+def fill_row_queue(row_queue, partition_queue, stop_event):
+ row_queue.cancel_join_thread()
+ while not stop_event.is_set():
+ rows = partition_queue.get()
+ for row in rows:
+ row_queue.put(row)
+
+
+def gen_batch(row_queue, batch_size):
+ while True:
+ features = []
+ labels = []
+ for i in range(batch_size):
+ row = row_queue.get()
+ features.append(row.sample.values)
+ labels.append(row.tumor_score)
+ x_batch = np.array(features).astype(np.uint8)
+ y_batch = np.array(labels).astype(np.uint8)
+ yield x_batch, y_batch
+
+
+def create_batch_generator(
+ rdd, batch_size=32, num_partition_threads=32, num_row_processes=16,
+ partition_num_queue_size=128, partition_queue_size=16, row_queue_size=2048):
+ """
+ Create a multiprocess batch generator.
+
+ This creates a generator that uses processes and threads to create a
+ pipeline that asynchronously fetches data from Spark, filling a set
+ of queues, while yielding batches. The goal here is to amortize the
+ time needed to fetch data from Spark so that downstream consumers
+ are saturated.
+ """
+ #rdd.cache()
+ partition_num_queue = mp.Queue(partition_num_queue_size)
+ partition_queue = mp.Queue(partition_queue_size)
+ row_queue = mp.Queue(row_queue_size)
+
+ num_partitions = rdd.getNumPartitions()
+ stop_event = mp.Event()
+
+ partition_num_process = mp.Process(target=fill_partition_num_queue, args=(partition_num_queue, num_partitions, stop_event), daemon=True)
+ partition_threads = [threading.Thread(target=fill_partition_queue, args=(partition_queue, partition_num_queue, rdd, stop_event), daemon=True) for _ in range(num_partition_threads)]
+ row_processes = [mp.Process(target=fill_row_queue, args=(row_queue, partition_queue, stop_event), daemon=True) for _ in range(num_row_processes)]
+
+ ps = [partition_num_process] + row_processes + partition_threads
+ queues = [partition_num_queue, partition_queue, row_queue]
+
+ for p in ps:
+ p.start()
+
+ generator = gen_batch(row_queue, batch_size)
+ return generator, ps, queues, stop_event
+
+
+def stop(processes, stop_event):
+ """Stop queuing processes."""
+ stop_event.set()
+ for p in processes:
+ if isinstance(p, mp.Process):
+ p.terminate()
+ mp.active_children() # Used to join the terminated processes above.
+
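Putting the pipeline together, a minimal sketch (assuming train_df is a DataFrame
with sample and tumor_score columns, e.g. from read_train_data above):

    from breastcancer.input_data import create_batch_generator, stop

    generator, procs, queues, stop_event = create_batch_generator(train_df.rdd,
                                                                  batch_size=32)
    for _ in range(10):                 # consume a few batches
        x_batch, y_batch = next(generator)
        print(x_batch.shape, y_batch.shape)
    stop(procs, stop_event)             # shut down the queuing processes
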
http://git-wip-us.apache.org/repos/asf/systemml/blob/532da1bc/projects/breast_cancer/breastcancer/softmax_clf.dml
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/breastcancer/softmax_clf.dml b/projects/breast_cancer/breastcancer/softmax_clf.dml
new file mode 100644
index 0000000..35fd545
--- /dev/null
+++ b/projects/breast_cancer/breastcancer/softmax_clf.dml
@@ -0,0 +1,207 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Breast Cancer Softmax Model
+ */
+# Imports
+source("nn/layers/affine.dml") as affine
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/softmax.dml") as softmax
+#source("nn/optim/adam.dml") as adam
+source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
+
+train = function(matrix[double] X, matrix[double] Y,
+ matrix[double] X_val, matrix[double] Y_val,
+ double lr, double mu, double decay,
+ int batch_size, int epochs, int log_interval)
+ return (matrix[double] W, matrix[double] b) {
+ /*
+ * Trains a softmax classifier.
+ *
+ * The input matrix, X, has N examples, each with D features.
+ * The targets, Y, have K classes, and are one-hot encoded.
+ *
+ * Inputs:
+ * - X: Input data matrix, of shape (N, D).
+ * - Y: Target matrix, of shape (N, K).
+ * - X_val: Input validation data matrix, of shape (N, D).
+ * - Y_val: Target validation matrix, of shape (N, K).
+ * - lr: Learning rate.
+ * - mu: Momentum value.
+ * Typical values are in the range of [0.5, 0.99], usually
+ * started at the lower end and annealed towards the higher end.
+ * - decay: Learning rate decay rate.
+ * - batch_size: Size of mini-batches to train on.
+ * - epochs: Total number of full training loops over the full data set.
+ * - log_interval: Interval, in iterations, between log outputs.
+ *
+ * Outputs:
+ * - W: Weights (parameters) matrix, of shape (D, K).
+ * - b: Biases vector, of shape (1, K).
+ */
+ N = nrow(Y) # num examples
+ D = ncol(X) # num features
+ K = ncol(Y) # num classes
+
+ # Create softmax classifier:
+ # affine -> softmax
+ [W, b] = affine::init(D, K)
+ W = W / sqrt(2.0/(D)) * sqrt(1/(D)) # rescale affine::init's He scaling, sqrt(2/D), to sqrt(1/D)
+
+ # Initialize SGD w/ Nesterov momentum optimizer
+ vW = sgd_nesterov::init(W) # optimizer momentum state for W
+ vb = sgd_nesterov::init(b) # optimizer momentum state for b
+ #[mW, vW] = adam::init(W) # optimizer 1st & 2nd moment state for W
+ #[mb, vb] = adam::init(b) # optimizer 1st & 2nd moment state for b
+
+ # Starting validation loss & accuracy
+ probs_val = predict(X_val, W, b)
+ loss_val = cross_entropy_loss::forward(probs_val, Y_val)
+ accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
+ # Output results
+ print("Start: Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
+
+ # Optimize
+ print("Starting optimization")
+ iters = ceil(N / batch_size)
+ for (e in 1:epochs) {
+ for(i in 1:iters) {
+ # Get next batch
+ beg = ((i-1) * batch_size) %% N + 1
+ end = min(N, beg + batch_size - 1)
+ #print("Epoch: " + e + ", Iter: " + i + ", X[" + beg + ":" + end + ",]")
+ X_batch = X[beg:end,]
+ Y_batch = Y[beg:end,]
+
+ # Compute forward pass
+ ## affine & softmax:
+ out = affine::forward(X_batch, W, b)
+ probs = softmax::forward(out)
+
+ # Compute backward pass
+ ## loss:
+ dprobs = cross_entropy_loss::backward(probs, Y_batch)
+ ## affine & softmax:
+ dout = softmax::backward(dprobs, out)
+ [dX_batch, dW, db] = affine::backward(dout, X_batch, W, b)
+
+ # Optimize with SGD w/ Nesterov momentum
+ [W, vW] = sgd_nesterov::update(W, dW, lr, mu, vW)
+ [b, vb] = sgd_nesterov::update(b, db, lr, mu, vb)
+ #[W, mW, vW] = adam::update(W, dW, lr, 0.9, 0.999, 1e-8, e*i-1, mW, vW)
+ #[b, mb, vb] = adam::update(b, db, lr, 0.9, 0.999, 1e-8, e*i-1, mb, vb)
+
+ # Compute loss & accuracy for training & validation data every `log_interval` iterations.
+ if (i %% log_interval == 0) {
+ #print("Eval time! - i: " + i)
+ # Compute training loss & accuracy
+ loss = cross_entropy_loss::forward(probs, Y_batch)
+ accuracy = mean(rowIndexMax(probs) == rowIndexMax(Y_batch))
+
+ # Compute validation loss & accuracy
+ probs_val = predict(X_val, W, b)
+ loss_val = cross_entropy_loss::forward(probs_val, Y_val)
+ accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
+
+ # Output results
+ print("Epoch: " + e + "/" + epochs + ", Iter: " + i + "/" + iters
+ + ", Train Loss: " + loss + ", Train Accuracy: " + accuracy + ", Val Loss: "
+ + loss_val + ", Val Accuracy: " + accuracy_val + ", lr: " + lr + ", mu " + mu)
+ }
+ }
+ # Anneal momentum towards 0.999
+ mu = mu + (0.999 - mu)/(1+epochs-e)
+ # Decay learning rate
+ lr = lr * decay
+ }
+}
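
For readers more at home in NumPy, the following sketch mirrors one iteration of
the loop body above. It folds the softmax and cross-entropy backward passes into
the standard combined gradient, (probs - Y)/N, and uses the same Nesterov momentum
formulation as sgd_nesterov::update; all names and sizes here are illustrative:

    import numpy as np

    N, D, K = 64, 10, 3
    rng = np.random.default_rng(42)
    X = rng.standard_normal((N, D))
    Y = np.eye(K)[rng.integers(0, K, size=N)]      # one-hot targets
    W = rng.standard_normal((D, K)) / np.sqrt(D)   # ~sqrt(1/D) scaling, as above
    b = np.zeros((1, K))
    vW, vb = np.zeros_like(W), np.zeros_like(b)
    lr, mu = 0.01, 0.9

    # Forward: affine -> softmax
    out = X @ W + b
    probs = np.exp(out - out.max(axis=1, keepdims=True))
    probs /= probs.sum(axis=1, keepdims=True)
    loss = -np.mean(np.sum(Y * np.log(probs), axis=1))

    # Backward: combined softmax + cross-entropy gradient
    dout = (probs - Y) / N
    dW, db = X.T @ dout, dout.sum(axis=0, keepdims=True)

    # SGD with Nesterov momentum
    vW_prev, vW = vW, mu * vW - lr * dW
    W = W - mu * vW_prev + (1 + mu) * vW
    vb_prev, vb = vb, mu * vb - lr * db
    b = b - mu * vb_prev + (1 + mu) * vb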
+
+predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
+ return (matrix[double] probs) {
+ /*
+ * Computes the class probability predictions of a softmax classifier.
+ *
+ * The input matrix, X, has N examples, each with D features.
+ *
+ * Inputs:
+ * - X: Input data matrix, of shape (N, D).
+ * - W: Weights (parameters) matrix, of shape (D, K).
+ * - b: Biases vector, of shape (1, K).
+ *
+ * Outputs:
+ * - probs: Class probabilities, of shape (N, K).
+ */
+ N = nrow(X) # num examples
+ K = ncol(W) # num classes
+
+ # Compute forward pass
+ ## affine & softmax:
+ out = affine::forward(X, W, b)
+ probs = softmax::forward(out)
+}
+
+eval = function(matrix[double] probs, matrix[double] Y)
+ return (double loss, double accuracy) {
+ /*
+ * Evaluates a softmax classifier.
+ *
+ * The probs matrix contains the class probability predictions
+ * of K classes over N examples. The targets, Y, have K classes,
+ * and are one-hot encoded.
+ *
+ * Inputs:
+ * - probs: Class probabilities, of shape (N, K).
+ * - Y: Target matrix, of shape (N, K).
+ *
+ * Outputs:
+ * - loss: Scalar loss, of shape (1).
+ * - accuracy: Scalar accuracy, of shape (1).
+ */
+ # Compute loss & accuracy
+ loss = cross_entropy_loss::forward(probs, Y)
+ correct_pred = rowIndexMax(probs) == rowIndexMax(Y)
+ accuracy = mean(correct_pred)
+}
+
+generate_dummy_data = function()
+ return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) {
+ /*
+ * Generate a dummy dataset similar to the breast cancer dataset.
+ *
+ * Outputs:
+ * - X: Input data matrix, of shape (N, D).
+ * - Y: Target matrix, of shape (N, K).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ */
+ # Generate dummy input data
+ N = 1024 # num examples
+ C = 3 # num input channels
+ Hin = 256 # input height
+ Win = 256 # input width
+ T = 10 # num targets
+ X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
+ classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))
+ Y = table(seq(1, N), classes, N, T) # one-hot encoding with exactly T columns
+}
+