Posted to commits@mxnet.apache.org by sk...@apache.org on 2017/12/19 01:25:45 UTC

[incubator-mxnet] branch master updated: Usability fixes for examples (#9091)

This is an automated email from the ASF dual-hosted git repository.

skm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 16bd961  Usability fixes for examples (#9091)
16bd961 is described below

commit 16bd9612ea4b3e2003e6472681160e3203526009
Author: Roshani Nagmote <ro...@gmail.com>
AuthorDate: Mon Dec 18 17:25:42 2017 -0800

    Usability fixes for examples (#9091)
    
    * Usability improvements for some examples
    
    * some more modifications
    
    * formatting fixes
    
    * shape fixed
    
    * comments added
    
    * fix
    
    * fix
    
    * comments addressed
    
    * fix
    
    * num-gpus changed to 1
---
 example/model-parallel/lstm/README.md              |  11 +-
 example/model-parallel/lstm/lstm.py                |  26 +++-
 example/model-parallel/lstm/lstm_ptb.py            |   2 +-
 .../model-parallel/matrix_factorization/README.md  |  22 +++
 .../model-parallel/matrix_factorization/model.py   |   3 +-
 .../model-parallel/matrix_factorization/readme.md  |   6 -
 .../model-parallel/matrix_factorization/train.py   |   7 +-
 example/stochastic-depth/README.md                 |  29 ++++
 example/stochastic-depth/sd_cifar10.py             |   1 -
 example/vae/README.md                              |  21 +++
 example/vae/VAE.py                                 | 147 +++++++++++----------
 example/vae/VAE_example.ipynb                      | 137 ++++++++++++-------
 12 files changed, 273 insertions(+), 139 deletions(-)

diff --git a/example/model-parallel/lstm/README.md b/example/model-parallel/lstm/README.md
index 9acea85..6f31ff8 100644
--- a/example/model-parallel/lstm/README.md
+++ b/example/model-parallel/lstm/README.md
@@ -1,4 +1,13 @@
 Model Parallel LSTM
 ===================
+
 This is an example showing how to do model parallel LSTM in MXNet.
-Most of the code is duplicated with the rnn example, and should be eventually merged.
+
+We use [the Penn Treebank dataset](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/)
+in this example. Download the dataset with the following command:
+
+`bash get_ptb_data.sh`
+
+This will download the Penn Treebank dataset into the `data` folder. Now you can run the training as follows:
+
+`python lstm_ptb.py`
diff --git a/example/model-parallel/lstm/lstm.py b/example/model-parallel/lstm/lstm.py
index c24017f..75fa533 100644
--- a/example/model-parallel/lstm/lstm.py
+++ b/example/model-parallel/lstm/lstm.py
@@ -84,7 +84,7 @@ def lstm_unroll(num_lstm_layer, seq_len, input_size,
 
     last_hidden = []
     for seqidx in range(seq_len):
-        # embeding layer
+        # embedding layer
         with mx.AttrScope(ctx_group='embed'):
             data = mx.sym.Variable("t%d_data" % seqidx)
             hidden = mx.sym.Embedding(data=data, weight=embed_weight,
@@ -121,7 +121,13 @@ def lstm_unroll(num_lstm_layer, seq_len, input_size,
                                            name="t%d_cls" % seqidx)
                 label = mx.sym.Variable("t%d_label" % seqidx)
                 if use_loss:
-                    sm = mx.sym.softmax_cross_entropy(fc, label, name="t%d_sm" % seqidx)
+                    # Currently softmax_cross_entropy fails https://github.com/apache/incubator-mxnet/issues/6874
+                    # So, workaround for now to fix this example
+                    out = mx.symbol.softmax(data=fc)
+                    label = mx.sym.Reshape(label, shape=(-1,1))
+                    ce = - mx.sym.broadcast_add(mx.sym.broadcast_mul(label, mx.sym.log(out)),
+                                              mx.sym.broadcast_mul((1 - label), mx.sym.log(1 - out)))
+                    sm = mx.sym.MakeLoss(ce,  name="t%d_sm" % seqidx)
                 else:
                     sm = mx.sym.SoftmaxOutput(data=fc, label=label, name="t%d_sm" % seqidx)
                 out_prob.append(sm)
@@ -134,7 +140,13 @@ def lstm_unroll(num_lstm_layer, seq_len, input_size,
                                        num_hidden=num_label)
             label = mx.sym.Variable("label")
             if use_loss:
-                sm = mx.sym.softmax_cross_entropy(fc, label, name="sm")
+                # Currently softmax_cross_entropy fails https://github.com/apache/incubator-mxnet/issues/6874
+                # So, workaround for now to fix this example
+                out = mx.symbol.softmax(data=fc)
+                label = mx.sym.Reshape(label, shape=(-1, 1))
+                ce = mx.sym.broadcast_add(mx.sym.broadcast_mul(label, mx.sym.log(out)),
+                                              mx.sym.broadcast_mul((1 - label), mx.sym.log(1 - out)))
+                sm = mx.sym.MakeLoss(ce,  name="sm")
             else:
                 sm = mx.sym.SoftmaxOutput(data=fc, label=label, name="sm")
             out_prob = [sm]
@@ -208,7 +220,7 @@ def setup_rnn_model(default_ctx,
             if not name.startswith("t"):
                 print("%s group=%s, ctx=%s" % (name, group, str(ctx)))
 
-        #bind with shared executor
+        # bind with shared executor
         rnn_exec = None
         if max_len == bucket_key:
               rnn_exec = rnn_sym.bind(default_ctx, args=arg_arrays,
@@ -344,7 +356,7 @@ def train_lstm(model, X_train_batch, X_val_batch,
             # update epoch counter
             epoch_counter += 1
             if epoch_counter % update_period == 0:
-                # updare parameters
+                # update parameters
                 norm = 0.
                 for idx, weight, grad, name in m.param_blocks:
                     grad /= batch_size
@@ -363,7 +375,7 @@ def train_lstm(model, X_train_batch, X_val_batch,
                 else:
                     train_nll += calc_nll(seq_label_probs, batch_size, batch_seq_length)
             else:
-                train_nll += sum([x.asscalar() for x in seq_loss]) / batch_size
+                train_nll += sum([x.sum().asscalar() for x in seq_loss]) / batch_size
 
             nbatch += batch_size
             toc = time.time()
@@ -405,7 +417,7 @@ def train_lstm(model, X_train_batch, X_val_batch,
                 else:
                     val_nll += calc_nll(seq_label_probs, batch_size, batch_seq_length)
             else:
-                val_nll += sum([x.asscalar() for x in seq_loss]) / batch_size
+                val_nll += sum([x.sum().asscalar() for x in seq_loss]) / batch_size
             nbatch += batch_size
 
         perp = np.exp(val_nll / nbatch)
diff --git a/example/model-parallel/lstm/lstm_ptb.py b/example/model-parallel/lstm/lstm_ptb.py
index 0141338..965ba19 100644
--- a/example/model-parallel/lstm/lstm_ptb.py
+++ b/example/model-parallel/lstm/lstm_ptb.py
@@ -22,7 +22,7 @@ sys.path.insert(0, "../../python")
 import mxnet as mx
 import numpy as np
 # reuse the bucket_io library
-sys.path.insert(0, "../rnn")
+sys.path.insert(0, "../../rnn/old")
 from bucket_io import BucketSentenceIter, default_build_vocab
 
 """
diff --git a/example/model-parallel/matrix_factorization/README.md b/example/model-parallel/matrix_factorization/README.md
new file mode 100644
index 0000000..00507d9
--- /dev/null
+++ b/example/model-parallel/matrix_factorization/README.md
@@ -0,0 +1,22 @@
+Model Parallel Matrix Factorization
+===================================
+
+This example walks you through a matrix factorization algorithm for recommendations and also
+demonstrates the basic usage of `group2ctxs` in `Module`, which allows one part of the model to be
+trained on CPU and the other on GPU. You therefore need at least one GPU available on the machine
+to run this example.
+
+To run this example, first make sure you download a dataset of 10 million movie ratings available
+from [the MovieLens project](http://files.grouplens.org/datasets/movielens/) by running the following command:
+
+`python get_data.py`
+
+This will download the MovieLens 10M dataset into the `ml-10M100K` folder. Now you can run the training as follows:
+
+`python train.py --num-gpus 1`
+
+You can also pass other arguments to train.py, such as num-epoch, batch-size, and
+factor-size (the output dimension of the embedding operation).
+
+During training you will see how the ctx_group attribute is used to divide the operators
+into different groups corresponding to different CPU/GPU devices.
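For orientation, a minimal sketch of how `ctx_group` and `group2ctxs` fit together is shown below; the layer names (`fc1`, `fc2`), shapes, and the single-GPU mapping are illustrative assumptions, not taken from this example:

```python
import mxnet as mx

# tag the first stage 'dev1' and the second stage 'dev2' via ctx_group
with mx.AttrScope(ctx_group='dev1'):
    data = mx.sym.Variable('data')
    fc1 = mx.sym.FullyConnected(data=data, num_hidden=64, name='fc1')

with mx.AttrScope(ctx_group='dev2'):
    fc2 = mx.sym.FullyConnected(data=fc1, num_hidden=1, name='fc2')
    net = mx.sym.LinearRegressionOutput(data=fc2, name='out')

# group2ctxs maps each ctx_group tag to concrete devices:
# 'dev1' stays on CPU, 'dev2' is placed on GPU 0
mod = mx.mod.Module(symbol=net,
                    context=mx.cpu(),
                    data_names=['data'],
                    label_names=['out_label'],
                    group2ctxs={'dev1': [mx.cpu()], 'dev2': [mx.gpu(0)]})
mod.bind(data_shapes=[('data', (32, 16))],
         label_shapes=[('out_label', (32, 1))])
mod.init_params()
```

The `model.py` and `train.py` in this example use the same mechanism with their `dev1`/`dev2` groups.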
diff --git a/example/model-parallel/matrix_factorization/model.py b/example/model-parallel/matrix_factorization/model.py
index f4004d1..16cd9b3 100644
--- a/example/model-parallel/matrix_factorization/model.py
+++ b/example/model-parallel/matrix_factorization/model.py
@@ -32,6 +32,7 @@ def matrix_fact_model_parallel_net(factor_size, num_hidden, max_user, max_item):
         item_weight = mx.symbol.Variable('item_weight')
         item = mx.symbol.Embedding(data=item, weight=item_weight,
                                    input_dim=max_item, output_dim=factor_size)
+
     # set ctx_group attribute to 'dev2' for the symbols created in this scope,
     # the symbols will be bound to the context that 'dev2' map to in group2ctxs
     with mx.AttrScope(ctx_group='dev2'):
@@ -45,7 +46,7 @@ def matrix_fact_model_parallel_net(factor_size, num_hidden, max_user, max_item):
         fc_item_weight = mx.symbol.Variable('fc_item_weight')
         fc_item_bias = mx.symbol.Variable('fc_item_bias')
         item = mx.symbol.FullyConnected(data=item, weight=fc_item_weight, bias=fc_item_bias, num_hidden=num_hidden)
-        # predict by the inner product, which is elementwise product and then sum
+        # predict by the inner product, which is element-wise product and then sum
         pred = user * item
         pred = mx.symbol.sum(data=pred, axis=1)
         pred = mx.symbol.Flatten(data=pred)
diff --git a/example/model-parallel/matrix_factorization/readme.md b/example/model-parallel/matrix_factorization/readme.md
deleted file mode 100644
index 5d724ae..0000000
--- a/example/model-parallel/matrix_factorization/readme.md
+++ /dev/null
@@ -1,6 +0,0 @@
-Model Parallel Matrix Factorization
-==============
-
-The example demonstrates the basic usage of `group2ctxs` in `Module`, which allows one part of the model trained on cpu and the other on gpu.
-
-- `python matrix_factorization_model_parallel.py --num-gpus 2`
diff --git a/example/model-parallel/matrix_factorization/train.py b/example/model-parallel/matrix_factorization/train.py
index 7a2073b..591dab3 100644
--- a/example/model-parallel/matrix_factorization/train.py
+++ b/example/model-parallel/matrix_factorization/train.py
@@ -21,7 +21,7 @@ import time
 import mxnet as mx
 import numpy as np
 from get_data import get_movielens_iter, get_movielens_data
-from matrix_fact_parallel_model import matrix_fact_model_parallel_net
+from model import matrix_fact_model_parallel_net
 
 
 logging.basicConfig(level=logging.DEBUG)
@@ -77,10 +77,13 @@ if __name__ == '__main__':
     # construct the module
     # map the ctx_group attribute to the context assignment
     group2ctxs={'dev1':[mx.cpu()]*num_gpus, 'dev2':[mx.gpu(i) for i in range(num_gpus)]}
+
+    # Creating a module by passing group2ctxs attribute which maps
+    # the ctx_group attribute to the context assignment
     mod = mx.module.Module(symbol=net, context=[mx.cpu()]*num_gpus, data_names=['user', 'item'],
         label_names=['score'], group2ctxs=group2ctxs)
     
-    # the initializer uesd to initialize the parameters
+    # the initializer used to initialize the parameters
     initializer = mx.init.Xavier(factor_type="in", magnitude=2.34)
     
     # the parameters for the optimizer constructor
diff --git a/example/stochastic-depth/README.md b/example/stochastic-depth/README.md
new file mode 100644
index 0000000..08c466e
--- /dev/null
+++ b/example/stochastic-depth/README.md
@@ -0,0 +1,29 @@
+Stochastic Depth
+================
+
+This folder contains examples showing an implementation of the stochastic depth algorithm described in the paper
+Huang, Gao, et al. ["Deep networks with stochastic depth."](https://arxiv.org/abs/1603.09382)
+arXiv preprint arXiv:1603.09382 (2016). This paper introduces a new way to perturb networks during training
+in order to improve their performance. Stochastic Depth (SD) is a method for residual networks,
+which randomly removes/deactivates residual blocks during training.
+
+The paper describes constructing the network from residual blocks, which are basically a set of
+convolution layers plus a bypass that passes the information from the previous layer through unchanged.
+With stochastic depth, the convolution block is sometimes switched off allowing the information
+to flow through the layer without being changed, effectively removing the layer from the network.
+During testing, all layers are left in and the weights are modified by their survival probability.
+This is very similar to how dropout works, except that instead of dropping a single node in a layer,
+the entire layer is dropped!
+
+The main idea behind stochastic depth is relatively simple, but the results are surprisingly good.
+The authors demonstrated the new architecture on CIFAR-10, CIFAR-100, and the Street View House Number dataset (SVHN).
+They achieve the lowest published error on CIFAR-10 and CIFAR-100, and second lowest for SVHN.
+
+Files in this example folder:
+
+- `sd_mnist.py` shows a sample implementation of the algorithm, intended only as a sanity check.
+
+- **sd_cifar10.py** runs the algorithm for 500 epochs on the CIFAR-10 dataset. After 500 epochs, ~9.4% error
+was achieved on CIFAR-10; it can be improved further with more careful hyperparameter tuning to reach
+the numbers reported in the paper.
+You can see a sample result log in the top section of the sd_cifar10.py file.
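As a conceptual sketch of the training/test behavior described above (this is not the example's actual module implementation), a residual block with stochastic depth can be written roughly as follows, with `conv_block` standing in for the block's convolutional branch:

```python
import random
import mxnet as mx

def stochastic_residual_block(x, conv_block, survival_prob, training):
    """Residual block with stochastic depth (conceptual sketch).

    x             : input NDArray
    conv_block    : callable implementing the convolutional branch (placeholder)
    survival_prob : probability of keeping the branch during training
    training      : True during training, False at test time
    """
    if training:
        if random.random() < survival_prob:
            return x + conv_block(x)   # block survives: normal residual update
        return x                       # block dropped: identity shortcut only
    # at test time every block is kept, scaled by its survival probability
    return x + survival_prob * conv_block(x)

# toy usage with a dummy branch
x = mx.nd.ones((1, 16, 8, 8))
y = stochastic_residual_block(x, conv_block=lambda t: 0.1 * t,
                              survival_prob=0.8, training=True)
```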
diff --git a/example/stochastic-depth/sd_cifar10.py b/example/stochastic-depth/sd_cifar10.py
index c123562..7eb3202 100644
--- a/example/stochastic-depth/sd_cifar10.py
+++ b/example/stochastic-depth/sd_cifar10.py
@@ -214,4 +214,3 @@ mod_seq.fit(train, val,
             num_epoch=num_epochs, batch_end_callback=batch_end_callbacks,
             epoch_end_callback=epoch_end_callbacks,
             initializer=initializer)
-
diff --git a/example/vae/README.md b/example/vae/README.md
new file mode 100644
index 0000000..c6e68d5
--- /dev/null
+++ b/example/vae/README.md
@@ -0,0 +1,21 @@
+Variational Auto Encoder (VAE)
+=============================
+
+This folder contains a tutorial which implements a Variational Auto Encoder in MXNet using the MNIST handwritten digit
+recognition dataset. The model follows the paper [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114/),
+which introduces a stochastic variational inference and learning algorithm that scales to large datasets.
+
+Prerequisites:
+To run this example, you need:
+- [Jupyter Notebook](http://jupyter.org/index.html)
+- Matplotlib
+
+Files in this folder:
+- **VAE_example.ipynb** : a Jupyter notebook which explains the concept of a VAE step by step and also shows how to use the
+MXNet-based VAE class (from VAE.py) to do the training directly.
+
+- **VAE.py** : contains the class that implements the Variational Auto Encoder; it is used in the tutorial above.
+
+In a VAE, the encoder becomes a variational inference network that maps the data to a distribution
+over the hidden variables, and the decoder becomes a generative network that maps the latent variables back to the data.
+The network architecture shown in the tutorial uses a Gaussian MLP as the encoder and a Bernoulli MLP as the decoder.
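The step that turns the encoder's output into a distribution is the reparameterization trick; a minimal symbolic sketch, with an illustrative batch size of 100 and latent dimension of 5, looks like this (VAE.py below contains the full version):

```python
import mxnet as mx

# mu and logvar are produced by the Gaussian MLP encoder
mu = mx.sym.Variable('mu')
logvar = mx.sym.Variable('logvar')

# reparameterization: z = mu + sigma * eps, with eps ~ N(0, 1)
eps = mx.sym.random_normal(loc=0, scale=1, shape=(100, 5))
z = mu + mx.sym.broadcast_mul(mx.sym.exp(0.5 * logvar), eps)
```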
diff --git a/example/vae/VAE.py b/example/vae/VAE.py
index 9de1abf..ba06733 100644
--- a/example/vae/VAE.py
+++ b/example/vae/VAE.py
@@ -21,86 +21,90 @@ import numpy as np
 import os
 import logging
 
-
 class VAE:
-    '''This class implements the Variational Auto Encoder'''
+    """This class implements the Variational Auto Encoder"""
     
     def Bernoulli(x_hat,loss_label):
-        return(-mx.symbol.sum(mx.symbol.broadcast_mul(loss_label,mx.symbol.log(x_hat)) + mx.symbol.broadcast_mul(1-loss_label,mx.symbol.log(1-x_hat)),axis=1))
-
-    
-    def __init__(self,n_latent=5,num_hidden_ecoder=400,num_hidden_decoder=400,x_train=None,x_valid=None,batch_size=100,learning_rate=0.001,weight_decay=0.01,num_epoch=100,optimizer='sgd',model_prefix=None, initializer = mx.init.Normal(0.01),likelihood=Bernoulli):
-        
-
-        self.n_latent = n_latent                            #dimension of the latent space Z
-        self.num_hidden_ecoder = num_hidden_ecoder          #number of hidden units in the encoder
-        self.num_hidden_decoder = num_hidden_decoder        #number of hidden units in the decoder
-        self.batch_size = batch_size                        #mini batch size
-        self.learning_rate = learning_rate                  #learning rate during training
-        self.weight_decay = weight_decay                    #weight decay during training, for regulariization of parameters
-        self.num_epoch = num_epoch                          #total number of training epoch
-        self.optimizer = optimizer
-
-
-
-        #train the model
-        self.model, self.training_loss = VAE.train_vae(x_train,x_valid,batch_size,n_latent,num_hidden_ecoder,num_hidden_decoder,learning_rate,weight_decay,num_epoch,optimizer,model_prefix,likelihood,initializer)
-        #save model parameters (i.e. weights and biases)
+        return(-mx.symbol.sum(mx.symbol.broadcast_mul(loss_label,mx.symbol.log(x_hat))
+                              + mx.symbol.broadcast_mul(1-loss_label,mx.symbol.log(1-x_hat)), axis=1))
+
+    def __init__(self, n_latent=5, num_hidden_ecoder=400, num_hidden_decoder=400, x_train=None, x_valid=None,
+                 batch_size=100, learning_rate=0.001, weight_decay=0.01, num_epoch=100, optimizer='sgd',
+                 model_prefix=None, initializer=mx.init.Normal(0.01), likelihood=Bernoulli):
+        self.n_latent = n_latent                      # dimension of the latent space Z
+        self.num_hidden_ecoder = num_hidden_ecoder    # number of hidden units in the encoder
+        self.num_hidden_decoder = num_hidden_decoder  # number of hidden units in the decoder
+        self.batch_size = batch_size                  # mini batch size
+        self.learning_rate = learning_rate            # learning rate during training
+        self.weight_decay = weight_decay              # weight decay during training, for regularization of parameters
+        self.num_epoch = num_epoch                    # total number of training epoch
+        self.optimizer = optimizer                    # 'sgd' optimizer by default
+
+        # train the model
+        self.model, self.training_loss = VAE.train_vae(x_train, x_valid, batch_size, n_latent, num_hidden_ecoder,
+                                                       num_hidden_decoder, learning_rate, weight_decay,
+                                                       num_epoch,optimizer, model_prefix, likelihood, initializer)
+
+        # save model parameters (i.e. weights and biases)
         self.arg_params = self.model.get_params()[0]
-        #save loss(ELBO) for the training set 
-        nd_iter = mx.io.NDArrayIter(data={'data':x_train},label={'loss_label':x_train},batch_size = batch_size)     
 
-        #if saved parameters, can access them at specific iteration e.g. last epoch using
+        # save loss(ELBO) for the training set
+        nd_iter = mx.io.NDArrayIter(data={'data':x_train}, label={'loss_label':x_train}, batch_size=batch_size)
+
+        # if saved parameters, can access them at specific iteration e.g. last epoch using
         #   sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, self.num_epoch)
         #   assert sym.tojson() == output.tojson()
-        #   self.arg_params = arg_params 
-    def train_vae(x_train,x_valid,batch_size,n_latent,num_hidden_ecoder,num_hidden_decoder,learning_rate,weight_decay,num_epoch,optimizer,model_prefix,likelihood,initializer):
-        [N,features] = np.shape(x_train)          #number of examples and features
+        #   self.arg_params = arg_params
+
+    @staticmethod
+    def train_vae(x_train, x_valid, batch_size, n_latent, num_hidden_ecoder, num_hidden_decoder, learning_rate,
+                  weight_decay, num_epoch, optimizer, model_prefix, likelihood, initializer):
+        [N,features] = np.shape(x_train)          # number of examples and features
+
+        # create data iterator to feed into NN
+        nd_iter = mx.io.NDArrayIter(data={'data':x_train}, label={'loss_label':x_train}, batch_size=batch_size)
 
-        #create data iterator to feed into NN
-        nd_iter = mx.io.NDArrayIter(data={'data':x_train},label={'loss_label':x_train},batch_size = batch_size)
         if x_valid is not None:
-            nd_iter_val = mx.io.NDArrayIter(data={'data':x_valid},label={'loss_label':x_valid},batch_size = batch_size)
+            nd_iter_val = mx.io.NDArrayIter(data={'data':x_valid}, label={'loss_label':x_valid}, batch_size=batch_size)
         else:
             nd_iter_val = None
+
         data = mx.sym.var('data')
         loss_label = mx.sym.var('loss_label')
 
+        # build network architecture
+        encoder_h = mx.sym.FullyConnected(data=data, name="encoder_h", num_hidden=num_hidden_ecoder)
+        act_h = mx.sym.Activation(data=encoder_h, act_type="tanh", name="activation_h")
 
-        #build network architucture
-        encoder_h  = mx.sym.FullyConnected(data=data, name="encoder_h",num_hidden=num_hidden_ecoder)
-        act_h = mx.sym.Activation(data=encoder_h, act_type="tanh",name="activation_h")
+        mu = mx.sym.FullyConnected(data=act_h, name="mu", num_hidden=n_latent)
+        logvar = mx.sym.FullyConnected(data=act_h, name="logvar", num_hidden=n_latent)
 
-        
-        mu  = mx.sym.FullyConnected(data=act_h, name="mu",num_hidden = n_latent)
-        logvar  = mx.sym.FullyConnected(data=act_h, name="logvar",num_hidden = n_latent)
-        #latent manifold
-        z = mu + mx.symbol.broadcast_mul(mx.symbol.exp(0.5*logvar),mx.symbol.random_normal(loc=0, scale=1,shape=(batch_size,n_latent))) 
-        decoder_z = mx.sym.FullyConnected(data=z, name="decoder_z",num_hidden=num_hidden_decoder)
-        act_z = mx.sym.Activation(data=decoder_z, act_type="tanh",name="actication_z")
+        # latent manifold
+        z = mu + mx.symbol.broadcast_mul(mx.symbol.exp(0.5*logvar),
+                                         mx.symbol.random_normal(loc=0, scale=1, shape=(batch_size, n_latent)))
+        decoder_z = mx.sym.FullyConnected(data=z, name="decoder_z", num_hidden=num_hidden_decoder)
+        act_z = mx.sym.Activation(data=decoder_z, act_type="tanh", name="actication_z")
 
-        decoder_x = mx.sym.FullyConnected(data=act_z, name="decoder_x",num_hidden=features)
-        act_x = mx.sym.Activation(data=decoder_x, act_type="sigmoid",name='activation_x')
+        decoder_x = mx.sym.FullyConnected(data=act_z, name="decoder_x", num_hidden=features)
+        act_x = mx.sym.Activation(data=decoder_x, act_type="sigmoid", name='activation_x')
 
-        KL = -0.5*mx.symbol.sum(1+logvar-pow( mu,2)-mx.symbol.exp(logvar),axis=1)
+        KL = -0.5 * mx.symbol.sum(1+logvar-pow(mu,2)-mx.symbol.exp(logvar), axis=1)
 
-        #compute minus ELBO to minimize 
-        loss = likelihood(act_x,loss_label)+KL
-        output = mx.symbol.MakeLoss(sum(loss),name='loss')
+        # compute minus ELBO to minimize
+        loss = likelihood(act_x, loss_label)+KL
+        output = mx.symbol.MakeLoss(sum(loss), name='loss')
 
-        #train the model
+        # train the model
         nd_iter.reset()
         logging.getLogger().setLevel(logging.DEBUG)  # logging to stdout
 
         model = mx.mod.Module(
-            symbol = output ,
+            symbol=output ,
             data_names=['data'],
-            label_names = ['loss_label'])
-
-             #initialize the weights and bias 
-
+            label_names=['loss_label'])
 
         training_loss = list()
+
         def log_to_list(period, lst):
                 def _callback(param):
                         """The checkpoint function."""
@@ -110,37 +114,40 @@ class VAE:
                 return _callback
 
         model.fit(nd_iter,  # train data
-                    initializer = initializer,
-                    eval_data = nd_iter_val,
-                    optimizer = optimizer,  # use SGD to train
-                    optimizer_params = {'learning_rate':learning_rate,'wd':weight_decay},  
-                    epoch_end_callback  = None if model_prefix==None else mx.callback.do_checkpoint(model_prefix, 1),   #save parameters for each epoch if model_prefix is supplied
-                    batch_end_callback = log_to_list(int(N/batch_size),training_loss),  #this can save the training loss
-                    num_epoch = num_epoch,
-                    eval_metric = 'Loss')
+                  initializer=initializer, # initialize the weights and bias
+                  eval_data=nd_iter_val,
+                  optimizer=optimizer,  # use SGD to train
+                  optimizer_params={'learning_rate':learning_rate, 'wd':weight_decay},
+                  # save parameters for each epoch if model_prefix is supplied
+                  epoch_end_callback=None if model_prefix==None else mx.callback.do_checkpoint(model_prefix, 1),
+                  batch_end_callback=log_to_list(int(N/batch_size), training_loss),  # this can save the training loss
+                  num_epoch=num_epoch,
+                  eval_metric='Loss')
 
         return model,training_loss
 
-
-    def encoder(model,x):
+    @staticmethod
+    def encoder(model, x):
         params = model.arg_params
         encoder_n = np.shape(params['encoder_h_bias'].asnumpy())[0]
-        encoder_h = np.dot(params['encoder_h_weight'].asnumpy(),np.transpose(x)) + np.reshape(params['encoder_h_bias'].asnumpy(),(encoder_n,1))
+        encoder_h = np.dot(params['encoder_h_weight'].asnumpy(), np.transpose(x)) \
+                    + np.reshape(params['encoder_h_bias'].asnumpy(), (encoder_n,1))
         act_h = np.tanh(encoder_h)
         mu = np.transpose(np.dot(params['mu_weight'].asnumpy(),act_h)) + params['mu_bias'].asnumpy()
         logvar = np.transpose(np.dot(params['logvar_weight'].asnumpy(),act_h)) + params['logvar_bias'].asnumpy()
         return mu,logvar
 
-    def sampler(mu,logvar):
-        z = mu + np.multiply(np.exp(0.5*logvar),np.random.normal(loc=0, scale=1,size=np.shape(logvar))) 
+    @staticmethod
+    def sampler(mu, logvar):
+        z = mu + np.multiply(np.exp(0.5*logvar), np.random.normal(loc=0, scale=1,size=np.shape(logvar)))
         return z
 
-
-
-    def decoder(model,z):
+    @staticmethod
+    def decoder(model, z):
         params = model.arg_params
         decoder_n = np.shape(params['decoder_z_bias'].asnumpy())[0]
-        decoder_z = np.dot(params['decoder_z_weight'].asnumpy(),np.transpose(z)) + np.reshape(params['decoder_z_bias'].asnumpy(),(decoder_n,1))
+        decoder_z = np.dot(params['decoder_z_weight'].asnumpy(),np.transpose(z)) \
+                    + np.reshape(params['decoder_z_bias'].asnumpy(),(decoder_n,1))
         act_z = np.tanh(decoder_z)
         decoder_x = np.transpose(np.dot(params['decoder_x_weight'].asnumpy(),act_z)) + params['decoder_x_bias'].asnumpy()
         reconstructed_x = 1/(1+np.exp(-decoder_x))
diff --git a/example/vae/VAE_example.ipynb b/example/vae/VAE_example.ipynb
old mode 100644
new mode 100755
index c29348a..e7ec03a
--- a/example/vae/VAE_example.ipynb
+++ b/example/vae/VAE_example.ipynb
@@ -24,11 +24,28 @@
     "\n",
     "#### Xiaoyu Lu,  July 5th, 2017\n",
     "\n",
-    "This tutorial guides you through the process of building a variational encoder in MXNet. in this notebook we'll focus on an example unsing the MNIST handwritten digit recognition dataset. Refer to [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114/) for more details on the model description.\n",
+    "This tutorial guides you through the process of building a variational encoder in MXNet. In this notebook we'll focus on an example using the MNIST handwritten digit recognition dataset. Refer to [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114/) for more details on the model description.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prerequisites\n",
+    "\n",
+    "To complete this tutorial, we need following python packages:\n",
     "\n",
+    "- numpy, matplotlib "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "## 1. Loading the Data\n",
     "\n",
-    "We first load the MNIST dataset, which contains 60000 trainings and 10000 test examples. The following code import required modules and load the data. These images are stored in a 4-D matrix with shape (`batch_size, num_channels, width, height`). For the MNIST dataset, there is only one color channel, and both width and height are 28, so we reshape each image as a 28x28 array. See below for a visualization.\n"
+    "We first load the MNIST dataset, which contains 60000 training and 10000 test examples. The following code imports required modules and loads the data. These images are stored in a 4-D matrix with shape (`batch_size, num_channels, width, height`). For the MNIST dataset, there is only one color channel, and both width and height are 28, so we reshape each image as a 28x28 array. See below for a visualization:\n"
    ]
   },
   {
@@ -50,7 +67,9 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -103,7 +122,7 @@
     "## 2.  Building the Network Architecture\n",
     "\n",
     "### 2.1 Gaussian MLP as encoder\n",
-    "Next we constuct the neural network, as in the paper, we use *Multilayer Perceptron (MLP)* for both the encoder and decoder. For encoder, a Gaussian MLP is used:\n",
+    "Next we constuct the neural network, as in the [paper](https://arxiv.org/abs/1312.6114/), we use *Multilayer Perceptron (MLP)* for both the encoder and decoder. For encoder, a Gaussian MLP is used as follows:\n",
     "\n",
     "\\begin{align}\n",
     "\\log q_{\\phi}(z|x) &= \\log \\mathcal{N}(z:\\mu,\\sigma^2I) \\\\\n",
@@ -112,7 +131,7 @@
     "\\end{align}\n",
     "\n",
     "where $\\{W_1,W_2,W_3,b_1,b_2,b_3\\}$ are the weights and biases of the MLP.\n",
-    "Note below that `encoder_mu` and `encoder_logvar` are symbols, can use `get_internals()` to get the values of them, after which we can sample the latent variable $z$.\n",
+    "Note below that `encoder_mu`(`mu`) and `encoder_logvar`(`logvar`) are symbols. So, we can use `get_internals()` to get the values of them, after which we can sample the latent variable $z$.\n",
     "\n",
     "\n",
     "\n"
@@ -139,7 +158,8 @@
     "logvar  = mx.sym.FullyConnected(data=act_h, name=\"logvar\",num_hidden = 5)\n",
     "\n",
     "## sample the latent variables z according to Normal(mu,var)\n",
-    "z = mu + np.multiply(mx.symbol.exp(0.5*logvar),mx.symbol.random_normal(loc=0, scale=1,shape=np.shape(logvar.get_internals()[\"logvar_output\"])))"
+    "z = mu + np.multiply(mx.symbol.exp(0.5 * logvar), \n",
+    "                     mx.symbol.random_normal(loc=0, scale=1, shape=np.shape(logvar.get_internals()[\"logvar_output\"])))"
    ]
   },
   {
@@ -181,13 +201,13 @@
    "source": [
     "### 2.3 Joint Loss Function for the Encoder and the Decoder\n",
     "\n",
-    "The variational lower bound can be estimated as:\n",
+    "The variational lower bound also called evidence lower bound (ELBO) can be estimated as:\n",
     "\n",
     "\\begin{align}\n",
     "\\mathcal{L}(\\theta,\\phi;x_{(i)}) \\approx \\frac{1}{2}\\left(1+\\log ((\\sigma_j^{(i)})^2)-(\\mu_j^{(i)})^2-(\\sigma_j^{(i)})^2\\right) + \\log p_\\theta(x^{(i)}|z^{(i)})\n",
     "\\end{align}\n",
     "\n",
-    "where the first term is the KL divergence of the approximate posterior from the prior, and the second term is an expected negative reconstruction error. We would like to maximize this lower bound, so we can define the loss to be $-\\mathcal{L}$ for MXNet to minimize."
+    "where the first term is the KL divergence of the approximate posterior from the prior, and the second term is an expected negative reconstruction error. We would like to maximize this lower bound, so we can define the loss to be $-\\mathcal{L}$(minus ELBO) for MXNet to minimize."
    ]
   },
   {
@@ -200,7 +220,8 @@
    "source": [
     "# define the objective loss function that needs to be minimized\n",
     "KL = 0.5*mx.symbol.sum(1+logvar-pow( mu,2)-mx.symbol.exp(logvar),axis=1)\n",
-    "loss = -mx.symbol.sum(mx.symbol.broadcast_mul(loss_label,mx.symbol.log(y)) + mx.symbol.broadcast_mul(1-loss_label,mx.symbol.log(1-y)),axis=1)-KL\n",
+    "loss = -mx.symbol.sum(mx.symbol.broadcast_mul(loss_label,mx.symbol.log(y)) \n",
+    "                      + mx.symbol.broadcast_mul(1-loss_label,mx.symbol.log(1-y)),axis=1)-KL\n",
     "output = mx.symbol.MakeLoss(sum(loss),name='loss')"
    ]
   },
@@ -209,7 +230,10 @@
    "metadata": {},
    "source": [
     "## 3. Training the model\n",
-    "Now we can define the model and train it, we initilize the weights and the biases to be Gaussian(0,0.01), and use stochastic gradient descent for optimization. To warm start the training, one may initilize with pre-trainined parameters `arg_params` using `init=mx.initializer.Load(arg_params)`. To save intermediate results, we can optionally use `epoch_end_callback  = mx.callback.do_checkpoint(model_prefix, 1)` which saves the parameters to the path given by model_prefix, and with pe [...]
+    "\n",
+    "Now, we can define the model and train it. First we will initilize the weights and the biases to be Gaussian(0,0.01), and then use stochastic gradient descent for optimization. To warm start the training, one may also initilize with pre-trainined parameters `arg_params` using `init=mx.initializer.Load(arg_params)`. \n",
+    "\n",
+    "To save intermediate results, we can optionally use `epoch_end_callback = mx.callback.do_checkpoint(model_prefix, 1)` which saves the parameters to the path given by model_prefix, and with period every $1$ epoch. To assess the performance, we output $-\\mathcal{L}$(minus ELBO) after each epoch, with the command `eval_metric = 'Loss'` which is defined above. We will also plot the training loss for mini batches by accessing the log and saving it to a list, and then parsing it to the a [...]
    ]
   },
   {
@@ -224,7 +248,7 @@
     "nd_iter.reset()\n",
     "logging.getLogger().setLevel(logging.DEBUG)  \n",
     "\n",
-    "#define function to trave back training loss\n",
+    "# define function to trave back training loss\n",
     "def log_to_list(period, lst):\n",
     "    def _callback(param):\n",
     "        \"\"\"The checkpoint function.\"\"\"\n",
@@ -243,7 +267,9 @@
   {
    "cell_type": "code",
    "execution_count": 9,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stderr",
@@ -459,21 +485,24 @@
     "# initilize the parameters for training using Normal.\n",
     "init = mx.init.Normal(0.01)\n",
     "model.fit(nd_iter,  # train data\n",
-    "              initializer=init,\n",
-    "              #if eval_data is supplied, test loss will also be reported\n",
-    "              #eval_data = nd_iter_test,\n",
-    "              optimizer='sgd',  # use SGD to train\n",
-    "              optimizer_params={'learning_rate':1e-3,'wd':1e-2},  \n",
-    "              epoch_end_callback  = None if model_prefix==None else mx.callback.do_checkpoint(model_prefix, 1),   #save parameters for each epoch if model_prefix is supplied\n",
-    "              batch_end_callback = log_to_list(N/batch_size,training_loss), \n",
-    "              num_epoch=100,\n",
-    "              eval_metric = 'Loss')"
+    "          initializer=init,\n",
+    "          # if eval_data is supplied, test loss will also be reported\n",
+    "          # eval_data = nd_iter_test,\n",
+    "          optimizer='sgd',  # use SGD to train\n",
+    "          optimizer_params={'learning_rate':1e-3,'wd':1e-2},  \n",
+    "          # save parameters for each epoch if model_prefix is supplied\n",
+    "          epoch_end_callback = None if model_prefix==None else mx.callback.do_checkpoint(model_prefix, 1),\n",
+    "          batch_end_callback = log_to_list(N/batch_size,training_loss), \n",
+    "          num_epoch=100,\n",
+    "          eval_metric = 'Loss')"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 23,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -497,7 +526,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "As expected, the ELBO is monotonically increasing over epoch, and we reproduced the resutls given in the paper [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114/). Now we can extract/load the parameters and then feed the network forward to calculate $y$ which is the reconstructed image, and we can also calculate the ELBO for the test set. "
+    "As expected, the ELBO is monotonically increasing over epoch, and we reproduced the results given in the paper [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114/). Now we can extract/load the parameters and then feed the network forward to calculate $y$ which is the reconstructed image, and we can also calculate the ELBO for the test set. "
    ]
   },
   {
@@ -510,9 +539,9 @@
    "source": [
     "arg_params = model.get_params()[0]\n",
     "\n",
-    "#if saved the parameters, can load them at e.g. 100th epoch\n",
-    "#sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, 100)\n",
-    "#assert sym.tojson() == output.tojson()\n",
+    "# if saved the parameters, can load them using `load_checkpoint` method at e.g. 100th epoch\n",
+    "# sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, 100)\n",
+    "# assert sym.tojson() == output.tojson()\n",
     "\n",
     "e = y.bind(mx.cpu(), {'data': nd_iter_test.data[0][1],\n",
     "                     'encoder_h_weight': arg_params['encoder_h_weight'],\n",
@@ -535,6 +564,7 @@
    "cell_type": "code",
    "execution_count": 78,
    "metadata": {
+    "collapsed": false,
     "scrolled": true
    },
    "outputs": [
@@ -566,7 +596,9 @@
   {
    "cell_type": "code",
    "execution_count": 37,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -580,7 +612,7 @@
     }
    ],
    "source": [
-    "#calculate the ELBO which is minus the loss for test set\n",
+    "# calculate the ELBO which is minus the loss for test set\n",
     "metric = mx.metric.Loss()\n",
     "model.score(nd_iter_test, metric)"
    ]
@@ -607,14 +639,21 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "One can directly call the class `VAE` to do the training. The outputs are the learned model and training loss.\n",
-    "```VAE(n_latent=5,num_hidden_ecoder=400,num_hidden_decoder=400,x_train=None,x_valid=None,batch_size=100,learning_rate=0.001,weight_decay=0.01,num_epoch=100,optimizer='sgd',model_prefix=None, initializer = mx.init.Normal(0.01),likelihood=Bernoulli)```"
+    "One can directly call the class `VAE` to do the training:\n",
+    "\n",
+    "```VAE(n_latent=5,num_hidden_ecoder=400,num_hidden_decoder=400,x_train=None,x_valid=None,\n",
+    "batch_size=100,learning_rate=0.001,weight_decay=0.01,num_epoch=100,optimizer='sgd',model_prefix=None,\n",
+    "initializer = mx.init.Normal(0.01),likelihood=Bernoulli)```\n",
+    "\n",
+    "The outputs are the learned model and training loss."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 8,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stderr",
@@ -830,13 +869,7 @@
       "INFO:root:Epoch[103] Time cost=10.267\n",
       "INFO:root:Epoch[104] Train-loss=168.181174\n",
       "INFO:root:Epoch[104] Time cost=11.132\n",
-      "INFO:root:Epoch[105] Train-loss=168.021498\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
+      "INFO:root:Epoch[105] Train-loss=168.021498\n",
       "INFO:root:Epoch[105] Time cost=10.187\n",
       "INFO:root:Epoch[106] Train-loss=167.858251\n",
       "INFO:root:Epoch[106] Time cost=10.676\n",
@@ -1030,11 +1063,11 @@
     }
    ],
    "source": [
-    "# can initilize weights and biases with the learned parameters \n",
-    "#init = mx.initializer.Load(params)\n",
+    "# can initilize weights and biases with the learned parameters as follows: \n",
+    "# init = mx.initializer.Load(params)\n",
     "\n",
-    "# call the VAE , output model contains the learned model and training loss\n",
-    "out = VAE(n_latent=2,x_train=image,x_valid=None,num_epoch=200) "
+    "# call the VAE, output model contains the learned model and training loss\n",
+    "out = VAE(n_latent=2, x_train=image, x_valid=None, num_epoch=200) "
    ]
   },
   {
@@ -1047,7 +1080,7 @@
    "source": [
     "# encode test images to obtain mu and logvar which are used for sampling\n",
     "[mu,logvar] = VAE.encoder(out,image_test)\n",
-    "#sample in the latent space\n",
+    "# sample in the latent space\n",
     "z = VAE.sampler(mu,logvar)\n",
     "# decode from the latent space to obtain reconstructed images\n",
     "x_construction = VAE.decoder(out,z)\n"
@@ -1056,7 +1089,9 @@
   {
    "cell_type": "code",
    "execution_count": 13,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -1085,7 +1120,9 @@
   {
    "cell_type": "code",
    "execution_count": 78,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -1145,21 +1182,21 @@
  "metadata": {
   "anaconda-cloud": {},
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python [Root]",
    "language": "python",
-   "name": "python3"
+   "name": "Python [Root]"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 3
+    "version": 2
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.1"
+   "pygments_lexer": "ipython2",
+   "version": "2.7.12"
   }
  },
  "nbformat": 4,
