You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/11/08 17:53:35 UTC

[GitHub] indhub closed pull request #13068: Updates to several examples

indhub closed pull request #13068: Updates to several examples
URL: https://github.com/apache/incubator-mxnet/pull/13068
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/example/reinforcement-learning/ddpg/README.md b/example/reinforcement-learning/ddpg/README.md
index 37f42a8292c..2e299dd5daa 100644
--- a/example/reinforcement-learning/ddpg/README.md
+++ b/example/reinforcement-learning/ddpg/README.md
@@ -1,6 +1,8 @@
 # mx-DDPG
 MXNet Implementation of DDPG
 
+## /!\ This example depends on RLLAB which is deprecated /!\
+
 # Introduction
 
 This is the MXNet implementation of [DDPG](https://arxiv.org/abs/1509.02971). It has been tested in the rllab cart pole environment against rllab's native implementation and achieves comparable results. With minor modifications, you can substitute it anywhere you use rllab's DDPG.
diff --git a/example/reinforcement-learning/dqn/setup.sh b/example/reinforcement-learning/dqn/setup.sh
index 3069fef62ec..012ff8fb1c0 100755
--- a/example/reinforcement-learning/dqn/setup.sh
+++ b/example/reinforcement-learning/dqn/setup.sh
@@ -26,11 +26,11 @@ pip install pygame
 
 # Install arcade learning environment
 if [[ "$OSTYPE" == "linux-gnu" ]]; then
-    sudo apt-get install libsdl1.2-dev libsdl-gfx1.2-dev libsdl-image1.2-dev cmake
+    sudo apt-get install libsdl1.2-dev libsdl-gfx1.2-dev libsdl-image1.2-dev cmake ninja-build
 elif [[ "$OSTYPE" == "darwin"* ]]; then
     brew install sdl sdl_image sdl_mixer sdl_ttf portmidi
 fi
-git clone git@github.com:mgbellemare/Arcade-Learning-Environment.git || true
+git clone https://github.com/mgbellemare/Arcade-Learning-Environment || true
 pushd .
 cd Arcade-Learning-Environment
 mkdir -p build
@@ -43,6 +43,6 @@ popd
 cp Arcade-Learning-Environment/ale.cfg .
 
 # Copy roms
-git clone git@github.com:npow/atari.git || true
+git clone https://github.com/npow/atari || true
 cp -R atari/roms .
 
diff --git a/example/restricted-boltzmann-machine/README.md b/example/restricted-boltzmann-machine/README.md
index 129120ba996..a8769a51e05 100644
--- a/example/restricted-boltzmann-machine/README.md
+++ b/example/restricted-boltzmann-machine/README.md
@@ -8,6 +8,58 @@ Here are some samples generated by the RBM with the default hyperparameters. The
 
 <p style="text-align:center"><img src="samples.png"/></p>
 
+Usage:
+
+```
+python binary_rbm_gluon.py --help
+usage: binary_rbm_gluon.py [-h] [--num-hidden NUM_HIDDEN] [--k K]
+                           [--batch-size BATCH_SIZE] [--num-epoch NUM_EPOCH]
+                           [--learning-rate LEARNING_RATE]
+                           [--momentum MOMENTUM]
+                           [--ais-batch-size AIS_BATCH_SIZE]
+                           [--ais-num-batch AIS_NUM_BATCH]
+                           [--ais-intermediate-steps AIS_INTERMEDIATE_STEPS]
+                           [--ais-burn-in-steps AIS_BURN_IN_STEPS] [--cuda]
+                           [--no-cuda] [--device-id DEVICE_ID]
+                           [--data-loader-num-worker DATA_LOADER_NUM_WORKER]
+
+Restricted Boltzmann machine learning MNIST
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --num-hidden NUM_HIDDEN
+                        number of hidden units
+  --k K                 number of Gibbs sampling steps used in the PCD
+                        algorithm
+  --batch-size BATCH_SIZE
+                        batch size
+  --num-epoch NUM_EPOCH
+                        number of epochs
+  --learning-rate LEARNING_RATE
+                        learning rate for stochastic gradient descent
+  --momentum MOMENTUM   momentum for the stochastic gradient descent
+  --ais-batch-size AIS_BATCH_SIZE
+                        batch size for AIS to estimate the log-likelihood
+  --ais-num-batch AIS_NUM_BATCH
+                        number of batches for AIS to estimate the log-
+                        likelihood
+  --ais-intermediate-steps AIS_INTERMEDIATE_STEPS
+                        number of intermediate distributions for AIS to
+                        estimate the log-likelihood
+  --ais-burn-in-steps AIS_BURN_IN_STEPS
+                        number of burn in steps for each intermediate
+                        distributions of AIS to estimate the log-likelihood
+  --cuda                train on GPU with CUDA
+  --no-cuda             train on CPU
+  --device-id DEVICE_ID
+                        GPU device id
+  --data-loader-num-worker DATA_LOADER_NUM_WORKER
+                        number of multithreading workers for the data loader
+```
+Default:
+```
+Namespace(ais_batch_size=100, ais_burn_in_steps=10, ais_intermediate_steps=10, ais_num_batch=10, batch_size=80, cuda=True, data_loader_num_worker=4, device_id=0, k=30, learning_rate=0.1, momentum=0.3, num_epoch=130, num_hidden=500)
+```
 [1] G E Hinton &amp; R R Salakhutdinov, Reducing the Dimensionality of Data with Neural Networks Science **313**, 5786 (2006)<br/>
 [2] R M Neal, Annealed importance sampling. Stat Comput **11** 2 (2001)<br/>
 [3] R Salakhutdinov &amp; I Murray, On the quantitative analysis of deep belief networks. In Proc. ICML '08 **25** (2008)
\ No newline at end of file
diff --git a/example/rnn-time-major/bucket_io.py b/example/rnn-time-major/bucket_io.py
deleted file mode 100644
index e689ff11267..00000000000
--- a/example/rnn-time-major/bucket_io.py
+++ /dev/null
@@ -1,264 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=C0111,too-many-arguments,too-many-instance-attributes,too-many-locals,redefined-outer-name,fixme
-# pylint: disable=superfluous-parens, no-member, invalid-name
-
-from __future__ import print_function
-import numpy as np
-import mxnet as mx
-
-# The interface of a data iter that works for bucketing
-#
-# DataIter
-#   - default_bucket_key: the bucket key for the default symbol.
-#
-# DataBatch
-#   - provide_data: same as DataIter, but specific to this batch
-#   - provide_label: same as DataIter, but specific to this batch
-#   - bucket_key: the key for the bucket that should be used for this batch
-
-def default_read_content(path):
-    with open(path) as ins:
-        content = ins.read()
-        content = content.replace('\n', ' <eos> ').replace('. ', ' <eos> ')
-        return content
-
-def default_build_vocab(path):
-    content = default_read_content(path)
-    content = content.split(' ')
-    idx = 1 # 0 is left for zero-padding
-    the_vocab = {}
-    the_vocab[' '] = 0 # put a dummy element here so that len(vocab) is correct
-    for word in content:
-        if len(word) == 0:
-            continue
-        if not word in the_vocab:
-            the_vocab[word] = idx
-            idx += 1
-    return the_vocab
-
-def default_text2id(sentence, the_vocab):
-    words = sentence.split(' ')
-    words = [the_vocab[w] for w in words if len(w) > 0]
-    return words
-
-def default_gen_buckets(sentences, batch_size, the_vocab):
-    len_dict = {}
-    max_len = -1
-    for sentence in sentences:
-        words = default_text2id(sentence, the_vocab)
-        if len(words) == 0:
-            continue
-        if len(words) > max_len:
-            max_len = len(words)
-        if len(words) in len_dict:
-            len_dict[len(words)] += 1
-        else:
-            len_dict[len(words)] = 1
-    print(len_dict)
-
-    tl = 0
-    buckets = []
-    for l, n in len_dict.items(): # TODO: There are better heuristic ways to do this
-        if n + tl >= batch_size:
-            buckets.append(l)
-            tl = 0
-        else:
-            tl += n
-    if tl > 0:
-        buckets.append(max_len)
-    return buckets
-
-class SimpleBatch(object):
-    def __init__(self, data_names, data, data_layouts, label_names, label, label_layouts, bucket_key):
-        self.data = data
-        self.label = label
-        self.data_names = data_names
-        self.label_names = label_names
-        self.data_layouts = data_layouts
-        self.label_layouts = label_layouts
-        self.bucket_key = bucket_key
-
-        self.pad = 0
-        self.index = None # TODO: what is index?
-
-    @property
-    def provide_data(self):
-        return [mx.io.DataDesc(n, x.shape, layout=l) for n, x, l in zip(self.data_names, self.data, self.data_layouts)]
-
-    @property
-    def provide_label(self):
-        return [mx.io.DataDesc(n, x.shape, layout=l) for n, x, l in zip(self.label_names, self.label, self.label_layouts)]
-
-class DummyIter(mx.io.DataIter):
-    "A dummy iterator that always return the same batch, used for speed testing"
-    def __init__(self, real_iter):
-        super(DummyIter, self).__init__()
-        self.real_iter = real_iter
-        self.provide_data = real_iter.provide_data
-        self.provide_label = real_iter.provide_label
-        self.batch_size = real_iter.batch_size
-
-        for batch in real_iter:
-            self.the_batch = batch
-            break
-
-    def __iter__(self):
-        return self
-
-    def next(self):
-        return self.the_batch
-
-class BucketSentenceIter(mx.io.DataIter):
-    def __init__(self, path, vocab, buckets, batch_size,
-                 init_states, data_name='data', label_name='label',
-                 seperate_char=' <eos> ', text2id=None, read_content=None,
-                 time_major=True):
-        super(BucketSentenceIter, self).__init__()
-
-        if text2id is None:
-            self.text2id = default_text2id
-        else:
-            self.text2id = text2id
-        if read_content is None:
-            self.read_content = default_read_content
-        else:
-            self.read_content = read_content
-        content = self.read_content(path)
-        sentences = content.split(seperate_char)
-
-        if len(buckets) == 0:
-            buckets = default_gen_buckets(sentences, batch_size, vocab)
-
-        self.vocab_size = len(vocab)
-        self.data_name = data_name
-        self.label_name = label_name
-        self.time_major = time_major
-
-        buckets.sort()
-        self.buckets = buckets
-        self.data = [[] for _ in buckets]
-
-        # pre-allocate with the largest bucket for better memory sharing
-        self.default_bucket_key = max(buckets)
-
-        for sentence in sentences:
-            sentence = self.text2id(sentence, vocab)
-            if len(sentence) == 0:
-                continue
-            for i, bkt in enumerate(buckets):
-                if bkt >= len(sentence):
-                    self.data[i].append(sentence)
-                    break
-            # we just ignore the sentence it is longer than the maximum
-            # bucket size here
-
-        # convert data into ndarrays for better speed during training
-        data = [np.zeros((len(x), buckets[i])) for i, x in enumerate(self.data)]
-        for i_bucket in range(len(self.buckets)):
-            for j in range(len(self.data[i_bucket])):
-                sentence = self.data[i_bucket][j]
-                data[i_bucket][j, :len(sentence)] = sentence
-        self.data = data
-
-        # Get the size of each bucket, so that we could sample
-        # uniformly from the bucket
-        bucket_sizes = [len(x) for x in self.data]
-
-        print("Summary of dataset ==================")
-        for bkt, size in zip(buckets, bucket_sizes):
-            print("bucket of len %3d : %d samples" % (bkt, size))
-
-        self.batch_size = batch_size
-        self.make_data_iter_plan()
-
-        self.init_states = init_states
-        self.init_state_arrays = [mx.nd.zeros(x[1]) for x in init_states]
-
-        if self.time_major:
-            self.provide_data = [mx.io.DataDesc('data', (self.default_bucket_key, batch_size), layout='TN')] + init_states
-            self.provide_label = [mx.io.DataDesc('softmax_label', (self.default_bucket_key, batch_size), layout='TN')]
-        else:
-            self.provide_data = [('data', (batch_size, self.default_bucket_key))] + init_states
-            self.provide_label = [('softmax_label', (self.batch_size, self.default_bucket_key))]
-
-    def make_data_iter_plan(self):
-        "make a random data iteration plan"
-        # truncate each bucket into multiple of batch-size
-        bucket_n_batches = []
-        for i in range(len(self.data)):
-            bucket_n_batches.append(len(self.data[i]) / self.batch_size)
-            self.data[i] = self.data[i][:int(bucket_n_batches[i]*self.batch_size)]
-
-        bucket_plan = np.hstack([np.zeros(int(n), int)+i for i, n in enumerate(bucket_n_batches)])
-        np.random.shuffle(bucket_plan)
-
-        bucket_idx_all = [np.random.permutation(len(x)) for x in self.data]
-
-        self.bucket_plan = bucket_plan
-        self.bucket_idx_all = bucket_idx_all
-        self.bucket_curr_idx = [0 for x in self.data]
-
-        self.data_buffer = []
-        self.label_buffer = []
-        for i_bucket in range(len(self.data)):
-            if self.time_major:
-                data = np.zeros((self.buckets[i_bucket], self.batch_size))
-                label = np.zeros((self.buckets[i_bucket], self.batch_size))
-            else:
-                data = np.zeros((self.batch_size, self.buckets[i_bucket]))
-                label = np.zeros((self.batch_size, self.buckets[i_bucket]))
-
-            self.data_buffer.append(data)
-            self.label_buffer.append(label)
-
-    def __iter__(self):
-        for i_bucket in self.bucket_plan:
-            data = self.data_buffer[i_bucket]
-            i_idx = self.bucket_curr_idx[i_bucket]
-            idx = self.bucket_idx_all[i_bucket][i_idx:i_idx+self.batch_size]
-            self.bucket_curr_idx[i_bucket] += self.batch_size
-
-            init_state_names = [x[0] for x in self.init_states]
-
-            if self.time_major:
-                data[:] = self.data[i_bucket][idx].T
-            else:
-                data[:] = self.data[i_bucket][idx]
-
-            label = self.label_buffer[i_bucket]
-            if self.time_major:
-                label[:-1, :] = data[1:, :]
-                label[-1, :] = 0
-            else:
-                label[:, :-1] = data[:, 1:]
-                label[:, -1] = 0
-
-            data_all = [mx.nd.array(data)] + self.init_state_arrays
-            label_all = [mx.nd.array(label)]
-            data_names = ['data'] + init_state_names
-            label_names = ['softmax_label']
-
-            data_batch = SimpleBatch(data_names, data_all, [x.layout for x in self.provide_data],
-                                     label_names, label_all, [x.layout for x in self.provide_label],
-                                     self.buckets[i_bucket])
-            yield data_batch
-
-
-    def reset(self):
-        self.bucket_curr_idx = [0 for x in self.data]
diff --git a/example/rnn-time-major/get_sherlockholmes_data.sh b/example/rnn-time-major/get_sherlockholmes_data.sh
deleted file mode 100755
index 43c8669e003..00000000000
--- a/example/rnn-time-major/get_sherlockholmes_data.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/env bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-echo
-echo "NOTE: To continue, you need to review the licensing of the data sets used by this script"
-echo "See https://www.gutenberg.org/wiki/Gutenberg:The_Project_Gutenberg_License for the licensing"
-read -p "Please confirm you have reviewed the licensing [Y/n]:" -n 1 -r
-echo
-
-if [ $REPLY != "Y" ]
-then
-    echo "License was not reviewed, aborting script."
-    exit 1
-fi
-
-RNN_DIR=$(cd `dirname $0`; pwd)
-DATA_DIR="${RNN_DIR}/data/"
-
-if [[ ! -d "${DATA_DIR}" ]]; then
-  echo "${DATA_DIR} doesn't exist, will create one";
-  mkdir -p ${DATA_DIR}
-fi
-
-wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.train.txt;
-wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.valid.txt;
-wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.test.txt;
-wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt;
diff --git a/example/rnn-time-major/readme.md b/example/rnn-time-major/readme.md
deleted file mode 100644
index b30b8410b04..00000000000
--- a/example/rnn-time-major/readme.md
+++ /dev/null
@@ -1,24 +0,0 @@
-Time major data layout for RNN
-==============================
-
-This example demonstrates an RNN implementation with Time-major layout. This implementation shows 1.5x-2x speedups compared to Batch-major RNN.
-	
-An example of a Batch-major RNN is available in the MXNet [RNN Bucketing example](https://github.com/apache/incubator-mxnet/tree/master/example/rnn/bucketing)
-	
-## Running the example
-- Prerequisite: an instance with GPU compute resources is required to run MXNet RNN
-- Make the shell script ```get_sherlockholmes_data.sh``` executable:
-    ```bash 
-    chmod +x get_sherlockholmes_data.sh
-    ```
-- Run ```get_sherlockholmes_data.sh``` to download the sherlockholmes dataset, and follow the instructions to review the license:
-    ```bash
-    ./get_sherlockholmes_data.sh
-    ```
-    The sherlockholmes data sets will be downloaded into ./data directory, and available for the example to train on.
-- Run the example:
-    ```bash
-    python rnn_cell_demo.py
-    ```
-    
-    If everything goes well, the console will display the training speed and perplexity, which you can compare to those of the batch-major RNN.
diff --git a/example/rnn-time-major/rnn_cell_demo.py b/example/rnn-time-major/rnn_cell_demo.py
deleted file mode 100644
index 80b281b3bdb..00000000000
--- a/example/rnn-time-major/rnn_cell_demo.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""A simple demo of new RNN cell with Sherlock Holmes language model."""
-
-################################################################################
-# Speed test (time major is 1.5~2 times faster than batch major).
-#
-# -- This script (time major) -----
-# 2016-10-10 18:43:21,890 Epoch[0] Batch [50]     Speed: 1717.76 samples/sec      Train-Perplexity=4311.345018
-# 2016-10-10 18:43:25,959 Epoch[0] Batch [100]    Speed: 1573.17 samples/sec      Train-Perplexity=844.092421
-# 2016-10-10 18:43:29,807 Epoch[0] Batch [150]    Speed: 1663.17 samples/sec      Train-Perplexity=498.080716
-# 2016-10-10 18:43:33,871 Epoch[0] Batch [200]    Speed: 1574.84 samples/sec      Train-Perplexity=455.051252
-# 2016-10-10 18:43:37,720 Epoch[0] Batch [250]    Speed: 1662.87 samples/sec      Train-Perplexity=410.500066
-# 2016-10-10 18:43:40,766 Epoch[0] Batch [300]    Speed: 2100.81 samples/sec      Train-Perplexity=274.317460
-# 2016-10-10 18:43:44,571 Epoch[0] Batch [350]    Speed: 1682.45 samples/sec      Train-Perplexity=350.132577
-# 2016-10-10 18:43:48,377 Epoch[0] Batch [400]    Speed: 1681.41 samples/sec      Train-Perplexity=320.674884
-# 2016-10-10 18:43:51,253 Epoch[0] Train-Perplexity=336.210212
-# 2016-10-10 18:43:51,253 Epoch[0] Time cost=33.529
-# 2016-10-10 18:43:53,373 Epoch[0] Validation-Perplexity=282.453883
-#
-# -- ../rnn/rnn_cell_demo.py (batch major) -----
-# 2016-10-10 18:44:34,133 Epoch[0] Batch [50]     Speed: 1004.50 samples/sec      Train-Perplexity=4398.428571
-# 2016-10-10 18:44:39,874 Epoch[0] Batch [100]    Speed: 1114.85 samples/sec      Train-Perplexity=771.401960
-# 2016-10-10 18:44:45,528 Epoch[0] Batch [150]    Speed: 1132.03 samples/sec      Train-Perplexity=525.207444
-# 2016-10-10 18:44:51,564 Epoch[0] Batch [200]    Speed: 1060.37 samples/sec      Train-Perplexity=453.741140
-# 2016-10-10 18:44:57,865 Epoch[0] Batch [250]    Speed: 1015.78 samples/sec      Train-Perplexity=411.914237
-# 2016-10-10 18:45:04,032 Epoch[0] Batch [300]    Speed: 1037.92 samples/sec      Train-Perplexity=381.302188
-# 2016-10-10 18:45:10,153 Epoch[0] Batch [350]    Speed: 1045.49 samples/sec      Train-Perplexity=363.326871
-# 2016-10-10 18:45:16,062 Epoch[0] Batch [400]    Speed: 1083.21 samples/sec      Train-Perplexity=377.929014
-# 2016-10-10 18:45:19,993 Epoch[0] Train-Perplexity=294.675899
-# 2016-10-10 18:45:19,993 Epoch[0] Time cost=52.604
-# 2016-10-10 18:45:21,401 Epoch[0] Validation-Perplexity=294.345659
-################################################################################
-
-import os
-import numpy as np
-import mxnet as mx
-
-from bucket_io import BucketSentenceIter, default_build_vocab
-
-data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data'))
-
-
-def Perplexity(label, pred):
-    """ Calculates prediction perplexity
-
-    Args:
-        label (mx.nd.array): labels array
-        pred (mx.nd.array): prediction array
-
-    Returns:
-        float: calculated perplexity
-
-    """
-
-    # collapse the time, batch dimension
-    label = label.reshape((-1,))
-    pred = pred.reshape((-1, pred.shape[-1]))
-
-    loss = 0.
-    for i in range(pred.shape[0]):
-        loss += -np.log(max(1e-10, pred[i][int(label[i])]))
-    return np.exp(loss / label.size)
-
-
-if __name__ == '__main__':
-    batch_size = 128
-    buckets = [10, 20, 30, 40, 50, 60]
-    num_hidden = 200
-    num_embed = 200
-    num_lstm_layer = 2
-
-    num_epoch = 2
-    learning_rate = 0.01
-    momentum = 0.0
-
-    # Update count per available GPUs
-    gpu_count = 1
-    contexts = [mx.context.gpu(i) for i in range(gpu_count)]
-
-    vocab = default_build_vocab(os.path.join(data_dir, 'sherlockholmes.train.txt'))
-
-    init_h = [mx.io.DataDesc('LSTM_state', (num_lstm_layer, batch_size, num_hidden), layout='TNC')]
-    init_c = [mx.io.DataDesc('LSTM_state_cell', (num_lstm_layer, batch_size, num_hidden), layout='TNC')]
-    init_states = init_c + init_h
-
-    data_train = BucketSentenceIter(os.path.join(data_dir, 'sherlockholmes.train.txt'),
-                                    vocab, buckets, batch_size, init_states,
-                                    time_major=True)
-    data_val = BucketSentenceIter(os.path.join(data_dir, 'sherlockholmes.valid.txt'),
-                                  vocab, buckets, batch_size, init_states,
-                                  time_major=True)
-
-    def sym_gen(seq_len):
-        """ Generates the MXNet symbol for the RNN
-
-        Args:
-            seq_len (int): input sequence length
-
-        Returns:
-            tuple: tuple containing symbol, data_names, label_names
-
-        """
-        data = mx.sym.Variable('data')
-        label = mx.sym.Variable('softmax_label')
-        embed = mx.sym.Embedding(data=data, input_dim=len(vocab),
-                                 output_dim=num_embed, name='embed')
-
-        # TODO(tofix)
-        # currently all the LSTM parameters are concatenated as
-        # a huge vector, and named '<name>_parameters'. By default
-        # mxnet initializer does not know how to initilize this
-        # guy because its name does not ends with _weight or _bias
-        # or anything familiar. Here we just use a temp workaround
-        # to create a variable and name it as LSTM_bias to get
-        # this demo running. Note by default bias is initialized
-        # as zeros, so this is not a good scheme. But calling it
-        # LSTM_weight is not good, as this is 1D vector, while
-        # the initialization scheme of a weight parameter needs
-        # at least two dimensions.
-        rnn_params = mx.sym.Variable('LSTM_bias')
-
-        # RNN cell takes input of shape (time, batch, feature)
-        rnn = mx.sym.RNN(data=embed, state_size=num_hidden,
-                         num_layers=num_lstm_layer, mode='lstm',
-                         name='LSTM',
-                         # The following params can be omitted
-                         # provided we do not need to apply the
-                         # workarounds mentioned above
-                         parameters=rnn_params)
-
-        # the RNN cell output is of shape (time, batch, dim)
-        # if we need the states and cell states in the last time
-        # step (e.g. when building encoder-decoder models), we
-        # can set state_outputs=True, and the RNN cell will have
-        # extra outputs: rnn['LSTM_output'], rnn['LSTM_state']
-        # and for LSTM, also rnn['LSTM_state_cell']
-
-        # now we collapse the time and batch dimension to do the
-        # final linear logistic regression prediction
-        hidden = mx.sym.Reshape(data=rnn, shape=(-1, num_hidden))
-
-        pred = mx.sym.FullyConnected(data=hidden, num_hidden=len(vocab),
-                                     name='pred')
-
-        # reshape to be of compatible shape as labels
-        pred_tm = mx.sym.Reshape(data=pred, shape=(seq_len, -1, len(vocab)))
-
-        sm = mx.sym.SoftmaxOutput(data=pred_tm, label=label, preserve_shape=True,
-                                  name='softmax')
-
-        data_names = ['data', 'LSTM_state', 'LSTM_state_cell']
-        label_names = ['softmax_label']
-
-        return sm, data_names, label_names
-
-    if len(buckets) == 1:
-        mod = mx.mod.Module(*sym_gen(buckets[0]), context=contexts)
-    else:
-        mod = mx.mod.BucketingModule(sym_gen,
-                                     default_bucket_key=data_train.default_bucket_key,
-                                     context=contexts)
-
-    import logging
-    head = '%(asctime)-15s %(message)s'
-    logging.basicConfig(level=logging.DEBUG, format=head)
-
-    mod.fit(data_train, eval_data=data_val, num_epoch=num_epoch,
-            eval_metric=mx.metric.np(Perplexity),
-            batch_end_callback=mx.callback.Speedometer(batch_size, 50),
-            initializer=mx.init.Xavier(factor_type="in", magnitude=2.34),
-            optimizer='sgd',
-            optimizer_params={'learning_rate': learning_rate,
-                              'momentum': momentum, 'wd': 0.00001})
diff --git a/example/rnn/README.md b/example/rnn/README.md
index a0846fa3da8..1d1df6ed768 100644
--- a/example/rnn/README.md
+++ b/example/rnn/README.md
@@ -1,6 +1,11 @@
 Recurrent Neural Network Examples
 ===========
 
+For more up-to-date implementations of NLP and RNN models with MXNet, please visit [gluon-nlp](http://gluon-nlp.mxnet.io/index.html).
+
+------
+
+
 This directory contains functions for creating recurrent neural networks
 models using high level mxnet.rnn interface.
 
diff --git a/example/rnn/bucketing/README.md b/example/rnn/bucketing/README.md
index 9bbeefd21e4..7b7883d79ad 100644
--- a/example/rnn/bucketing/README.md
+++ b/example/rnn/bucketing/README.md
@@ -2,6 +2,15 @@ RNN Example
 ===========
 This folder contains RNN examples using high level mxnet.rnn interface.
 
+--------------
+
+## Gluon Implementation
+
+You can check out this improved [Gluon implementation](http://gluon-nlp.mxnet.io/model_zoo/language_model/index.html#word-language-model) in gluon-nlp; its largest LSTM model reaches a perplexity of 65.62.
+
+--------------
+
+
 ## Data
 1) Review the license for the Sherlock Holmes dataset and ensure that you agree to it. Then uncomment the lines in the 'get_sherlockholmes_data.sh' script that download the dataset.
 
@@ -23,11 +32,11 @@ This folder contains RNN examples using high level mxnet.rnn interface.
 
   For Python2 (GPU support only): can take 50+ minutes on AWS-EC2-p2.16xlarge
 
-      $ python [cudnn_lstm_bucketing.py](cudnn_lstm_bucketing.py) --gpus 0,1,2,3
+      $ python [cudnn_rnn_bucketing.py](cudnn_rnn_bucketing.py) --gpus 0,1,2,3
 
   For Python3 (GPU support only): can take 50+ minutes on AWS-EC2-p2.16xlarge
 
-      $ python3 [cudnn_lstm_bucketing.py](cudnn_lstm_bucketing.py) --gpus 0,1,2,3
+      $ python3 [cudnn_rnn_bucketing.py](cudnn_rnn_bucketing.py) --gpus 0,1,2,3
 
 
 ### Performance Note:
diff --git a/example/rnn/large_word_lm/data.py b/example/rnn/large_word_lm/data.py
index b9cc3e8a89e..0ca500628d0 100644
--- a/example/rnn/large_word_lm/data.py
+++ b/example/rnn/large_word_lm/data.py
@@ -174,7 +174,7 @@ def __init__(self, data_file, vocab, batch_size, bptt):
         self._iter = self._dataset.iterate_once(batch_size, bptt)
 
     def iter_next(self):
-        data = self._iter.next()
+        data = next(self._iter)
         if data is None:
             return False
         self._next_data = mx.nd.array(data[0], dtype=np.int32)
diff --git a/example/rnn/large_word_lm/readme.md b/example/rnn/large_word_lm/readme.md
deleted file mode 100644
index 465aaa1c44b..00000000000
--- a/example/rnn/large_word_lm/readme.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# Large-Scale Language Model
-This example implements the baseline model in
-[Exploring the Limits of Language Modeling](https://arxiv.org/abs/1602.02410) on the
-[Google 1-Billion Word](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) (GBW) dataset.
-
-This example reaches 48.0 test perplexity after 6 training epochs on a 1-layer, 2048-unit, 512-projection LSTM Language Model.
-It reaches 44.2 test perplexity after 35 epochs of training.
-
-The main differences with the original implementation include:
-* Synchronized gradient updates instead of asynchronized updates
-
-Each epoch for training (excluding time for evaluation on test set) takes around 80 minutes on a p3.8xlarge instance, which comes with 4 Volta V100 GPUs.
-
-# Setup dataset and build sampler
-1. Download 1-Billion Word Dataset: [Link](http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz)
-2. Download pre-processed vocabulary file which maps tokens into ids.
-3. Build sampler with cython by running `make` in the current directory. If you do not have cython installed, run `pip install cython`
-
-# Run the Script
-```
-usage: train.py [-h] [--data DATA] [--test TEST] [--vocab VOCAB]
-                [--emsize EMSIZE] [--nhid NHID] [--num-proj NUM_PROJ]
-                [--nlayers NLAYERS] [--epochs EPOCHS]
-                [--batch-size BATCH_SIZE] [--dropout DROPOUT] [--eps EPS]
-                [--bptt BPTT] [--k K] [--gpus GPUS]
-                [--log-interval LOG_INTERVAL] [--seed SEED]
-                [--checkpoint-dir CHECKPOINT_DIR] [--lr LR] [--clip CLIP]
-                [--rescale-embed RESCALE_EMBED]
-
-Language Model on GBW
-
-optional arguments:
-  -h, --help            show this help message and exit
-  --data DATA           location of the training data
-  --test TEST           location of the test data
-  --vocab VOCAB         location of the corpus vocabulary file
-  --emsize EMSIZE       size of word embeddings
-  --nhid NHID           number of hidden units per layer
-  --num-proj NUM_PROJ   number of projection units per layer
-  --nlayers NLAYERS     number of LSTM layers
-  --epochs EPOCHS       number of epoch for training
-  --batch-size BATCH_SIZE
-                        batch size per gpu
-  --dropout DROPOUT     dropout applied to layers (0 = no dropout)
-  --eps EPS             epsilon for adagrad
-  --bptt BPTT           sequence length
-  --k K                 number of noise samples for estimation
-  --gpus GPUS           list of gpus to run, e.g. 0 or 0,2,5. empty means
-                        using gpu(0).
-  --log-interval LOG_INTERVAL
-                        report interval
-  --seed SEED           random seed
-  --checkpoint-dir CHECKPOINT_DIR
-                        dir for checkpoint
-  --lr LR               initial learning rate
-  --clip CLIP           gradient clipping by global norm.
-  --rescale-embed RESCALE_EMBED
-                        scale factor for the gradients of the embedding layer
-```
-
-To reproduce the result, run
-```
-train.py --gpus=0,1,2,3 --clip=10 --lr=0.2 --dropout=0.1 --eps=1 --rescale-embed=256
---test=/path/to/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050
---data=/path/to/training-monolingual.tokenized.shuffled/*
-```
diff --git a/example/rnn/word_lm/README.md b/example/rnn/word_lm/README.md
index beed6fc8d89..ab0a8d704b9 100644
--- a/example/rnn/word_lm/README.md
+++ b/example/rnn/word_lm/README.md
@@ -16,7 +16,7 @@ The Sherlock Holmes data is a copyright free copy of Sherlock Holmes from[(Proje
 Example runs and the results:
 
 ```
-python train.py --tied --nhid 650 --emsize 650 --dropout 0.5        # Test ppl of 75.4
+python train.py --tied --nhid 650 --emsize 650 --dropout 0.5        # Test ppl of 44.26
 ```
 
 ```
diff --git a/example/sparse/linear_classification/data.py b/example/sparse/linear_classification/data.py
index 02984734fb9..bc5619a4bfb 100644
--- a/example/sparse/linear_classification/data.py
+++ b/example/sparse/linear_classification/data.py
@@ -24,10 +24,9 @@ def get_avazu_data(data_dir, data_name, url):
         os.mkdir(data_dir)
     os.chdir(data_dir)
     if (not os.path.exists(data_name)):
-        print("Dataset " + data_name + " not present. Downloading now ...")
-        import urllib
+        print("Dataset " + data_name + " not present. Downloading now ...") 
         zippath = os.path.join(data_dir, data_name + ".bz2")
-        urllib.urlretrieve(url + data_name + ".bz2", zippath)
+        mx.test_utils.download(url + data_name + ".bz2", zippath)
         os.system("bzip2 -d %r" % data_name + ".bz2")
         print("Dataset " + data_name + " is now present.")
     os.chdir("..")
diff --git a/example/speech_recognition/README.md b/example/speech_recognition/README.md
index 74bad43a4fb..6f01911e130 100644
--- a/example/speech_recognition/README.md
+++ b/example/speech_recognition/README.md
@@ -28,7 +28,9 @@ With rich functionalities and convenience explained above, you can build your ow
 <pre>
 <code>pip install soundfile</code>
 </pre>
-- Warp CTC: Follow [this instruction](https://github.com/baidu-research/warp-ctc) to compile Baidu's Warp CTC.
+- Warp CTC: Follow [this instruction](https://github.com/baidu-research/warp-ctc) to compile Baidu's Warp CTC. (Note: If you are using V100, make sure to use this [fix](https://github.com/baidu-research/warp-ctc/pull/118))
+- You need to compile MXNet with WarpCTC; follow the instructions [here](https://github.com/apache/incubator-mxnet/tree/master/example/ctc)
+- You might need to set `LD_LIBRARY_PATH` to the right path if MXNet fails to find your `libwarpctc.so`
 - **We strongly recommend that you first test a model of small networks.**
 
 
diff --git a/example/speech_recognition/label_util.py b/example/speech_recognition/label_util.py
index dab1d1ef1b4..8563736052b 100644
--- a/example/speech_recognition/label_util.py
+++ b/example/speech_recognition/label_util.py
@@ -29,7 +29,7 @@ class LabelUtil:
 
     # dataPath
     def __init__(self):
-        self._log = LogUtil().getlogger()
+        self._log = LogUtil.getInstance().getlogger()
         self._log.debug("LabelUtil init")
 
     def load_unicode_set(self, unicodeFilePath):
diff --git a/example/speech_recognition/log_util.py b/example/speech_recognition/log_util.py
index e61407f5f4d..65c465811fd 100644
--- a/example/speech_recognition/log_util.py
+++ b/example/speech_recognition/log_util.py
@@ -17,48 +17,44 @@
 
 import logging
 import logging.handlers
+from singleton import Singleton
 
+@Singleton
+class LogUtil:
 
-class SingletonType(type):
-    def __call__(cls, *args, **kwargs):
-        try:
-            return cls.__instance
-        except AttributeError:
-            cls.__instance = super(SingletonType, cls).__call__(*args, **kwargs)
-            return cls.__instance
-
-
-class LogUtil(object):
-    __metaclass__ = SingletonType
     _logger = None
     _filename = None
 
-    def __init__(self, filename=None):
-        self._filename = filename
-
-        # logger
-        self._logger = logging.getLogger('logger')
-        # remove default handler
-        self._logger.propagate = False
-
-        stream_handler = logging.StreamHandler()
-        stream_formatter = logging.Formatter('[%(levelname)8s][%(asctime)s.%(msecs)03d] %(message)s',
-                                             datefmt='%Y/%m/%d %H:%M:%S')
-        stream_handler.setFormatter(stream_formatter)
-
-        if self._filename is not None:
-            file_max_bytes = 10 * 1024 * 1024
-
-            file_handler = logging.handlers.RotatingFileHandler(filename='./log/' + self._filename,
-                                                               maxBytes=file_max_bytes,
-                                                               backupCount=10)
-            file_formatter = logging.Formatter('[%(levelname)8s][%(asctime)s.%(msecs)03d] %(message)s',
-                                               datefmt='%Y/%m/%d %H:%M:%S')
-            file_handler.setFormatter(file_formatter)
-            self._logger.addHandler(file_handler)
-
-        self._logger.addHandler(stream_handler)
-        self._logger.setLevel(logging.DEBUG)
-
-    def getlogger(self):
+    def getlogger(self, filename=None):
+        if self._logger is not None and filename is not None:
+            self._logger.warning('Filename %s ignored, logger is already instanciated with %s' % (filename, self._filename))
+        if self._logger is None:
+            self._filename = filename
+
+            # logger
+            self._logger = logging.getLogger('logger')
+            # remove default handler
+            self._logger.propagate = False
+
+            stream_handler = logging.StreamHandler()
+            stream_formatter = logging.Formatter('[%(levelname)8s][%(asctime)s.%(msecs)03d] %(message)s',
+                                                 datefmt='%Y/%m/%d %H:%M:%S')
+            stream_handler.setFormatter(stream_formatter)
+
+            if self._filename is not None:
+                file_max_bytes = 10 * 1024 * 1024
+
+                file_handler = logging.handlers.RotatingFileHandler(filename='./log/' + self._filename,
+                                                                   maxBytes=file_max_bytes,
+                                                                   backupCount=10)
+                file_formatter = logging.Formatter('[%(levelname)8s][%(asctime)s.%(msecs)03d] %(message)s',
+                                                   datefmt='%Y/%m/%d %H:%M:%S')
+                file_handler.setFormatter(file_formatter)
+                self._logger.addHandler(file_handler)
+
+            self._logger.addHandler(stream_handler)
+            self._logger.setLevel(logging.DEBUG)
+            
+        
         return self._logger
+
diff --git a/example/speech_recognition/main.py b/example/speech_recognition/main.py
index e45026343de..b2ea42eca0b 100644
--- a/example/speech_recognition/main.py
+++ b/example/speech_recognition/main.py
@@ -38,6 +38,8 @@
 os.environ['MXNET_ENGINE_TYPE'] = "ThreadedEnginePerDevice"
 os.environ['MXNET_ENABLE_GPU_P2P'] = "0"
 
+logUtil = LogUtil.getInstance()
+
 class WHCS:
     width = 0
     height = 0
@@ -91,7 +93,7 @@ def load_data(args):
     max_duration = args.config.getfloat('data', 'max_duration')
     language = args.config.get('data', 'language')
 
-    log = LogUtil().getlogger()
+    log = logUtil.getlogger()
     labelUtil = LabelUtil.getInstance()
     if mode == "train" or mode == "load":
         data_json = args.config.get('data', 'train_json')
@@ -276,7 +278,7 @@ def load_model(args, contexts, data_train):
         mx.random.seed(hash(datetime.now()))
     # set log file name
     log_filename = args.config.get('common', 'log_filename')
-    log = LogUtil(filename=log_filename).getlogger()
+    log = logUtil.getlogger(filename=log_filename)
 
     # set parameters from data section(common)
     mode = args.config.get('common', 'mode')
diff --git a/example/speech_recognition/singleton.py b/example/speech_recognition/singleton.py
index 01717e4df06..fdb20c06b14 100644
--- a/example/speech_recognition/singleton.py
+++ b/example/speech_recognition/singleton.py
@@ -18,23 +18,41 @@
 
 import logging as log
 
+
 class Singleton:
+    """
+    A non-thread-safe helper class to ease implementing singletons.
+    This should be used as a decorator -- not a metaclass -- to the
+    class that should be a singleton.
+
+    The decorated class can define one `__init__` function that
+    takes only the `self` argument. Also, the decorated class cannot be
+    inherited from. Other than that, there are no restrictions that apply
+    to the decorated class.
+
+    To get the singleton instance, use the `getInstance` method. Trying
+    to use `__call__` will result in a `TypeError` being raised.
+
+    """
+
     def __init__(self, decorated):
-        log.debug("Singleton Init %s" % decorated)
         self._decorated = decorated
 
     def getInstance(self):
+        """
+        Returns the singleton instance. Upon its first call, it creates a
+        new instance of the decorated class and calls its `__init__` method.
+        On all subsequent calls, the already created instance is returned.
+
+        """
         try:
             return self._instance
         except AttributeError:
             self._instance = self._decorated()
             return self._instance
 
-    def __new__(cls, *args, **kwargs):
-        print("__new__")
-        cls._instance = super(Singleton, cls).__new__(cls, *args, **kwargs)
-        return cls._instance
-
     def __call__(self):
-        raise TypeError("Singletons must be accessed through 'getInstance()'")
+        raise TypeError('Singletons must be accessed through `getInstance()`.')
 
+    def __instancecheck__(self, inst):
+        return isinstance(inst, self._decorated)
\ No newline at end of file
diff --git a/example/speech_recognition/stt_datagenerator.py b/example/speech_recognition/stt_datagenerator.py
index 8fafa790937..e1f8f13b7ba 100644
--- a/example/speech_recognition/stt_datagenerator.py
+++ b/example/speech_recognition/stt_datagenerator.py
@@ -28,6 +28,8 @@
 from stt_bi_graphemes_util import generate_bi_graphemes_label
 from multiprocessing import cpu_count, Process, Manager
 
+logUtil = LogUtil.getInstance()
+
 class DataGenerator(object):
     def __init__(self, save_dir, model_name, step=10, window=20, max_freq=8000, desc_file=None):
         """
@@ -86,7 +88,7 @@ def load_metadata_from_desc_file(self, desc_file, partition='train',
             max_duration (float): In seconds, the maximum duration of
                 utterances to train or test on
         """
-        logger = LogUtil().getlogger()
+        logger = logUtil.getlogger()
         logger.info('Reading description file: {} for partition: {}'
                     .format(desc_file, partition))
         audio_paths, durations, texts = [], [], []
@@ -245,7 +247,7 @@ def sample_normalize(self, k_samples=1000, overwrite=False):
         Params:
             k_samples (int): Use this number of samples for estimation
         """
-        log = LogUtil().getlogger()
+        log = logUtil.getlogger()
         log.info("Calculating mean and std from samples")
         # if k_samples is negative then it goes through total dataset
         if k_samples < 0:
diff --git a/example/speech_recognition/stt_io_iter.py b/example/speech_recognition/stt_io_iter.py
index 6c9bacd1a52..216dae8ac68 100644
--- a/example/speech_recognition/stt_io_iter.py
+++ b/example/speech_recognition/stt_io_iter.py
@@ -86,7 +86,7 @@ def __init__(self, count, datagen, batch_size, num_label, init_states, seq_lengt
             audio_paths = audio_paths
             texts = texts
 
-        self.trainDataList = zip(durations, audio_paths, texts)
+        self.trainDataList = list(zip(durations, audio_paths, texts))
         # to shuffle data
         if not sort_by_duration:
             random.shuffle(self.trainDataList)
@@ -103,11 +103,11 @@ def __iter__(self):
             texts = []
             for i in range(self.batch_size):
                 try:
-                    duration, audio_path, text = self.trainDataIter.next()
+                    duration, audio_path, text = next(self.trainDataIter)
                 except:
                     random.shuffle(self.trainDataList)
                     self.trainDataIter = iter(self.trainDataList)
-                    duration, audio_path, text = self.trainDataIter.next()
+                    duration, audio_path, text = next(self.trainDataIter)
                 audio_paths.append(audio_path)
                 texts.append(text)
             if self.is_first_epoch:
diff --git a/example/speech_recognition/stt_metric.py b/example/speech_recognition/stt_metric.py
index ec74fc063dc..26609627ea5 100644
--- a/example/speech_recognition/stt_metric.py
+++ b/example/speech_recognition/stt_metric.py
@@ -51,7 +51,7 @@ def __init__(self, batch_size, num_gpu, is_epoch_end=False, is_logging=True):
     def update(self, labels, preds):
         check_label_shapes(labels, preds)
         if self.is_logging:
-            log = LogUtil().getlogger()
+            log = LogUtil.getInstance().getlogger()
             labelUtil = LabelUtil.getInstance()
         self.batch_loss = 0.
 
diff --git a/example/speech_recognition/stt_utils.py b/example/speech_recognition/stt_utils.py
index 0539d59f37a..cc024722331 100644
--- a/example/speech_recognition/stt_utils.py
+++ b/example/speech_recognition/stt_utils.py
@@ -15,16 +15,12 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import logging
 import os
 import os.path
 
 import numpy as np
-import soundfile
 from numpy.lib.stride_tricks import as_strided
-
-
-logger = logging.getLogger(__name__)
+import soundfile
 
 
 def calc_feat_dim(window, max_freq):
diff --git a/example/speech_recognition/train.py b/example/speech_recognition/train.py
index b1ae50b0755..e585bfd05e6 100644
--- a/example/speech_recognition/train.py
+++ b/example/speech_recognition/train.py
@@ -51,7 +51,7 @@ def do_training(args, module, data_train, data_val, begin_epoch=0):
     from distutils.dir_util import mkpath
     from log_util import LogUtil
 
-    log = LogUtil().getlogger()
+    log = LogUtil.getInstance().getlogger()
     mkpath(os.path.dirname(get_checkpoint_path(args)))
 
     #seq_len = args.config.get('arch', 'max_t_count')
diff --git a/example/ssd/README.md b/example/ssd/README.md
index cc034689c7b..ec6b4b5c31a 100644
--- a/example/ssd/README.md
+++ b/example/ssd/README.md
@@ -4,6 +4,14 @@ SSD is an unified framework for object detection with a single network.
 
 You can use the code to train/evaluate/test for object detection task.
 
+-------------------
+
+## Gluon Implementation
+
+You can find a Gluon implementation on [gluon-cv](https://gluon-cv.mxnet.io/build/examples_detection/train_ssd_voc.html).
+
+-------------------
+
 ### Disclaimer
 This is a re-implementation of original SSD which is based on caffe. The official
 repository is available [here](https://github.com/weiliu89/caffe/tree/ssd).
diff --git a/example/stochastic-depth/sd_cifar10.py b/example/stochastic-depth/sd_cifar10.py
index 7eb32028701..e995ea44f76 100644
--- a/example/stochastic-depth/sd_cifar10.py
+++ b/example/stochastic-depth/sd_cifar10.py
@@ -78,9 +78,6 @@
 import mxnet as mx
 import logging
 
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
-from utils import get_data
-
 import sd_module
 
 def residual_module(death_rate, n_channel, name_scope, context, stride=1, bn_momentum=0.9):
@@ -199,13 +196,31 @@ def get_death_rate(i_res_block):
 batch_end_callbacks = [mx.callback.Speedometer(batch_size, 50)]
 epoch_end_callbacks = [mx.callback.do_checkpoint('sd-%d' % (n_residual_blocks * 6 + 2))]
 
-
-args = type('', (), {})()
-args.batch_size = batch_size
-args.data_dir = os.path.join(os.path.dirname(__file__), "data")
+data_dir = os.path.join(os.path.dirname(__file__), "data", "cifar")
 kv = mx.kvstore.create(kv_store)
 
-train, val = get_data.get_cifar10_iterator(args, kv)
+mx.test_utils.get_cifar10()
+
+data_shape = (3, 28, 28)
+train = mx.io.ImageRecordIter(
+    path_imgrec = os.path.join(data_dir, "train.rec"),
+    mean_img    = os.path.join(data_dir, "mean.bin"),
+    data_shape  = data_shape,
+    batch_size  = batch_size,
+    rand_crop   = True,
+    rand_mirror = True,
+    num_parts   = kv.num_workers,
+    part_index  = kv.rank)
+
+val = mx.io.ImageRecordIter(
+    path_imgrec = os.path.join(data_dir, "test.rec"),
+    mean_img    = os.path.join(data_dir, "mean.bin"),
+    rand_crop   = False,
+    rand_mirror = False,
+    data_shape  = data_shape,
+    batch_size  = batch_size,
+    num_parts   = kv.num_workers,
+    part_index  = kv.rank)
 
 logging.basicConfig(level=logging.DEBUG)
 mod_seq.fit(train, val,
diff --git a/example/stochastic-depth/sd_mnist.py b/example/stochastic-depth/sd_mnist.py
index 7eb93741ff5..6c95a23bf23 100644
--- a/example/stochastic-depth/sd_mnist.py
+++ b/example/stochastic-depth/sd_mnist.py
@@ -25,9 +25,6 @@
 import mxnet as mx
 import logging
 
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
-from utils import get_data
-
 import sd_module
 
 def get_conv(
@@ -121,8 +118,7 @@ def get_conv(
 batch_size = 100
 
 basedir = os.path.dirname(__file__)
-get_data.get_mnist(os.path.join(basedir, "data"))
-
+mx.test_utils.get_mnist_ubyte()
 train = mx.io.MNISTIter(
         image=os.path.join(basedir, "data", "train-images-idx3-ubyte"),
         label=os.path.join(basedir, "data", "train-labels-idx1-ubyte"),
diff --git a/example/svm_mnist/svm_mnist.py b/example/svm_mnist/svm_mnist.py
index 679540198d2..3fc0362f6b0 100644
--- a/example/svm_mnist/svm_mnist.py
+++ b/example/svm_mnist/svm_mnist.py
@@ -20,16 +20,23 @@
 ## Please read the README.md document for better reference ##
 #############################################################
 from __future__ import print_function
+
+import logging
+import random
+
 import mxnet as mx
 import numpy as np
 from sklearn.datasets import fetch_mldata
 from sklearn.decomposition import PCA
-# import matplotlib.pyplot as plt
-import logging
+
 
 logger = logging.getLogger()
 logger.setLevel(logging.DEBUG)
 
+np.random.seed(1234) # set seed for deterministic ordering
+mx.random.seed(1234)
+random.seed(1234)
+
 # Network declaration as symbols. The following pattern was based
 # on the article, but feel free to play with the number of nodes
 # and with the activation function
@@ -41,60 +48,77 @@
 fc3  = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=10)
 
 # Here we add the ultimate layer based on L2-SVM objective
-mlp = mx.symbol.SVMOutput(data=fc3, name='svm')
+mlp_svm_l2 = mx.symbol.SVMOutput(data=fc3, name='svm_l2')
+
+# With L1-SVM objective
+mlp_svm_l1 = mx.symbol.SVMOutput(data=fc3, name='svm_l1', use_linear=True)
 
-# To use L1-SVM objective, comment the line above and uncomment the line below
-# mlp = mx.symbol.SVMOutput(data=fc3, name='svm', use_linear=True)
+# Compare with softmax cross entropy loss
+mlp_softmax = mx.symbol.SoftmaxOutput(data=fc3, name='softmax')
+
+print("Preparing data...")
+mnist_data = mx.test_utils.get_mnist()
+X = np.concatenate([mnist_data['train_data'], mnist_data['test_data']])
+Y = np.concatenate([mnist_data['train_label'], mnist_data['test_label']])
+X = X.reshape((X.shape[0], -1)).astype(np.float32) * 255
 
 # Now we fetch MNIST dataset, add some noise, as the article suggests,
 # permutate and assign the examples to be used on our network
-mnist = fetch_mldata('MNIST original')
-mnist_pca = PCA(n_components=70).fit_transform(mnist.data)
+mnist_pca = PCA(n_components=70).fit_transform(X)
 noise = np.random.normal(size=mnist_pca.shape)
 mnist_pca += noise
-np.random.seed(1234) # set seed for deterministic ordering
 p = np.random.permutation(mnist_pca.shape[0])
-X = mnist_pca[p]
-Y = mnist.target[p]
-X_show = mnist.data[p]
+X = mnist_pca[p] / 255.
+Y = Y[p]
+X_show = X[p]
 
 # This is just to normalize the input and separate train set and test set
-X = X.astype(np.float32)/255
 X_train = X[:60000]
 X_test = X[60000:]
 X_show = X_show[60000:]
 Y_train = Y[:60000]
 Y_test = Y[60000:]
-
+print("Data prepared.")
 # Article's suggestion on batch size
 batch_size = 200
-train_iter = mx.io.NDArrayIter(X_train, Y_train, batch_size=batch_size, label_name='svm_label')
-test_iter = mx.io.NDArrayIter(X_test, Y_test, batch_size=batch_size, label_name='svm_label')
-
-# Here we instatiate and fit the model for our data
-# The article actually suggests using 400 epochs,
-# But I reduced to 10, for convinience
-mod = mx.mod.Module(
-    context = mx.cpu(0),  # Run on CPU 0
-    symbol = mlp,         # Use the network we just defined
-    label_names = ['svm_label'],
-)
-mod.fit(
-    train_data=train_iter,
-    eval_data=test_iter,  # Testing data set. MXNet computes scores on test set every epoch
-    batch_end_callback = mx.callback.Speedometer(batch_size, 200),  # Logging module to print out progress
-    num_epoch = 10,       # Train for 10 epochs
-    optimizer_params = {
-        'learning_rate': 0.1,  # Learning rate
-        'momentum': 0.9,       # Momentum for SGD with momentum
-        'wd': 0.00001,         # Weight decay for regularization
-    },
-)
-
-# Uncomment to view an example
-# plt.imshow((X_show[0].reshape((28,28))*255).astype(np.uint8), cmap='Greys_r')
-# plt.show()
-# print 'Result:', model.predict(X_test[0:1])[0].argmax()
-
-# Now it prints how good did the network did for this configuration
-print('Accuracy:', mod.score(test_iter, mx.metric.Accuracy())[0][1]*100, '%')
+
+ctx = mx.gpu() if len(mx.test_utils.list_gpus()) > 0 else mx.cpu()
+
+results = {}
+for output in [mlp_svm_l2, mlp_svm_l1, mlp_softmax]:
+    
+    print("\nTesting with %s \n" % output.name)
+    
+    label = output.name + "_label"
+    
+    train_iter = mx.io.NDArrayIter(X_train, Y_train, batch_size=batch_size, label_name=label)
+    test_iter = mx.io.NDArrayIter(X_test, Y_test, batch_size=batch_size, label_name=label)
+
+    # Here we instantiate and fit the model for our data
+    # The article actually suggests using 400 epochs,
+    # But I reduced to 10, for convenience
+
+    mod = mx.mod.Module(
+        context = ctx, 
+        symbol = output,         # Use the network we just defined
+        label_names = [label],
+    )
+    mod.fit(
+        train_data=train_iter,
+        eval_data=test_iter,  # Testing data set. MXNet computes scores on test set every epoch
+        batch_end_callback = mx.callback.Speedometer(batch_size, 200),  # Logging module to print out progress
+        num_epoch = 10,       # Train for 10 epochs
+        optimizer_params = {
+            'learning_rate': 0.1,  # Learning rate
+            'momentum': 0.9,       # Momentum for SGD with momentum
+            'wd': 0.00001,         # Weight decay for regularization
+        })
+    results[output.name] = mod.score(test_iter, mx.metric.Accuracy())[0][1]*100
+    print('Accuracy for %s:'%output.name, mod.score(test_iter, mx.metric.Accuracy())[0][1]*100, '%\n')
+    
+for key, value in results.items():
+    print(key, value, "%s")
+
+#svm_l2 97.85 %s
+#svm_l1 98.15 %s
+#softmax 97.69 %s
\ No newline at end of file
diff --git a/example/svrg_module/README.md b/example/svrg_module/README.md
index 63e7ba2f2bf..250995a5715 100644
--- a/example/svrg_module/README.md
+++ b/example/svrg_module/README.md
@@ -1,7 +1,9 @@
 ## SVRGModule Example
+
 SVRGModule is an extension to the Module API that implements SVRG optimization, which stands for Stochastic
 Variance Reduced Gradient. SVRG is an optimization technique that complements SGD and has several key
 properties: 
+
 * Employs explicit variance reduction by using a different update rule compared to SGD.
 * Ability to use relatively large learning rate, which leads to faster convergence compared to SGD.
 * Guarantees for fast convergence for smooth and strongly convex functions.
@@ -18,7 +20,9 @@ training script.
 
 ##### Dataset
 YearPredictionMSD: contains predictions of the release year of a song from audio features. It has over 
-400,000 samples with 90 features. Please uncomment data downloading script from data_reader.py to download the data. 
+400,000 samples with 90 features. It will be automatically downloaded on first execution and cached.
+
+YearPredictionMSD dataset: https://archive.ics.uci.edu/ml/datasets/yearpredictionmsd
 
 #### Benchmarks:
 An initial set of benchmarks has been performed on YearPredictionDatasetMSD with linear regression model.  A jupyter 
diff --git a/example/svrg_module/benchmarks/svrg_benchmark.ipynb b/example/svrg_module/benchmarks/svrg_benchmark.ipynb
index db02938af46..54ae81281db 100644
--- a/example/svrg_module/benchmarks/svrg_benchmark.ipynb
+++ b/example/svrg_module/benchmarks/svrg_benchmark.ipynb
@@ -16,17 +16,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
-    "import mxnet as mx\n",
-    "from sklearn.datasets import load_svmlight_file\n",
-    "import numpy as np\n",
+    "import os\n",
     "import json\n",
+    "import sys\n",
     "import tempfile\n",
-    "import os\n",
-    "from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule\n"
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import matplotlib.patches as mpatches\n",
+    "import mxnet as mx\n",
+    "from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "from sklearn.datasets import load_svmlight_file\n",
+    "\n",
+    "sys.path.insert(0, \"../linear_regression\")\n",
+    "from data_reader import get_year_prediction_data\n",
+    "\n",
+    "%matplotlib inline"
    ]
   },
   {
@@ -39,47 +50,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Download data file\n",
-    "# from subprocess import call\n",
-    "# YearPredictionMSD dataset: https://archive.ics.uci.edu/ml/datasets/yearpredictionmsd\n",
-    "# call(['wget', 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/YearPredictionMSD.bz2'])\n",
-    "# call(['bzip2', '-d', 'YearPredictionMSD.bz2'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Extracting data...\n",
       "Reading data from disk...\n"
      ]
     }
    ],
    "source": [
-    "feature_dim = 90\n",
-    "print(\"Reading data from disk...\")\n",
-    "train_features, train_labels = load_svmlight_file('YearPredictionMSD', n_features=feature_dim, dtype=np.float32)\n",
-    "train_features = train_features.todense()\n",
-    "\n",
-    "# normalize the data: subtract means and divide by standard deviations\n",
-    "label_mean = train_labels.mean()\n",
-    "label_std = np.sqrt(np.square(train_labels - label_mean).mean())\n",
-    "feature_means = train_features.mean(axis=0)\n",
-    "feature_stds = np.sqrt(np.square(train_features - feature_means).mean(axis=0))\n",
-    "\n",
-    "train_features = (train_features - feature_means) / feature_stds\n",
-    "train_labels = (train_labels - label_mean) / label_std\n",
-    "\n",
+    "feature_dim, train_features, train_labels = get_year_prediction_data()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "train_features = train_features[-5000:]\n",
-    "train_labels = train_labels[-5000:]"
+    "train_labels   = train_labels[-5000:]"
    ]
   },
   {
@@ -91,7 +85,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -119,7 +113,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -162,7 +156,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -194,19 +188,6 @@
     "  "
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import json\n",
-    "import seaborn as sns\n",
-    "import matplotlib.pyplot as plt\n",
-    "import matplotlib.patches as mpatches\n",
-    "import pandas as pd"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -217,7 +198,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -227,7 +208,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -236,13 +217,13 @@
        "Text(0.5,0,'Epochs')"
       ]
      },
-     "execution_count": 32,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     },
     {
      "data": {
-      "image/png": "\n",
+      "image/png": "\n",
       "text/plain": [
        "<Figure size 1440x864 with 1 Axes>"
       ]
@@ -286,7 +267,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -298,7 +279,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -307,13 +288,13 @@
        "Text(0.5,0,'Epochs')"
       ]
      },
-     "execution_count": 34,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     },
     {
      "data": {
-      "image/png": "\n",
+      "image/png": "\n",
       "text/plain": [
        "<Figure size 1440x864 with 1 Axes>"
       ]
@@ -345,7 +326,7 @@
     "sns.pointplot(x3, dataplot3['sgd_mse_lr_0.001'], color=color[8])\n",
     "sns.pointplot(x3, dataplot3['sgd_mse_lr_0.0025'], color=color[3])\n",
     "sns.pointplot(x3, dataplot3['sgd_mse_lr_0.005'], color=color[7])\n",
-    "color_patch1 = mpatches.Patch(color=color[9], label=\"svrg_mse_0.025\")\n",
+    "color_patch1 = mpatches.Patch(color=color[9], label=\"svrg_mse_lr_0.025\")\n",
     "color_patch2 = mpatches.Patch(color=color[8], label=\"sgd_mse_lr_0.001\")\n",
     "color_patch3 = mpatches.Patch(color=color[3], label=\"sgd_mse_lr_0.0025\")\n",
     "color_patch4 = mpatches.Patch(color=color[7], label=\"sgd_mse_lr_0.005\")\n",
@@ -357,21 +338,21 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 2",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "python2"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 2
+    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.15"
+   "pygments_lexer": "ipython3",
+   "version": "3.6.4"
   }
  },
  "nbformat": 4,
diff --git a/example/svrg_module/linear_regression/data_reader.py b/example/svrg_module/linear_regression/data_reader.py
index d56ae03a5f4..23847d53194 100644
--- a/example/svrg_module/linear_regression/data_reader.py
+++ b/example/svrg_module/linear_regression/data_reader.py
@@ -15,21 +15,35 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import bz2
+import os
+import shutil
 
+import mxnet as mx
 import numpy as np
 from sklearn.datasets import load_svmlight_file
 
+
 # Download data file
-# from subprocess import call
 # YearPredictionMSD dataset: https://archive.ics.uci.edu/ml/datasets/yearpredictionmsd
-# call(['wget', 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/YearPredictionMSD.bz2'])
-# call(['bzip2', '-d', 'YearPredictionMSD.bz2'])
 
 
-def read_year_prediction_data(fileName):
+def get_year_prediction_data(dirname=None):
     feature_dim = 90
+    if dirname is None:
+        dirname = os.path.join(os.path.dirname(__file__), 'data')
+    filename = 'YearPredictionMSD'
+    download_filename = os.path.join(dirname, "%s.bz2" % filename)
+    extracted_filename = os.path.join(dirname, filename)
+    if not os.path.isfile(download_filename):
+        print("Downloading data...")
+        mx.test_utils.download('https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/%s.bz2' % filename, dirname=dirname)
+    if not os.path.isfile(extracted_filename):
+        print("Extracting data...")
+        with bz2.BZ2File(download_filename) as fr, open(extracted_filename,"wb") as fw:
+            shutil.copyfileobj(fr,fw)
     print("Reading data from disk...")
-    train_features, train_labels = load_svmlight_file(fileName, n_features=feature_dim, dtype=np.float32)
+    train_features, train_labels = load_svmlight_file(extracted_filename, n_features=feature_dim, dtype=np.float32)
     train_features = train_features.todense()
 
     # normalize the data: subtract means and divide by standard deviations
diff --git a/example/svrg_module/linear_regression/train.py b/example/svrg_module/linear_regression/train.py
index b3d942973f1..6b6574c9618 100644
--- a/example/svrg_module/linear_regression/train.py
+++ b/example/svrg_module/linear_regression/train.py
@@ -19,7 +19,7 @@
 import argparse
 import mxnet as mx
 from common import create_lin_reg_network, create_logger
-from data_reader import read_year_prediction_data
+from data_reader import get_year_prediction_data
 
 parser = argparse.ArgumentParser()
 parser.add_argument('-e', dest='epochs', help='number of epochs for training phase', type=int, default=100)
@@ -37,7 +37,7 @@
 logger = create_logger()
 kv = mx.kvstore.create(args.kv_store)
 
-feature_dim, train_features, train_labels = read_year_prediction_data('YearPredictionMSD')
+feature_dim, train_features, train_labels = get_year_prediction_data()
 train_iter, mod = create_lin_reg_network(train_features, train_labels, feature_dim, args.batch_size, args.updateFreq,
                                          ctx, logger)
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services