Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2019/01/08 16:51:53 UTC

[GitHub] ArmageddonKnight closed pull request #11364: [MXNET-490] Added OpenLSTMRNN together with benchmarks and Tensorboard callback routines.

URL: https://github.com/apache/incubator-mxnet/pull/11364

This pull request was made from a forked repository. As GitHub hides the original diff on merge, it is reproduced below for the sake of provenance:

diff --git a/example/rnn-backends/.gitignore b/example/rnn-backends/.gitignore
new file mode 100644
index 00000000000..1bf2c7a12ca
--- /dev/null
+++ b/example/rnn-backends/.gitignore
@@ -0,0 +1,6 @@
+# Datasets
+*.txt
+*.tokens
+
+# Logging Directories
+log
diff --git a/example/rnn-backends/README.md b/example/rnn-backends/README.md
new file mode 100644
index 00000000000..91c3464a49d
--- /dev/null
+++ b/example/rnn-backends/README.md
@@ -0,0 +1,8 @@
+# rnn-backends
+
+This directory provides a comparison between different LSTM RNN backend implementations.
+To run the examples, please download the dataset using the provided script in the `dataset` directory.
+Please also make sure you have 
+[`mxboard`](https://github.com/awslabs/mxboard) and 
+[`tensorboard`](https://github.com/tensorflow/tensorboard) installed 
+(`sudo -H pip install mxboard tensorboard`), as all benchmarks use `tensorboard` to keep track of throughput.
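
For reference, the throughput that these benchmarks track ends up in TensorBoard through mxboard's `SummaryWriter`. Below is a minimal sketch of that logging path, assuming `mxboard` is installed and `./log` is writable; the throughput numbers are made up for illustration.

    from mxboard import SummaryWriter

    # Open an event-file writer pointing at the same log directory the examples use.
    writer = SummaryWriter(logdir='./log')

    # Each benchmark periodically records its measured throughput (samples/sec)
    # under a scalar tag; TensorBoard then plots it against the global step.
    for step, samples_per_sec in enumerate([1500.0, 1525.0, 1490.0]):  # hypothetical values
        writer.add_scalar(tag='Speed', value=samples_per_sec, global_step=step)

    writer.close()

Running `tensorboard --logdir=./log` should then show the curve under the chosen tag.
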
diff --git a/example/rnn-backends/bucketing/dataset/download_sherlockholmes.sh b/example/rnn-backends/bucketing/dataset/download_sherlockholmes.sh
new file mode 100755
index 00000000000..01e8a3a434e
--- /dev/null
+++ b/example/rnn-backends/bucketing/dataset/download_sherlockholmes.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file downloads the Sherlock Holmes dataset.
+
+mkdir -p $(cd $(dirname $0) && pwd)/sherlockholmes
+
+cd $(cd $(dirname $0) && pwd)/sherlockholmes && \
+	wget https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.train.txt && \
+	wget https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.valid.txt && \
+	wget https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.test.txt
+
diff --git a/example/rnn-backends/bucketing/lstm_bucketing.py b/example/rnn-backends/bucketing/lstm_bucketing.py
new file mode 100644
index 00000000000..a4a4fdb1600
--- /dev/null
+++ b/example/rnn-backends/bucketing/lstm_bucketing.py
@@ -0,0 +1,191 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Module: lstm_bucketing
+
+Description:
+This file compares different RNN implementations on the Sherlock Holmes dataset
+using the LSTM bucketing module.
+Note that this file is not exactly the same as the original source file.
+Specifically, the following major changes have been made:
+    (1) ported FusedRNNCell (from cudnn_lstm_bucketing.py) and OpenLSTMRNNCell
+    (2) replaced MXNet Speedometer with TensorboardSpeedometer
+    (3) ported TensorboardLogValidationMetricsCallback
+"""
+
+import os
+import argparse
+import mxnet as mx
+
+parser = argparse.ArgumentParser(description="Train RNN on PennTreeBank",
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--num-layers', type=int, default=2,
+                    help='number of stacked RNN layers')
+parser.add_argument('--num-hidden', type=int, default=200,
+                    help='hidden layer size')
+parser.add_argument('--num-embed', type=int, default=200,
+                    help='embedding layer size')
+parser.add_argument('--kv-store', type=str, default='device',
+                    help='key-value store type')
+parser.add_argument('--num-epochs', type=int, default=25,
+                    help='max num of epochs')
+parser.add_argument('--lr', type=float, default=0.01,
+                    help='initial learning rate')
+parser.add_argument('--optimizer', type=str, default='sgd',
+                    help='the optimizer type')
+parser.add_argument('--mom', type=float, default=0.0,
+                    help='momentum for sgd')
+parser.add_argument('--wd', type=float, default=0.00001,
+                    help='weight decay for sgd')
+parser.add_argument('--batch-size', type=int, default=32,
+                    help='the batch size.')
+parser.add_argument('--disp-batches', type=int, default=50,
+                    help='show progress for every n batches')
+parser.add_argument('--backend', type=str, default='open',
+                    help='select the RNN backend implementation')
+
+
+def tokenize_text(fname, vocab=None, invalid_label=-1, start_label=0):
+    """
+    Tokenize the text file.
+    """
+    # pylint: disable=redefined-outer-name
+    assert os.path.isfile(fname), \
+        "Cannot find the dataset, please make sure that you have " \
+        "run the download script in the dataset directory."
+    lines = open(fname).readlines()
+    lines = [filter(None, i.split(' ')) for i in lines]
+    sentences, vocab = mx.rnn.encode_sentences(sentences=lines, vocab=vocab,
+                                               start_label=start_label,
+                                               invalid_label=invalid_label)
+
+    return sentences, vocab
+
+
+if __name__ == '__main__':
+    import logging
+
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)-15s %(message)s')
+
+    args = parser.parse_args()
+
+    buckets = [10, 20, 30, 40, 50, 60]
+
+    start_label, invalid_label = 1, 0
+
+    train_sentences, vocab = tokenize_text("./dataset/sherlockholmes/sherlockholmes.train.txt",
+                                           start_label=start_label,
+                                           invalid_label=invalid_label)
+    valid_sentences, _ = tokenize_text("./dataset/sherlockholmes/sherlockholmes.valid.txt", vocab=vocab,
+                                       start_label=start_label,
+                                       invalid_label=invalid_label)
+
+    backend = args.backend
+
+    data_layout = 'TN' if backend == 'cudnn' else 'NT'
+    data_train = mx.rnn.BucketSentenceIter(sentences=train_sentences, batch_size=args.batch_size,
+                                           buckets=buckets, invalid_label=invalid_label,
+                                           layout=data_layout)
+    data_valid = mx.rnn.BucketSentenceIter(sentences=valid_sentences, batch_size=args.batch_size,
+                                           buckets=buckets, invalid_label=invalid_label,
+                                           layout=data_layout)
+
+    rnn = mx.rnn.BaseRNNCell()
+
+    if backend == 'default':
+        rnn = mx.rnn.SequentialRNNCell()
+
+        for i in range(args.num_layers):
+            rnn.add(mx.rnn.LSTMCell(num_hidden=args.num_hidden, prefix='lstm_l%d_'%i))
+    elif backend == 'cudnn':
+        rnn = mx.rnn.FusedRNNCell(num_hidden=args.num_hidden,
+                                   num_layers=args.num_layers,
+                                   prefix='lstm_')
+    elif backend == 'open':
+        rnn = mx.rnn.OpenLSTMRNNCell(num_hidden=args.num_hidden,
+                                     num_layers=args.num_layers,
+                                     prefix='lstm_')
+    else:
+        assert 0, "Invalid backend argument. " \
+                  "Valid ones are default/cudnn/open."
+
+    def sym_gen(seq_len):
+        """
+        Symbol generator for sequences of different length.
+        """
+        data = mx.sym.Variable('data')
+        label = mx.sym.Variable('softmax_label')
+        embed = mx.sym.Embedding(data=data, input_dim=len(vocab),
+                                 output_dim=args.num_embed, name='embed')
+
+        rnn.reset()
+
+        if backend == 'default':
+            output, _ = rnn.unroll(length=seq_len,
+                                   inputs=embed,
+                                   layout='NTC',
+                                   merge_outputs=True)
+        elif backend == 'cudnn':
+            output, _ = rnn.unroll(length=seq_len,
+                                   inputs=embed,
+                                   layout='TNC',
+                                   merge_outputs=True)
+        elif backend == 'open':
+            output, _ = rnn.unroll(length=seq_len,
+                                   inputs=embed)
+        else:
+            assert 0, "Invalid backend argument. " \
+                      "Valid ones are default/cudnn/open."
+
+        pred = mx.sym.Reshape(output, shape=(-1, args.num_hidden))
+        pred = mx.sym.FullyConnected(data=pred, num_hidden=len(vocab), name='pred')
+
+        label = mx.sym.Reshape(label, shape=(-1,))
+        pred = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax')
+
+        return pred, ('data',), ('softmax_label',)
+
+    model = mx.mod.BucketingModule(sym_gen=sym_gen,
+                                   default_bucket_key=data_train.default_bucket_key,
+                                   context=mx.gpu())
+
+    logging.info("MXNet will be training using the RNN backend: %s.", backend)
+
+    try:
+        import mxboard
+    except ImportError:
+        logging.error("Please install mxboard using `sudo -H pip install mxboard`.")
+
+    summary_writer = mxboard.SummaryWriter('./log')
+
+    model.fit(
+        train_data=data_train,
+        eval_data=data_valid,
+        eval_metric=mx.metric.Perplexity(invalid_label),
+        kvstore=args.kv_store,
+        optimizer=args.optimizer,
+        optimizer_params={'learning_rate': args.lr,
+                          'momentum': args.mom, 'wd': args.wd},
+        initializer=mx.init.Xavier(factor_type="in", magnitude=2.34),
+        num_epoch=args.num_epochs,
+        batch_end_callback=mx.callback.TensorboardSpeedometer(summary_writer=summary_writer,
+                                                              batch_size=args.batch_size,
+                                                              frequent=args.disp_batches,
+                                                              auto_reset=False),
+        eval_end_callback=mx.callback.TensorboardLogValidationMetricsCallback(summary_writer=summary_writer))
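
For context on what the bucketing iterator above does with variable-length sentences: each tokenized sentence is placed into the smallest bucket that fits it and padded up to that bucket's length with `invalid_label`. A small sketch of that assignment, using the same bucket sizes as the script and a hypothetical 17-token sentence:

    buckets = [10, 20, 30, 40, 50, 60]
    invalid_label = 0

    sentence = list(range(1, 18))                            # a made-up 17-token sentence
    bucket = min(b for b in buckets if b >= len(sentence))   # smallest bucket that fits -> 20
    padded = sentence + [invalid_label] * (bucket - len(sentence))
    print(bucket, padded)

`BucketingModule` then calls `sym_gen(bucket)` once per bucket key, so only a handful of unrolled graphs are built instead of one per sentence length.
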
diff --git a/example/rnn-backends/word_lm/data.py b/example/rnn-backends/word_lm/data.py
new file mode 100644
index 00000000000..8cad94959e2
--- /dev/null
+++ b/example/rnn-backends/word_lm/data.py
@@ -0,0 +1,163 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Module: data
+
+Description: The `data` module is responsible for loading the (Sherlockholmes/Wikitext-2) dataset.
+"""
+
+import os
+import numpy as np
+import mxnet as mx
+
+class Dictionary(object):
+    """
+    Dictionary is used to store the mappings between words and indices.
+    """
+    def __init__(self):
+        self.word2idx = {}
+        self.idx2word = []
+        self.word_count = []
+
+    def add_word(self, word):
+        if word not in self.word2idx:
+            self.idx2word.append(word)
+            self.word2idx[word] = len(self.idx2word) - 1
+            self.word_count.append(0)
+        index = self.word2idx[word]
+        self.word_count[index] += 1
+        return index
+
+    def __len__(self):
+        return len(self.idx2word)
+
+class Corpus(object):
+    """
+    Corpus stores the tokenized dataset files.
+    """
+    def __init__(self, path, name):
+        self.dictionary = Dictionary()
+        if name == 'sherlockholmes':
+            self.train = self.tokenize(path + 'sherlockholmes/sherlockholmes.train.txt')
+            self.valid = self.tokenize(path + 'sherlockholmes/sherlockholmes.valid.txt')
+            self.test = self.tokenize(path + 'sherlockholmes/sherlockholmes.test.txt')
+        elif name == 'wikitext-2':
+            self.train = self.tokenize(path + 'wikitext-2/wiki.train.tokens')
+            self.valid = self.tokenize(path + 'wikitext-2/wiki.valid.tokens')
+            self.test = self.tokenize(path + 'wikitext-2/wiki.test.tokens')
+        else:
+            assert 0, "Invalid dataset name %s. " \
+                      "Valid ones are sherlockholmes/wikitext-2." % name
+
+    def tokenize(self, path):
+        """Tokenizes a text file."""
+        assert os.path.exists(path)
+        # Add words to the dictionary
+        with open(path, 'r') as f:
+            tokens = 0
+            for line in f:
+                words = line.split() + ['<eos>']
+                tokens += len(words)
+                for word in words:
+                    self.dictionary.add_word(word)
+
+        # Tokenize file content
+        with open(path, 'r') as f:
+            ids = np.zeros((tokens,), dtype='int32')
+            token = 0
+            for line in f:
+                words = line.split() + ['<eos>']
+                for word in words:
+                    ids[token] = self.dictionary.word2idx[word]
+                    token += 1
+
+        return mx.nd.array(ids, dtype='int32')
+
+def batchify(data, batch_size, layout):
+    """Reshape data into (batch_size, num_examples)"""
+    nbatch = data.shape[0] // batch_size
+    data = data[:nbatch*batch_size]
+
+    if layout == 'NT':
+        data = data.reshape((batch_size, nbatch))
+    else:
+        data = data.reshape((batch_size, nbatch)).T
+
+    return data
+
+class CorpusIter(mx.io.DataIter):
+    """
+    CorpusIter iterates through the corpus and returns a batch of sequences at a time.
+    """
+    def __init__(self, source, batch_size, bptt, layout='NT'):
+        super(CorpusIter, self).__init__()
+        self._batch_size = batch_size
+        self._bptt = bptt
+        self._layout = layout
+
+        if layout == 'NT':
+            self.provide_data = [mx.io.DataDesc(name='data', shape=(batch_size, bptt),
+                                                dtype=np.int32, layout=layout)]
+            self.provide_label = [mx.io.DataDesc(name='label', shape=(batch_size, bptt),
+                                                 dtype=np.float32, layout=layout)]
+        elif layout == 'TN': # CuDNN implementation expects time-major layout.
+            self.provide_data = [mx.io.DataDesc(name='data', shape=(bptt, batch_size),
+                                                dtype=np.int32, layout=layout)]
+            self.provide_label = [mx.io.DataDesc(name='label', shape=(bptt, batch_size),
+                                                 dtype=np.float32, layout=layout)]
+        else:
+            assert 0, "Invalid data layout argument. Valid ones are NT/TN."
+
+        self._index = 0
+        self._source = batchify(source, batch_size, layout)
+
+    def iter_next(self):
+        """
+        Point next `data` and `label` to the next batch of sequence.
+        """
+        layout = self._layout
+        i = self._index
+        if i+self._bptt > self._source.shape[1 if layout == 'NT' else 0] - 1:
+            return False
+
+        if layout == 'NT':
+            self._next_data = self._source[:, i:i+self._bptt].astype(np.int32)
+            self._next_label = self._source[:, i+1:i+1+self._bptt].astype(np.float32)
+        else:
+            self._next_data = self._source[i:i+self._bptt].astype(np.int32)
+            self._next_label = self._source[i+1:i+1+self._bptt].astype(np.float32)
+
+        self._index += self._bptt
+        return True
+
+    def next(self):
+        if self.iter_next():
+            return mx.io.DataBatch(data=self.getdata(), label=self.getlabel())
+        else:
+            raise StopIteration
+
+    def reset(self):
+        self._index = 0
+        self._next_data = None
+        self._next_label = None
+
+    def getdata(self):
+        return [self._next_data]
+
+    def getlabel(self):
+        return [self._next_label]
diff --git a/example/rnn-backends/word_lm/dataset/download_sherlockholmes.sh b/example/rnn-backends/word_lm/dataset/download_sherlockholmes.sh
new file mode 100755
index 00000000000..01e8a3a434e
--- /dev/null
+++ b/example/rnn-backends/word_lm/dataset/download_sherlockholmes.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file downloads the Sherlock Holmes dataset.
+
+mkdir -p $(cd $(dirname $0) && pwd)/sherlockholmes
+
+cd $(cd $(dirname $0) && pwd)/sherlockholmes && \
+	wget https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.train.txt && \
+	wget https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.valid.txt && \
+	wget https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.test.txt
+
diff --git a/example/rnn-backends/word_lm/dataset/download_wikitext.sh b/example/rnn-backends/word_lm/dataset/download_wikitext.sh
new file mode 100644
index 00000000000..8e3a68bded9
--- /dev/null
+++ b/example/rnn-backends/word_lm/dataset/download_wikitext.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file downloads the Wikitext dataset.
+
+cd $(cd $(dirname $0) && pwd) && \
+	wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip && \
+	wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
+
+unzip wikitext-2-v1   && rm -f wikitext-2-v1.zip
+unzip wikitext-103-v1 && rm -f wikitext-103-v1.zip
+
diff --git a/example/rnn-backends/word_lm/model.py b/example/rnn-backends/word_lm/model.py
new file mode 100644
index 00000000000..f5b931a8a11
--- /dev/null
+++ b/example/rnn-backends/word_lm/model.py
@@ -0,0 +1,152 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Module: model
+
+Description: This file creates the computation graph for training purposes.
+"""
+
+import mxnet as mx
+
+
+def rnn(bptt, vocab_size, num_embed, nhid,
+        num_layers, dropout, batch_size, tied, backend):
+    """
+    Creates the computation graph for training.
+    """
+    # ==================================================================================================================
+    # Encoder
+    # ==================================================================================================================
+
+    data = mx.sym.Variable('data')
+    weight = mx.sym.Variable("encoder_weight", init=mx.init.Uniform(0.1))
+    embed = mx.sym.Embedding(data=data, weight=weight, input_dim=vocab_size,
+                             output_dim=num_embed, name='embed')
+
+    outputs = mx.sym.Dropout(embed, p=dropout)
+
+    # ==================================================================================================================
+    # RNN
+    # ==================================================================================================================
+
+    # stacked rnn layers
+
+    # Given below is the original source code, which we argue makes poor use of the CuDNN RNN,
+    # as there is no need to slice each layer apart; please refer to the open issue on GitHub at
+    #     https://github.com/apache/incubator-mxnet/issues/10304
+    # The throughput measurements reported by the MXNet speedometer showed an increase of
+    # roughly 10~20% in throughput once all the layers were fused.
+    # The original implementation also mixed up the order of the cell and hidden states:
+    # according to the state order specified in rnn_cell.py (+L682),
+    # the cell state should appear AFTER the hidden state in FusedRNNCell. However, this has zero
+    # effect on the final training results because both states are zero-initialized.
+
+    states = []
+    state_names = []
+
+    # for i in range(num_layers):
+    #     prefix = 'lstm_l%d_' % i
+    #     cell = mx.rnn.FusedRNNCell(num_hidden=nhid, prefix=prefix, get_next_state=True,
+    #                                forget_bias=0.0, dropout=dropout)
+    #     state_shape = (1, batch_size, nhid)
+    #     begin_cell_state_name = prefix + 'cell'
+    #     begin_hidden_state_name = prefix + 'hidden'
+    #     begin_cell_state = mx.sym.var(begin_cell_state_name, shape=state_shape)
+    #     begin_hidden_state = mx.sym.var(begin_hidden_state_name, shape=state_shape)
+    #     state_names += [begin_cell_state_name, begin_hidden_state_name]
+    #     outputs, next_states = cell.unroll(bptt, inputs=outputs,
+    #                                        begin_state=[begin_cell_state, begin_hidden_state],
+    #                                        merge_outputs=True, layout='TNC')
+    #     outputs = mx.sym.Dropout(outputs, p=dropout)
+    #     states += next_states
+
+    prefix = 'lstm_'
+
+    if backend == 'default':
+        cell = mx.rnn.FusedRNNCell(num_hidden=nhid, num_layers=num_layers,
+                                   get_next_state=True, forget_bias=0.0,
+                                   dropout=dropout, prefix=prefix).unfuse()
+        begin_state = []
+
+        for layer_idx in range(num_layers):
+            layer_prefix = '%sl%d_' % (prefix, layer_idx)
+
+            hidden_state_name, cell_state_name = layer_prefix + 'hidden', layer_prefix + 'cell'
+            state_names += [hidden_state_name, cell_state_name]
+            begin_hidden_state = mx.sym.Variable(name=hidden_state_name, shape=(batch_size, nhid))
+            begin_cell_state = mx.sym.Variable(name=cell_state_name, shape=(batch_size, nhid))
+            begin_state += [begin_hidden_state, begin_cell_state]
+
+        outputs, states = cell.unroll(length=bptt, inputs=outputs,
+                                      begin_state=begin_state,
+                                      merge_outputs=True, layout='NTC')
+
+    elif backend == 'cudnn':
+        cell = mx.rnn.FusedRNNCell(num_hidden=nhid, num_layers=num_layers,
+                                   get_next_state=True, forget_bias=0.0,
+                                   dropout=dropout, prefix=prefix)
+        hidden_state_name, cell_state_name = prefix + 'hidden', prefix + 'cell'
+        state_names = [hidden_state_name, cell_state_name]
+        begin_hidden_state = mx.sym.Variable(name=hidden_state_name, shape=(num_layers, batch_size, nhid))
+        begin_cell_state = mx.sym.Variable(name=cell_state_name, shape=(num_layers, batch_size, nhid))
+        outputs, states = cell.unroll(length=bptt, inputs=outputs,
+                                      begin_state=[begin_hidden_state,
+                                                   begin_cell_state],
+                                      merge_outputs=True, layout='TNC')
+    elif backend == 'open':
+        cell = mx.rnn.OpenLSTMRNNCell(num_hidden=nhid, num_layers=num_layers,
+                                      get_next_state=True, forget_bias=0.0,
+                                      dropout=dropout, prefix=prefix)
+        hidden_state_name, cell_state_name = prefix + 'hidden', prefix + 'cell'
+        state_names = [hidden_state_name, cell_state_name]
+        begin_hidden_state = mx.sym.Variable(name=hidden_state_name, shape=(num_layers, batch_size, nhid))
+        begin_cell_state = mx.sym.Variable(name=cell_state_name, shape=(num_layers, batch_size, nhid))
+        outputs, states = cell.unroll(length=bptt, inputs=outputs,
+                                      begin_state=[begin_hidden_state,
+                                                   begin_cell_state])
+    else:
+        assert 0, "Invalid backend argument. " \
+                  "Valid ones are default/cudnn/open."
+
+    outputs = mx.sym.Dropout(outputs, p=dropout)
+
+    # ==================================================================================================================
+    # Decoder
+    # ==================================================================================================================
+
+    pred = mx.sym.Reshape(outputs, shape=(-1, nhid))
+    if tied:
+        assert(nhid == num_embed), \
+               "the number of hidden units and the embedding size must batch for weight tying"
+        pred = mx.sym.FullyConnected(data=pred, weight=weight,
+                                     num_hidden=vocab_size, name='pred')
+    else:
+        pred = mx.sym.FullyConnected(data=pred, num_hidden=vocab_size, name='pred')
+    pred = mx.sym.Reshape(pred, shape=(-1, vocab_size))
+
+    return pred, [mx.sym.stop_gradient(s) for s in states], state_names
+
+
+def softmax_ce_loss(pred):
+    # softmax cross-entropy loss
+    label = mx.sym.Variable('label')
+    label = mx.sym.Reshape(label, shape=(-1,))
+    logits = mx.sym.log_softmax(pred, axis=-1)
+    loss = -mx.sym.pick(logits, label, axis=-1, keepdims=True)
+    loss = mx.sym.mean(loss, axis=0, exclude=True)
+    return mx.sym.make_loss(loss, name='nll')
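
To make the symbolic loss above more concrete, here is a hedged NDArray sketch of the same negative log-likelihood computation with invented logits and labels (the symbolic version keeps a per-token loss via `mean(..., axis=0, exclude=True)`, whereas this sketch collapses everything to one scalar for brevity):

    import mxnet as mx

    # Hypothetical predictions for 4 tokens over a 5-word vocabulary, plus their true labels.
    pred = mx.nd.random.uniform(shape=(4, 5))
    label = mx.nd.array([0, 3, 1, 4])

    logits = mx.nd.log_softmax(pred, axis=-1)                  # log P(word | context)
    nll = -mx.nd.pick(logits, label, axis=-1, keepdims=True)   # negative log-prob of the true word
    loss = mx.nd.mean(nll)                                     # collapsed to a scalar here
    print(loss.asscalar())
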
diff --git a/example/rnn-backends/word_lm/module.py b/example/rnn-backends/word_lm/module.py
new file mode 100644
index 00000000000..be69bc67cb8
--- /dev/null
+++ b/example/rnn-backends/word_lm/module.py
@@ -0,0 +1,141 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Module: module
+
+Description: CustomStatefulModule records the hidden-state values after every training iteration and
+restores those values before the start of the next one. This behaviour is what distinguishes it from
+the default MXNet training module.
+"""
+
+import mxnet as mx
+
+class CustomStatefulModule():
+    """CustomStatefulModule is a module that takes a custom loss symbol and state symbols.
+    The custom loss is typically composed by `mx.sym.make_loss` or `mx.sym.MakeLoss`.
+    The states listed in `state_names` will be carried between iterations.
+
+    Parameters
+    ----------
+    loss : Symbol
+        The custom loss symbol
+    states: list of Symbol
+        The symbols of next states
+    state_names : list of str
+        states are similar to data and label, but not provided by data iterator.
+        Instead they are initialized to `initial_states` and can be carried between iterations.
+    data_names : list of str
+        Defaults to `('data')` for a typical model used in image classification.
+    label_names : list of str
+        Defaults to `('label',)`, matching the label name produced by the
+        CorpusIter used in this example.
+    logger : Logger
+        Defaults to `logging`.
+    context : Context or list of Context
+        Defaults to ``mx.cpu()``.
+    initial_states: float or list of NDArray
+        Defaults to 0.0.
+    """
+    def __init__(self, loss, states, state_names, data_names=('data',), label_names=('label',),
+                 context=mx.cpu(), initial_states=0.0, **kwargs):
+        if isinstance(states, mx.symbol.Symbol):
+            states = [states]
+        self._net = mx.sym.Group(states + [loss])
+        self._next_states = initial_states
+        self._module = mx.module.Module(self._net, data_names=data_names, label_names=label_names,
+                                        context=context, state_names=state_names, **kwargs)
+
+    def backward(self, out_grads=None):
+        """Backward computation.
+        """
+        self._module.backward(out_grads=out_grads)
+
+    def init_params(self, initializer=mx.init.Uniform(0.01), **kwargs):
+        """Initializes the parameters and auxiliary states.
+        """
+        self._module.init_params(initializer=initializer, **kwargs)
+
+    def init_optimizer(self, **kwargs):
+        """Installs and initializes optimizers, as well as initialize kvstore for
+           distributed training.
+        """
+        self._module.init_optimizer(**kwargs)
+
+    def bind(self, data_shapes, **kwargs):
+        """Binds the symbols to construct executors. This is necessary before one
+        can perform computation with the module.
+        """
+        self._module.bind(data_shapes, **kwargs)
+
+    def forward(self, data_batch, is_train=None, carry_state=True):
+        """Forward computation. States from previous forward computation are carried
+        to the current iteration if `carry_state` is set to `True`.
+        """
+        # propagate states from the previous iteration
+        if carry_state:
+            if isinstance(self._next_states, (int, float)):
+                self._module.set_states(value=self._next_states)
+            else:
+                self._module.set_states(states=self._next_states)
+        self._module.forward(data_batch, is_train=is_train)
+        outputs = self._module.get_outputs(merge_multi_context=False)
+        self._next_states = outputs[:-1]
+
+    def update(self, max_norm=None):
+        """Updates parameters according to the installed optimizer and the gradients computed
+        in the previous forward-backward batch. Gradients are clipped by their global norm
+        if `max_norm` is set.
+
+        Parameters
+        ----------
+        max_norm: float, optional
+            If set, all gradients are rescaled so that their global norm does not exceed `max_norm`.
+        """
+        if max_norm is not None:
+            self._clip_by_global_norm(max_norm)
+        self._module.update()
+
+    def _clip_by_global_norm(self, max_norm):
+        """Clips gradient norm.
+
+        The norm is computed over all gradients together, as if they were
+        concatenated into a single vector. Gradients are modified in-place.
+        The method was first introduced in
+        `[ICML2013] On the difficulty of training recurrent neural networks`.
+
+        Parameters
+        ----------
+        max_norm : float or int
+            The maximum clipping threshold of the gradient norm.
+
+        Returns
+        -------
+        norm_val : float
+            The computed norm of the gradients.
+        """
+        assert self._module.binded and self._module.params_initialized \
+               and self._module.optimizer_initialized
+        grad_array = []
+        for grad in self._module._exec_group.grad_arrays:
+            grad_array += grad
+        return mx.gluon.utils.clip_global_norm(grad_array, max_norm)
+
+    def get_loss(self):
+        """Gets the output loss of the previous forward computation.
+        """
+        return self._module.get_outputs(merge_multi_context=False)[-1]
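
The global-norm clipping performed by `_clip_by_global_norm` treats all gradients as one concatenated vector and rescales them when that vector's norm exceeds `max_norm`. A small sketch of the same rule with invented gradient arrays:

    import math
    import mxnet as mx

    # Two hypothetical gradient arrays, standing in for an executor's grad_arrays.
    grads = [mx.nd.array([3.0, 4.0]), mx.nd.array([0.0, 12.0])]
    max_norm = 5.0

    # Global norm over all gradients: sqrt(9 + 16 + 144) = 13.
    global_norm = math.sqrt(sum(float((g ** 2).sum().asscalar()) for g in grads))
    if global_norm > max_norm:
        for g in grads:
            g[:] *= max_norm / global_norm   # rescale in place; the new global norm equals max_norm

    print(global_norm, [g.asnumpy() for g in grads])
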
diff --git a/example/rnn-backends/word_lm/train.py b/example/rnn-backends/word_lm/train.py
new file mode 100644
index 00000000000..67b450acfa4
--- /dev/null
+++ b/example/rnn-backends/word_lm/train.py
@@ -0,0 +1,249 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Module: train
+
+Description:
+This file compares different RNN implementations on the Sherlock Holmes/Wikitext-2 dataset
+using word-level language modeling.
+Note that this file is not exactly the same as the original source file.
+Specifically, the following changes have been made:
+    (1) fixed the UserWarning on inconsistent batch_size
+    (2) ported MXNet Default and OpenLSTMRNN implementation
+    (3) replaced MXNet Speedometer with TensorboardSpeedometer
+    (4) added MXNet profiler for detailed performance analysis
+"""
+
+import math
+import argparse
+
+import mxnet as mx
+
+# pylint: disable=wildcard-import
+# pylint: disable=ungrouped-imports
+from model import *
+from module import *
+from data import Corpus, CorpusIter
+from mxnet.model import BatchEndParam
+
+parser = argparse.ArgumentParser(description='PennTreeBank LSTM Language Model')
+parser.add_argument('--dataset-dir', type=str, default='./dataset/',
+                    help='location of the data corpus')
+parser.add_argument('--dataset-name', type=str, default='sherlockholmes',
+                    help='name of the data corpus (sherlockholmes/wikitext-2)')
+parser.add_argument('--emsize', type=int, default=650,
+                    help='size of word embeddings')
+parser.add_argument('--nhid', type=int, default=650,
+                    help='number of hidden units per layer')
+parser.add_argument('--nlayers', type=int, default=2,
+                    help='number of layers')
+parser.add_argument('--lr', type=float, default=1.0,
+                    help='initial learning rate')
+parser.add_argument('--clip', type=float, default=0.2,
+                    help='gradient clipping by global norm')
+parser.add_argument('--epochs', type=int, default=40,
+                    help='upper epoch limit')
+parser.add_argument('--batch_size', type=int, default=32,
+                    help='batch size')
+parser.add_argument('--dropout', type=float, default=0.5,
+                    help='dropout applied to layers (0 = no dropout)')
+parser.add_argument('--tied', action='store_true',
+                    help='tie the word embedding and softmax weights')
+parser.add_argument('--bptt', type=int, default=35,
+                    help='sequence length')
+parser.add_argument('--log-interval', type=int, default=200,
+                    help='report interval')
+parser.add_argument('--backend', type=str, default='open',
+                    help='select the RNN backend implementation')
+parser.add_argument('--profiling', action='store_true',
+                    help='whether to enable the MXNet profiling')
+parser.add_argument('--profiling-start', type=int, default=200,
+                    help='the start iteration of profiling')
+parser.add_argument('--profiling-end', type=int, default=201,
+                    help='the end iteration of profiling')
+parser.add_argument('--profiler-output-fname', type=str, default='profiler_output.json',
+                    help='filename of profiler output')
+args = parser.parse_args()
+
+best_loss = float('inf')
+
+
+def evaluate(valid_module, data_iter,
+             epoch, mode,
+             bptt, batch_size,
+             summary_writer):
+             # pylint: disable=redefined-outer-name
+    """
+    Evaluate the validation scores and store to summary_writer.
+    """
+    total_loss = 0.0
+    nbatch = 0
+    for batch in data_iter:
+        valid_module.forward(batch, is_train=False)
+        outputs = valid_module.get_loss()
+        total_loss += mx.nd.sum(outputs[0]).asscalar()
+        nbatch += 1
+    data_iter.reset()
+    loss = total_loss / bptt / batch_size / nbatch
+    perplexity = math.exp(loss)
+    logging.info('Iter[%d] %5s Loss:\t%.7f, Perplexity: %.7f' % \
+                 (epoch, mode, loss, perplexity))
+    summary_writer.add_scalar(tag="%s-Loss" % mode,
+                              value=loss,
+                              global_step=epoch)
+    summary_writer.add_scalar(tag="%s-Perplexity" % mode,
+                              value=perplexity,
+                              global_step=epoch)
+
+    return loss
+
+
+if __name__ == '__main__':
+    import logging
+
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)-15s %(message)s')
+
+    args = parser.parse_args()
+
+    logging.info(args)
+
+    batch_size = args.batch_size
+    bptt = args.bptt
+    backend = args.backend
+
+    # ==================================================================================================================
+    # Data Preparation
+    # ==================================================================================================================
+
+    corpus = Corpus(args.dataset_dir, args.dataset_name)
+    ntokens = len(corpus.dictionary)
+
+    data_layout = 'TN' if backend == 'cudnn' else 'NT'
+
+    train_data = CorpusIter(source=corpus.train, batch_size=batch_size, bptt=bptt, layout=data_layout)
+    valid_data = CorpusIter(source=corpus.valid, batch_size=batch_size, bptt=bptt, layout=data_layout)
+    test_data = CorpusIter(source=corpus.test, batch_size=batch_size, bptt=bptt, layout=data_layout)
+
+    # ==================================================================================================================
+    # Training Model
+    # ==================================================================================================================
+
+    pred, states, state_names = rnn(bptt=bptt, vocab_size=ntokens, num_embed=args.emsize, nhid=args.nhid,
+                                    num_layers=args.nlayers, dropout=args.dropout,
+                                    batch_size=batch_size, tied=args.tied, backend=backend)
+    loss = softmax_ce_loss(pred)
+
+    # ==================================================================================================================
+    # Training Module
+    # ==================================================================================================================
+
+    module = CustomStatefulModule(loss, states, state_names=state_names, context=mx.gpu())
+    module.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label)
+    module.init_params(initializer=mx.init.Xavier())
+    optimizer = mx.optimizer.create('sgd',
+                                    learning_rate=args.lr,
+                                    rescale_grad=1.0/batch_size)
+    module.init_optimizer(optimizer=optimizer)
+
+    # ==================================================================================================================
+    # Monitor
+    # ==================================================================================================================
+
+    try:
+        import mxboard
+    except ImportError:
+        logging.error("Please install mxboard using `sudo -H pip install mxboard`.")
+
+    summary_writer = mxboard.SummaryWriter('./log')
+    speedometer = mx.callback.TensorboardSpeedometer(summary_writer=summary_writer,
+                                                     batch_size=batch_size,
+                                                     frequent=args.log_interval)
+    logging.info("MXNet will be training using the RNN backend: %s.", backend)
+
+    # ==================================================================================================================
+    # Profiling
+    # ==================================================================================================================
+
+    if args.profiling:
+        mx.profiler.profiler_set_config(mode='symbolic', filename='./log/%s' % args.profiler_output_fname)
+        # pylint: disable=logging-not-lazy
+        logging.info("Profiling has been enabled. MXNet will be profiling from iteration %s to %s." % \
+                     (args.profiling_start, args.profiling_end))
+        assert args.profiling_end > args.profiling_start, \
+            "Profiling start iteration must precede profiling end iteration."
+
+    # ==================================================================================================================
+    # Training
+    # ==================================================================================================================
+
+    logging.info("Training started ... ")
+
+    global_step = 0
+
+    for epoch in range(args.epochs):
+        total_loss = 0.0
+        nbatch = 0
+        for batch in train_data:
+            # profiling
+            if args.profiling:
+                if global_step == args.profiling_start:
+                    mx.profiler.profiler_set_state('run')
+                if global_step == args.profiling_end:
+                    mx.profiler.profiler_set_state('stop')
+            # train
+            module.forward(batch)
+            module.backward()
+            module.update(max_norm=args.clip * bptt * batch_size)
+            # update metric
+            outputs = module.get_loss()
+            total_loss += mx.nd.sum(outputs[0]).asscalar()
+            speedometer_param = BatchEndParam(epoch=epoch, nbatch=nbatch,
+                                              eval_metric=None, locals=locals())
+            speedometer(speedometer_param)
+            if nbatch % args.log_interval == 0 and nbatch > 0:
+                loss = total_loss / bptt / batch_size / args.log_interval
+                perplexity = math.exp(loss)
+                # pylint: disable=logging-not-lazy
+                logging.info('Iter[%d] Batch [%d]\tLoss: %.7f,\tPerplexity: %.7f' % \
+                             (epoch, nbatch, loss, perplexity))
+                summary_writer.add_scalar(tag="Loss",
+                                          value=loss,
+                                          global_step=global_step)
+                summary_writer.add_scalar(tag="Perplexity",
+                                          value=perplexity,
+                                          global_step=global_step)
+                total_loss = 0.0
+            nbatch += 1
+            global_step += 1
+        # validation
+        valid_loss = evaluate(valid_module=module, data_iter=valid_data,
+                              epoch=epoch, mode='Valid',
+                              bptt=bptt, batch_size=batch_size,
+                              summary_writer=summary_writer)
+        if valid_loss < best_loss:
+            best_loss = valid_loss
+            # test
+            test_loss = evaluate(valid_module=module, data_iter=test_data,
+                                 epoch=epoch, mode='Test',
+                                 bptt=bptt, batch_size=batch_size,
+                                 summary_writer=summary_writer)
+        else:
+            optimizer.lr *= 0.25
+        train_data.reset()
+    logging.info("Training completed.")
diff --git a/python/mxnet/callback.py b/python/mxnet/callback.py
index e1c1714445d..90c7682505c 100644
--- a/python/mxnet/callback.py
+++ b/python/mxnet/callback.py
@@ -181,6 +181,75 @@ def __call__(self, param):
             self.tic = time.time()
 
 
+class TensorboardSpeedometer(object):
+    """Logs training speed and evaluation metrics periodically,
+    and at the same time, dump the results using tensorboard.
+    Parameters
+    ----------
+    summary_writer : Tensorboard Summary Writer
+    batch_size : int
+        Batch size of data.
+    frequent : int
+        Specifies how frequently training speed and evaluation metrics
+        must be logged. Default behavior is to log once every 50 batches.
+    auto_reset : bool
+        Reset the evaluation metrics after each log.
+    """
+
+    def __init__(self, summary_writer, batch_size, frequent=50, auto_reset=True):
+        self.summary_writer = summary_writer
+
+        self.batch_size = batch_size
+        self.frequent = frequent
+        self.init = False
+        self.tic = 0
+        self.last_count = 0
+        self.auto_reset = auto_reset
+
+        self.global_step = 0
+
+    def __call__(self, param):
+        """Callback to show speed."""
+        count = param.nbatch
+        if self.last_count > count:
+            self.init = False
+        self.last_count = count
+
+        if self.init:
+            if count % self.frequent == 0:
+                speed = self.frequent * self.batch_size / (time.time() - self.tic)
+
+                if param.eval_metric is not None:
+                    name_value = param.eval_metric.get_name_value()
+                    if self.auto_reset:
+                        param.eval_metric.reset()
+
+                    for name, value in dict(name_value).items():
+                        self.summary_writer.add_scalar(tag=name,
+                                                       value=value,
+                                                       global_step=self.global_step)
+
+                    msg = 'Global Step[%d] Epoch[%d] Batch [%d]\tSpeed: %.2f samples/sec'
+                    msg += '\t%s=%f' * len(name_value)
+                    logging.info(msg, self.global_step, param.epoch, count, speed,
+                                 *sum(name_value, ()))
+
+                else:
+                    logging.info("Iter[%d] Batch [%d]\tSpeed: %.2f samples/sec",
+                                 param.epoch, count, speed)
+
+                self.summary_writer.add_scalar(tag='Speed',
+                                               value=speed,
+                                               global_step=self.global_step)
+
+                self.tic = time.time()
+        else:
+            self.init = True
+            self.tic = time.time()
+
+        self.global_step += 1
+
+
 class ProgressBar(object):
     """Displays a progress bar, indicating the percentage of batches processed within each epoch.
 
@@ -220,3 +289,28 @@ def __call__(self, param):
         name_value = param.eval_metric.get_name_value()
         for name, value in name_value:
             logging.info('Epoch[%d] Validation-%s=%f', param.epoch, name, value)
+
+
+class TensorboardLogValidationMetricsCallback(object):
+    """Logs the eval metrics at the end of an epoch using tensorboard.
+    Parameters
+    ----------
+    summary_writer : Tensorboard Summary Writer
+    """
+
+    def __init__(self, summary_writer, prefix='validation-'):
+        self.summary_writer = summary_writer
+        self.prefix = prefix
+        self.global_step = 0
+
+    def __call__(self, param):
+        if not param.eval_metric:
+            return
+
+        name_value = param.eval_metric.get_name_value()
+        for name, value in name_value:
+            self.summary_writer.add_scalar(tag=self.prefix + name,
+                                           value=value,
+                                           global_step=self.global_step)
+
+        self.global_step += 1
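
Both callbacks above consume the standard `BatchEndParam` namedtuple that `Module.fit` builds after every batch. A hedged sketch of driving the speedometer by hand, assuming the callbacks added by this patch are importable as `mx.callback.TensorboardSpeedometer` and that `mxboard` is installed:

    import mxnet as mx
    from mxnet.model import BatchEndParam
    from mxboard import SummaryWriter

    writer = SummaryWriter(logdir='./log')
    speedometer = mx.callback.TensorboardSpeedometer(summary_writer=writer,
                                                     batch_size=32, frequent=50)

    # Module.fit constructs a BatchEndParam after each batch and hands it to every
    # batch_end_callback; the speedometer logs throughput every `frequent` batches.
    for nbatch in range(101):
        param = BatchEndParam(epoch=0, nbatch=nbatch, eval_metric=None, locals=locals())
        speedometer(param)
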
diff --git a/python/mxnet/initializer.py b/python/mxnet/initializer.py
index 8ae729f3ccf..abb91b8ca8b 100755
--- a/python/mxnet/initializer.py
+++ b/python/mxnet/initializer.py
@@ -161,7 +161,7 @@ def _legacy_init(self, name, arr):
         Parameters
         ----------
         name : str
-            Name of corresponding NDArray.
+            Name of corrosponding NDArray.
 
         arr : NDArray
             NDArray to be initialized.
@@ -424,14 +424,12 @@ def _init_weight(self, _, arr):
 
 @register
 class Constant(Initializer):
-    """Initializes the weights to a given value.
-    The value passed in can be a scalar or a NDarray that matches the shape
-    of the parameter to be set.
+    """Initializes the weights to a scalar value.
 
     Parameters
     ----------
-    value : float, NDArray
-        Value to set.
+    value : float
+        Fill value.
     """
     def __init__(self, value):
         super(Constant, self).__init__(value=value)
@@ -653,7 +651,7 @@ def _init_weight(self, _, arr):
 
 @register
 class LSTMBias(Initializer):
-    """Initialize all biases of an LSTMCell to 0.0 except for
+    """Initialize all bias of an LSTMCell to 0.0 except for
     the forget gate whose bias is set to custom value.
 
     Parameters
@@ -726,3 +724,197 @@ def _init_weight(self, desc, arr): # pylint: disable=arguments-differ
                 self._init(arg_desc, args[name])
 
         arr[:] = cell.pack_weights(args)['parameters']
+
+@register
+class OpenLSTMRNNiWInit(Initializer):
+    """
+    OpenLSTMRNN Input->Hidden Weight Initializer
+    """
+    def __init__(self, prefix, num_hidden, num_layers):
+        super(OpenLSTMRNNiWInit, self).__init__(prefix=prefix,
+                                                num_hidden=num_hidden,
+                                                num_layers=num_layers)
+        self._prefix = prefix
+        self._num_hidden = num_hidden
+        self._num_layers = num_layers
+
+    def _slice_and_init(self, desc, arr):
+
+        num_hidden = self._num_hidden
+        num_layers = self._num_layers
+
+        from .rnn import rnn_cell
+
+        rnn_cell = rnn_cell.OpenLSTMRNNCell(num_hidden=num_hidden,
+                                            num_layers=num_layers)
+
+        gate_names = rnn_cell._gate_names
+
+        num_input = arr.size // rnn_cell._num_gates // num_hidden - \
+                    num_hidden * (num_layers - 1)
+
+        iw_idx = 0
+
+        for layer in range(num_layers):
+            for direction in ['l']:
+                for gate in gate_names:
+                    iw_name = '%s%s%d_i2h%s_weight' % (self._prefix, direction, layer, gate)
+                    arg_desc = InitDesc(iw_name, global_init=desc.global_init)
+
+                    if layer > 0:
+                        desc.global_init(arg_desc,
+                                         arr[iw_idx:iw_idx+num_hidden*num_hidden]
+                                         .reshape((num_hidden, num_hidden)))
+                        iw_idx += num_hidden*num_hidden
+                    else:
+                        desc.global_init(arg_desc,
+                                         arr[iw_idx:iw_idx+num_hidden*num_input]
+                                         .reshape((num_hidden, num_input)))
+                        iw_idx += num_hidden*num_input
+
+        assert iw_idx == arr.size, "Size of accumulated weights/biases (%d) does not match " \
+            "what was expected (%d)." % (iw_idx, arr.size)
+
+    def _init_weight(self, desc, arr): # pylint: disable=arguments-differ
+        self._slice_and_init(desc=desc, arr=arr)
+
+@register
+class OpenLSTMRNNhWInit(Initializer):
+    """
+    OpenLSTMRNN Hidden->Hidden Weight Initializer
+    """
+    def __init__(self, prefix, num_hidden, num_layers):
+        super(OpenLSTMRNNhWInit, self).__init__(prefix=prefix,
+                                                num_hidden=num_hidden,
+                                                num_layers=num_layers)
+        self._prefix = prefix
+        self._num_hidden = num_hidden
+        self._num_layers = num_layers
+
+    def _slice_and_init(self, desc, arr):
+
+        num_hidden = self._num_hidden
+        num_layers = self._num_layers
+
+        from .rnn import rnn_cell
+
+        rnn_cell = rnn_cell.OpenLSTMRNNCell(num_hidden=num_hidden,
+                                            num_layers=num_layers)
+
+        gate_names = rnn_cell._gate_names
+
+        hw_idx = 0
+
+        for layer in range(num_layers):
+            for direction in ['l']:
+                for gate in gate_names:
+                    hw_name = '%s%s%d_h2h%s_weight' % (self._prefix, direction, layer, gate)
+                    arg_desc = InitDesc(hw_name, global_init=desc.global_init)
+
+                    desc.global_init(arg_desc,
+                                     arr[hw_idx:hw_idx+num_hidden*num_hidden]
+                                     .reshape((num_hidden, num_hidden)))
+                    hw_idx += num_hidden*num_hidden
+
+        assert hw_idx == arr.size, "Size of accumulated weights/biases (%d) does not match " \
+            "what was expected (%d)." % (hw_idx, arr.size)
+
+    def _init_weight(self, desc, arr): # pylint: disable=arguments-differ
+        self._slice_and_init(desc=desc, arr=arr)
+
+@register
+class OpenLSTMRNNiBInit(Initializer):
+    """
+    OpenLSTMRNN Input->Hidden Bias Initializer
+    """
+    def __init__(self, prefix, num_hidden, num_layers, forget_bias=1.0):
+        super(OpenLSTMRNNiBInit, self).__init__(prefix=prefix,
+                                                num_hidden=num_hidden,
+                                                num_layers=num_layers,
+                                                forget_bias=forget_bias)
+        self._prefix = prefix
+        self._num_hidden = num_hidden
+        self._num_layers = num_layers
+        self._forget_bias = forget_bias
+
+    def _slice_and_init(self, desc, arr):
+
+        num_hidden = self._num_hidden
+        num_layers = self._num_layers
+
+        from .rnn import rnn_cell
+
+        cell = rnn_cell.OpenLSTMRNNCell(num_hidden=num_hidden, num_layers=num_layers)
+
+        gate_names = cell._gate_names
+
+        ib_idx = 0
+
+        for layer in range(num_layers):
+            for direction in ['l']:
+                for gate in gate_names:
+                    ib_name = '%s%s%d_i2h%s_bias' % (self._prefix, direction, layer, gate)
+                    arg_desc = InitDesc(ib_name, global_init=desc.global_init)
+
+                    if gate == 'f':
+                        arr[ib_idx:ib_idx+num_hidden] = self._forget_bias
+                    else:
+                        # fall back to global initializer
+                        desc.global_init(arg_desc,
+                                         arr[ib_idx:ib_idx+num_hidden].reshape((num_hidden,)))
+                    ib_idx += num_hidden
+
+        assert ib_idx == arr.size, "Size of accumulated weights/biases (%d) does not " \
+            "match the expected size (%d)." % (ib_idx, arr.size)
+
+    def _init_weight(self, desc, arr): # pylint: disable=arguments-differ
+        self._slice_and_init(desc=desc, arr=arr)
+
+@register
+class OpenLSTMRNNhBInit(Initializer):
+    """
+    OpenLSTMRNN Hidden->Hidden Bias Initializer
+    """
+    def __init__(self, prefix, num_hidden, num_layers, forget_bias=1.0):
+        super(OpenLSTMRNNhBInit, self).__init__(prefix=prefix,
+                                                num_hidden=num_hidden,
+                                                num_layers=num_layers,
+                                                forget_bias=forget_bias)
+        self._prefix = prefix
+        self._num_hidden = num_hidden
+        self._num_layers = num_layers
+        self._forget_bias = forget_bias
+
+    def _slice_and_init(self, desc, arr):
+
+        num_hidden = self._num_hidden
+        num_layers = self._num_layers
+
+        from .rnn import rnn_cell
+
+        cell = rnn_cell.OpenLSTMRNNCell(num_hidden=num_hidden, num_layers=num_layers)
+
+        gate_names = cell._gate_names
+
+        hb_idx = 0
+
+        for layer in range(num_layers):
+            for direction in ['l']:
+                for gate in gate_names:
+                    hb_name = '%s%s%d_h2h%s_bias' % (self._prefix, direction, layer, gate)
+                    arg_desc = InitDesc(hb_name, global_init=desc.global_init)
+
+                    if gate == 'f':
+                        arr[hb_idx:hb_idx+num_hidden] = self._forget_bias
+                    else:
+                        # fall back to global initializer
+                        desc.global_init(arg_desc,
+                                         arr[hb_idx:hb_idx+num_hidden].reshape((num_hidden,)))
+                    hb_idx += num_hidden
+
+        assert hb_idx == arr.size, "Size of accumulated weights/biases (%d) does not " \
+            "match the expected size (%d)." % (hb_idx, arr.size)
+
+    def _init_weight(self, desc, arr): # pylint: disable=arguments-differ
+        self._slice_and_init(desc=desc, arr=arr)
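The four initializers above all carve one flat parameter vector into per-layer, per-gate blocks: for the i2h weights, layer 0 contributes four (num_hidden, num_input) blocks, while every later layer contributes four (num_hidden, num_hidden) blocks, which is why `num_input` can be recovered from the array size alone. The Python sketch below is illustrative only and not part of this diff; `infer_num_input` is a hypothetical helper that mirrors the size arithmetic used in `_slice_and_init`.

def infer_num_input(iw_size, num_hidden, num_layers, num_gates=4):
    """Recover the input (embedding) dimension from the flattened i2h weight size."""
    # iw_size == num_gates * num_hidden * num_input
    #          + (num_layers - 1) * num_gates * num_hidden * num_hidden
    return iw_size // num_gates // num_hidden - num_hidden * (num_layers - 1)

# Example: num_hidden=200, num_layers=2, num_input=128
#   iw_size = 4*200*128 + 4*200*200 = 262400
#   infer_num_input(262400, 200, 2) == 128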
diff --git a/python/mxnet/rnn/rnn_cell.py b/python/mxnet/rnn/rnn_cell.py
index 3301102ba90..66dc3c4d4ee 100644
--- a/python/mxnet/rnn/rnn_cell.py
+++ b/python/mxnet/rnn/rnn_cell.py
@@ -745,6 +745,202 @@ def unfuse(self):
         return stack
 
 
+class OpenLSTMRNNCell(BaseRNNCell):
+    """Fusing RNN layers across time step into one kernel.
+    Improves speed and is flexible compared with cuDNN implementation.
+    Parameters
+    ----------
+    num_hidden : int
+        Number of units in output symbol.
+    num_layers : int, default 1
+        Number of layers in the cell.
+    dropout : float, default 0.
+        Fraction of the input that gets dropped out during training time.
+    get_next_state : bool, default False
+        Whether to return the states that can be used as starting states next time.
+    forget_bias : bias added to forget gate, default to 1.0.
+        Jozefowicz et al. 2015 recommends setting this to 1.0
+    prefix : str, default 'lstm_'
+        Prefix for names of layers
+        (this prefix is also used for names of weights if `params` is None
+        i.e. if `params` are being created and not reused)
+    params : RNNParams, default None
+        Container for weight sharing between cells. Created if None.
+    """
+    def __init__(self, num_hidden, num_layers=1, forget_bias=1.0,
+                 dropout=0., get_next_state=False, prefix=None, params=None):
+        if prefix is None:
+            prefix = 'lstm_'
+        super(OpenLSTMRNNCell, self).__init__(prefix=prefix, params=params)
+        self._num_hidden = num_hidden
+        self._num_layers = num_layers
+        self._dropout = dropout
+        self._get_next_state = get_next_state
+
+        self._iW = self.params.get('i2h_weight',
+                                   init=init.OpenLSTMRNNiWInit(prefix=prefix,
+                                                               num_hidden=num_hidden,
+                                                               num_layers=num_layers))
+        self._hW = self.params.get('h2h_weight',
+                                   init=init.OpenLSTMRNNhWInit(prefix=prefix,
+                                                               num_hidden=num_hidden,
+                                                               num_layers=num_layers))
+        # forget_bias is added to the forget-gate slice of the i2h and h2h bias initializers.
+        self._iB = self.params.get('i2h_bias',
+                                   init=init.OpenLSTMRNNiBInit(prefix=prefix,
+                                                               num_hidden=num_hidden,
+                                                               num_layers=num_layers,
+                                                               forget_bias=forget_bias))
+        self._hB = self.params.get('h2h_bias',
+                                   init=init.OpenLSTMRNNhBInit(prefix=prefix,
+                                                               num_hidden=num_hidden,
+                                                               num_layers=num_layers,
+                                                               forget_bias=forget_bias))
+
+    @property
+    def state_info(self):
+        return [{'shape': (self._num_layers, 0, self._num_hidden), '__layout__': 'LNC'},
+                {'shape': (self._num_layers, 0, self._num_hidden), '__layout__': 'LNC'}]
+
+    @property
+    def _gate_names(self):
+        return ['_i', '_f', '_c', '_o']
+
+    @property
+    def _num_gates(self):
+        return len(self._gate_names)
+
+    def _slice_weights(self, iW, iB, hW, hB, num_layers, num_input, num_hidden):
+        args = {}
+
+        gate_names = self._gate_names
+
+        iw_idx, ib_idx, hw_idx, hb_idx = 0, 0, 0, 0
+
+        for layer in range(num_layers):
+            for direction in ['l']:
+                for gate in gate_names:
+                    iw_name = '%s%s%d_i2h%s_weight' % (self._prefix, direction, layer, gate)
+                    ib_name = '%s%s%d_i2h%s_bias'   % (self._prefix, direction, layer, gate)
+                    hw_name = '%s%s%d_h2h%s_weight' % (self._prefix, direction, layer, gate)
+                    hb_name = '%s%s%d_h2h%s_bias'   % (self._prefix, direction, layer, gate)
+                    if layer > 0:
+                        args[iw_name] = iW[iw_idx:iw_idx+num_hidden*num_hidden]\
+                            .reshape((num_hidden, num_hidden))
+                        args[ib_name] = iB[ib_idx:ib_idx+num_hidden]\
+                            .reshape((num_hidden,))
+                        args[hw_name] = hW[hw_idx:hw_idx+num_hidden*num_hidden]\
+                            .reshape((num_hidden, num_hidden))
+                        args[hb_name] = hB[hb_idx:hb_idx+num_hidden]\
+                            .reshape((num_hidden,))
+                        iw_idx += num_hidden * num_hidden
+                        ib_idx += num_hidden
+                        hw_idx += num_hidden * num_hidden
+                        hb_idx += num_hidden
+                    else:
+                        args[iw_name] = iW[iw_idx:iw_idx+num_hidden*num_input]\
+                            .reshape((num_hidden, num_input))
+                        args[ib_name] = iB[ib_idx:ib_idx+num_hidden]\
+                            .reshape((num_hidden,))
+                        args[hw_name] = hW[hw_idx:hw_idx+num_hidden*num_hidden]\
+                            .reshape((num_hidden, num_hidden))
+                        args[hb_name] = hB[hb_idx:hb_idx+num_hidden]\
+                            .reshape((num_hidden,))
+                        iw_idx += num_hidden * num_input
+                        ib_idx += num_hidden
+                        hw_idx += num_hidden * num_hidden
+                        hb_idx += num_hidden
+
+        assert iw_idx == iW.size and ib_idx == iB.size and \
+            hw_idx == hW.size and hb_idx == hB.size, \
+            "Sizes of accumulated weights/biases (%d,%d,%d,%d) " \
+            "do not match the expected sizes (%d,%d,%d,%d)." % \
+            (iw_idx, ib_idx, hw_idx, hb_idx, iW.size, iB.size, hW.size, hB.size)
+
+        return args
+
+    def unpack_weights(self, args):
+        args = args.copy()
+
+        iW = args.pop(self._iW.name)
+        iB = args.pop(self._iB.name)
+        hW = args.pop(self._hW.name)
+        hB = args.pop(self._hB.name)
+
+        # compute the input (embedding) dimension
+        num_hidden = self._num_hidden
+        num_layers = self._num_layers
+        num_input = iW.size // self._num_gates // num_hidden - \
+                    num_hidden * (num_layers - 1)
+
+        sliced_args = self._slice_weights(iW=iW, iB=iB, hW=hW, hB=hB,
+                                          num_input=num_input,
+                                          num_layers=num_layers,
+                                          num_hidden=num_hidden)
+        args.update({name: nd.copy() for name, nd in sliced_args.items()})
+
+        return args
+
+    def pack_weights(self, args):
+        args = args.copy()
+
+        w0 = args['%sl0_i2h%s_weight' % (self._prefix, self._gate_names[0])]
+
+        num_gates = self._num_gates
+        num_hidden = self._num_hidden
+        num_layers = self._num_layers
+        num_input = w0.shape[1]
+
+        iW_size = num_gates * num_hidden * num_input + \
+                  (num_layers - 1) * num_gates * num_hidden * num_hidden
+        iB_size = num_layers * num_gates * num_hidden
+        hW_size = num_layers * num_gates * num_hidden * num_hidden
+        hB_size = num_layers * num_gates * num_hidden
+
+        iW = ndarray.zeros((iW_size,), ctx=w0.context, dtype=w0.dtype)
+        iB = ndarray.zeros((iB_size,), ctx=w0.context, dtype=w0.dtype)
+        hW = ndarray.zeros((hW_size,), ctx=w0.context, dtype=w0.dtype)
+        hB = ndarray.zeros((hB_size,), ctx=w0.context, dtype=w0.dtype)
+
+        for name, nd in self._slice_weights(iW=iW, iB=iB, hW=hW, hB=hB,
+                                            num_input=num_input,
+                                            num_layers=num_layers,
+                                            num_hidden=num_hidden).items():
+            nd[:] = args.pop(name)
+
+        args[self._iW.name] = iW
+        args[self._iB.name] = iB
+        args[self._hW.name] = hW
+        args[self._hB.name] = hB
+
+        return args
+
+    def __call__(self, inputs, states):
+        raise NotImplementedError("OpenLSTMRNNCell cannot be stepped. Please use unroll instead.")
+
+    def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=None):
+        assert layout == 'NTC', "OpenLSTMRNNCell only supports the NTC layout."
+
+        self.reset()
+
+        if begin_state is None:
+            begin_state = self.begin_state()
+
+        rnn = symbol.OpenLSTMRNN(data=inputs, num_layers=self._num_layers,
+                                 num_hidden_units=self._num_hidden,
+                                 init_hidden=begin_state[1],
+                                 init_cell=begin_state[0],
+                                 i2h_weight=self._iW, i2h_bias=self._iB,
+                                 h2h_weight=self._hW, h2h_bias=self._hB,
+                                 i_dp_prob=self._dropout, state_outputs=self._get_next_state,
+                                 name=self._prefix+'rnn')
+
+        if not self._get_next_state:
+            return rnn, []
+        else:
+            return rnn[0], [rnn[1], rnn[2]]
+
+
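For context, a minimal usage sketch of the new cell is given below. It assumes a GPU build in which the OpenLSTMRNN operator is registered and the cell is exported as `mx.rnn.OpenLSTMRNNCell`; the shapes and hyperparameters are illustrative only.

import mxnet as mx

seq_len, batch_size, embed_dim, num_hidden = 35, 32, 200, 200

# the data symbol is expected in NTC layout: (batch_size, seq_len, embed_dim)
data = mx.sym.Variable('data')
cell = mx.rnn.OpenLSTMRNNCell(num_hidden=num_hidden, num_layers=2,
                              dropout=0.5, get_next_state=False)
# unroll() only supports the NTC layout; it returns the concatenated hidden
# states of the last layer (plus the final states when get_next_state=True)
output, _ = cell.unroll(length=seq_len, inputs=data, layout='NTC')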
 class SequentialRNNCell(BaseRNNCell):
     """Sequantially stacking multiple RNN cells.
 
diff --git a/src/operator/contrib/cu_open_lstm_rnn-inl.cuh b/src/operator/contrib/cu_open_lstm_rnn-inl.cuh
new file mode 100644
index 00000000000..dc8a85b33cd
--- /dev/null
+++ b/src/operator/contrib/cu_open_lstm_rnn-inl.cuh
@@ -0,0 +1,907 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file cu_open_lstm_rnn-inl.cuh
+ * \brief LSTM RNN Open-Source CUDA Implementation
+ * \author Bojian (Jack) Zheng, Gennady Pekhimenko, Jeremy Appleyard
+ */
+#ifndef MXNET_OPERATOR_CONTRIB_CU_OPEN_LSTM_RNN_INL_CUH_
+#define MXNET_OPERATOR_CONTRIB_CU_OPEN_LSTM_RNN_INL_CUH_
+
+#include <mxnet/storage.h>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include <cstdint>
+#include "./open_lstm_rnn-inl.h"
+#include "./open_lstm_rnn_include/dropout.cuh"
+#include "./open_lstm_rnn_include/lstm_cell.cuh"
+#include "./open_lstm_rnn_include/cublas_matmul.h"
+#include "./open_lstm_rnn_include/cublas_transpose.h"
+
+#define RE_CAST(ptr) reinterpret_cast<float *>(ptr)
+
+namespace mxnet {
+namespace op {
+
+class CUOpenLSTMRNNOp : public Operator {
+ public:
+  explicit CUOpenLSTMRNNOp(OpenLSTMRNNParam param) {
+    this->param_ = param; initialized_ = false;
+  }
+
+  ~CUOpenLSTMRNNOp() {
+    //=========================================================================
+    // Free the allocated workspace memory.
+    //=========================================================================
+    Storage::Get()->Free(m_data_T_major);
+    Storage::Get()->Free(m_data_T_major_grad);
+    Storage::Get()->Free(m_cell);
+    Storage::Get()->Free(m_hidden);
+    Storage::Get()->Free(m_i2h_workspace);
+    Storage::Get()->Free(m_h2h_workspace);
+    Storage::Get()->Free(m_i2h_grad_workspace);
+    Storage::Get()->Free(m_h2h_grad_workspace);
+    Storage::Get()->Free(m_linear_gates);
+    //=========================================================================
+    // Destroy the cuBLAS handle.
+    //=========================================================================
+    CUBLAS_CALL(cublasDestroy(m_cublas_handle));
+    //=========================================================================
+    // Free the workers (cudaStream and cudaEvent).
+    //=========================================================================
+    for (unsigned layer_idx = 0; layer_idx < param_.num_layers; ++layer_idx) {
+      CUDA_CALL(cudaStreamDestroy(m_stream_i2h[layer_idx]));
+      m_stream_i2h[layer_idx] = NULL;
+      CUDA_CALL(cudaStreamDestroy(m_stream_h2h[layer_idx]));
+      m_stream_h2h[layer_idx] = NULL;
+    }
+    delete [] m_stream_i2h; m_stream_i2h = NULL;
+    delete [] m_stream_h2h; m_stream_h2h = NULL;
+    for (unsigned layer_idx = 0; layer_idx < param_.num_layers; ++layer_idx) {
+      for (unsigned seq_idx = 0; seq_idx < param_.seq_len; ++seq_idx) {
+        CUDA_CALL(cudaEventDestroy(m_event_i2h[layer_idx][seq_idx]));
+        m_event_i2h[layer_idx][seq_idx] = NULL;
+        CUDA_CALL(cudaEventDestroy(m_event_h2h[layer_idx][seq_idx]));
+        m_event_h2h[layer_idx][seq_idx] = NULL;
+      }
+      delete [] m_event_i2h[layer_idx]; m_event_i2h[layer_idx] = NULL;
+      delete [] m_event_h2h[layer_idx]; m_event_h2h[layer_idx] = NULL;
+    }
+    delete [] m_event_i2h; m_event_i2h = NULL;
+    delete [] m_event_h2h; m_event_h2h = NULL;
+    //=========================================================================
+    // Destroy the cuRAND handle and associated workspace.
+    //=========================================================================
+    if (param_.i_dp_prob != 0 && param_.num_layers > 1) {
+      CURAND_CALL(curandDestroyGenerator(m_rng));
+      Storage::Get()->Free(m_i_dp_uniform_rv);
+      Storage::Get()->Free(m_i_dp_workspace);
+    }
+  }
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+
+    //=========================================================================
+    // IO Data
+    //=========================================================================
+    std::size_t in_expected = 7, out_expected = param_.state_outputs ? 3 : 1;
+    CHECK_EQ(in_data.size(), in_expected);
+    CHECK_EQ(out_data.size(), out_expected);
+    Stream<gpu> *s = ctx.get_stream<gpu>();
+    Tensor<gpu, 3, float> data        = in_data[open_lstm_rnn_enum::kData]
+                                        .get<gpu, 3, float>(s);
+    Tensor<gpu, 3, float> init_hidden = in_data[open_lstm_rnn_enum::kInitHidden]
+                                        .get<gpu, 3, float>(s);
+    Tensor<gpu, 3, float> init_cell   = in_data[open_lstm_rnn_enum::kInitCell]
+                                        .get<gpu, 3, float>(s);
+    Tensor<gpu, 1, float> i2h_weight  = in_data[open_lstm_rnn_enum::ki2hWeight]
+                                        .get<gpu, 1, float>(s);
+    Tensor<gpu, 1, float> i2h_bias    = in_data[open_lstm_rnn_enum::ki2hBias]
+                                        .get<gpu, 1, float>(s);
+    Tensor<gpu, 1, float> h2h_weight  = in_data[open_lstm_rnn_enum::kh2hWeight]
+                                        .get<gpu, 1, float>(s);
+    Tensor<gpu, 1, float> h2h_bias    = in_data[open_lstm_rnn_enum::kh2hBias]
+                                        .get<gpu, 1, float>(s);
+    Tensor<gpu, 3, float> concat_hidden_states = out_data[open_lstm_rnn_enum::
+                                                          kConcatHiddenStates]
+                                        .get<gpu, 3, float>(s);
+    CHECK_EQ(data       .CheckContiguous(), true);
+    CHECK_EQ(init_hidden.CheckContiguous(), true);
+    CHECK_EQ(init_cell  .CheckContiguous(), true);
+    CHECK_EQ(i2h_weight .CheckContiguous(), true);
+    CHECK_EQ(i2h_bias   .CheckContiguous(), true);
+    CHECK_EQ(h2h_weight .CheckContiguous(), true);
+    CHECK_EQ(h2h_bias   .CheckContiguous(), true);
+    CHECK_EQ(concat_hidden_states.CheckContiguous(), true);
+    float *ptr_final_hidden = NULL, *ptr_final_cell = NULL;
+    if (param_.state_outputs) {
+      Tensor<gpu, 3, float> final_hidden =
+        out_data[open_lstm_rnn_enum::kFinalHidden].get<gpu, 3, float>(s);
+      Tensor<gpu, 3, float> final_cell =
+        out_data[open_lstm_rnn_enum::kFinalCell]  .get<gpu, 3, float>(s);
+      CHECK_EQ(final_hidden.CheckContiguous(), true);
+      CHECK_EQ(final_cell  .CheckContiguous(), true);
+      ptr_final_hidden = final_hidden.dptr_;
+      ptr_final_cell   = final_cell  .dptr_;
+    }
+    //=========================================================================
+    // Initialization
+    //=========================================================================
+    if (!initialized_)
+      Init(s, in_data, out_data);
+    //=========================================================================
+    // Forward Pass
+    //=========================================================================
+    // generate dropout random variables if i_dp_prob is nonzero
+    // and there is more than one layer (dropout is only applied between layers)
+    if (param_.i_dp_prob != 0 && param_.num_layers > 1 && ctx.is_train) {
+      CURAND_CALL(curandSetStream(m_rng, m_stream_i2h[1]));
+      CURAND_CALL(curandGenerateUniform(m_rng,
+                                        RE_CAST(m_i_dp_uniform_rv.dptr),
+                                        (param_.num_layers - 1) *
+                                          param_.seq_len *
+                                          m_num_hidden_units_x_batch_size));
+    }
+    CUBLAS_CALL(cublasSetStream(m_cublas_handle, m_stream_i2h[0]));
+    transpose(m_cublas_handle,
+              RE_CAST(m_data_T_major.dptr), data.dptr_,
+              param_.batch_size,
+              param_.seq_len * param_.embed_dim);
+    // use ScheduleList to implement wavefront parallelism
+    for (ScheduleList::iterator iter  = m_forward_schedule.begin();
+                                iter != m_forward_schedule.end(); ++iter) {
+      // obtain the precomputed schedule
+      unsigned layer_begin = iter->m_layer_begin, layer_end = iter->m_layer_end,
+        seq_begin = iter->m_seq_begin, seq_end = iter->m_seq_end;
+
+      for (unsigned layer_idx = layer_begin; layer_idx < layer_end; ++layer_idx) {
+        //=====================================================================
+        // Input -> Hidden
+        //=====================================================================
+        // Note: If the pointer arithmetic below is hard to follow, please
+        // refer to the equivalent implementation given in the block comment
+        // section.
+        CUBLAS_CALL(cublasSetStream(m_cublas_handle, m_stream_i2h[layer_idx]));
+        //=====================================================================
+        // wait here until m_hidden is ready
+        // i2h of next layer needs to wait for h2h of previous layer
+        for (unsigned seq_idx = seq_begin; seq_idx < seq_end; ++seq_idx)
+          if (layer_idx != 0)
+            CUDA_CALL(cudaStreamWaitEvent(m_stream_i2h[layer_idx],
+                                          m_event_h2h[layer_idx - 1][seq_idx], 0));
+        //=====================================================================
+        if (layer_idx == 0) {
+          matmul_stridedbatched(m_cublas_handle,
+                                RE_CAST(m_i2h_workspace.dptr) +
+                                  seq_begin * 4 * m_num_hidden_units_x_batch_size,
+                                i2h_weight.dptr_,
+                                RE_CAST(m_data_T_major.dptr) +
+                                  seq_begin * param_.embed_dim * param_.batch_size,
+                                4 * param_.num_hidden_units,
+                                param_.batch_size,
+                                param_.embed_dim,
+                                4 * m_num_hidden_units_x_batch_size,
+                                0,
+                                param_.embed_dim * param_.batch_size,
+                                seq_end - seq_begin);
+        } else {
+          if (param_.i_dp_prob != 0 && ctx.is_train) {
+            __cuda_dropout_forward
+              <<< (m_num_hidden_units_x_batch_size * (seq_end - seq_begin) - 1) / 128 + 1,
+                  128, 0, m_stream_i2h[layer_idx]
+              >>> (RE_CAST(m_i_dp_workspace.dptr) +
+                     (layer_idx - 1) * param_.seq_len * m_num_hidden_units_x_batch_size +
+                                            seq_begin * m_num_hidden_units_x_batch_size,
+                   RE_CAST(m_hidden.dptr) +
+                     (layer_idx - 1) * (param_.seq_len + 1) * m_num_hidden_units_x_batch_size +
+                                            (seq_begin + 1) * m_num_hidden_units_x_batch_size,
+                   RE_CAST(m_i_dp_uniform_rv.dptr) +
+                     (layer_idx - 1) * param_.seq_len * m_num_hidden_units_x_batch_size +
+                                            seq_begin * m_num_hidden_units_x_batch_size,
+                   param_.i_dp_prob, m_num_hidden_units_x_batch_size * (seq_end - seq_begin));
+          }  // i_dp_prob
+          matmul_stridedbatched(m_cublas_handle,
+                                RE_CAST(m_i2h_workspace.dptr) +
+                                  seq_begin * 4 * m_num_hidden_units_x_batch_size,
+                                i2h_weight.dptr_ +
+                                  4 * param_.num_hidden_units * param_.embed_dim +
+                                  (layer_idx - 1) * 4 * param_.num_hidden_units *
+                                    param_.num_hidden_units,
+                                param_.i_dp_prob != 0 && ctx.is_train ?
+                                  RE_CAST(m_i_dp_workspace.dptr) +
+                                    (layer_idx - 1) * param_.seq_len *
+                                      m_num_hidden_units_x_batch_size +
+                                    seq_begin * m_num_hidden_units_x_batch_size :
+                                  RE_CAST(m_hidden.dptr) +
+                                    (layer_idx - 1) * (param_.seq_len + 1) *
+                                      m_num_hidden_units_x_batch_size +
+                                    (seq_begin + 1) * m_num_hidden_units_x_batch_size,
+                                  4 * param_.num_hidden_units,
+                                  param_.batch_size,
+                                  param_.num_hidden_units,
+                                  4 * m_num_hidden_units_x_batch_size,
+                                  0,
+                                  m_num_hidden_units_x_batch_size,
+                                  seq_end - seq_begin);
+        }  // layer_idx == 0
+        //=====================================================================
+        // record that we are computing m_i2h_workspace
+        for (unsigned seq_idx = seq_begin; seq_idx != seq_end; ++seq_idx)
+          CUDA_CALL(cudaEventRecord(m_event_i2h[layer_idx][seq_idx],
+                                    m_stream_i2h[layer_idx]));
+        //=====================================================================
+        // Hidden -> Hidden
+        //=====================================================================
+        CUBLAS_CALL(cublasSetStream(m_cublas_handle, m_stream_h2h[layer_idx]));
+        for (unsigned seq_idx = seq_begin; seq_idx < seq_end; ++seq_idx) {
+          if (seq_idx == 0) {
+            transpose(m_cublas_handle,
+                      RE_CAST(m_hidden.dptr) +
+                        layer_idx * (param_.seq_len + 1) *
+                          m_num_hidden_units_x_batch_size,
+                      init_hidden.dptr_ +
+                        layer_idx * m_num_hidden_units_x_batch_size,
+                      param_.batch_size, param_.num_hidden_units);
+          }  // seq_idx
+          matmul(m_cublas_handle,
+                 RE_CAST(m_h2h_workspace.dptr) +
+                   layer_idx * 4 * m_num_hidden_units_x_batch_size,
+                 h2h_weight.dptr_ +
+                   layer_idx * 4 * param_.num_hidden_units * param_.num_hidden_units,
+                 RE_CAST(m_hidden.dptr) +
+                   layer_idx * (param_.seq_len + 1) * m_num_hidden_units_x_batch_size +
+                                       seq_idx      * m_num_hidden_units_x_batch_size,
+                 4 * param_.num_hidden_units,
+                 param_.batch_size,
+                 param_.num_hidden_units);
+          if (seq_idx == 0) {
+            transpose(m_cublas_handle,
+                      RE_CAST(m_cell.dptr) +
+                        layer_idx * (param_.seq_len + 1) *
+                          m_num_hidden_units_x_batch_size,
+                      init_cell.dptr_ +
+                        layer_idx * m_num_hidden_units_x_batch_size,
+                      param_.batch_size,
+                      param_.num_hidden_units);
+          }
+          //=====================================================================
+          // wait here until the data in m_i2h_workspace is ready
+          // h2h needs to wait for i2h
+          CUDA_CALL(cudaStreamWaitEvent(m_stream_h2h[layer_idx],
+                                        m_event_i2h[layer_idx][seq_idx], 0));
+          __cuda_fused_lstm_forward
+            <<< (m_num_hidden_units_x_batch_size - 1) / 128 + 1,
+                128, 0, m_stream_h2h[layer_idx]
+            >>> (RE_CAST(m_i2h_workspace.dptr) +
+                   seq_idx * 4 * m_num_hidden_units_x_batch_size,
+                 RE_CAST(m_h2h_workspace.dptr) +
+                   layer_idx * 4 * m_num_hidden_units_x_batch_size,
+                 i2h_bias.dptr_ + layer_idx * 4 * param_.num_hidden_units,
+                 h2h_bias.dptr_ + layer_idx * 4 * param_.num_hidden_units,
+                 RE_CAST(m_cell.dptr) +
+                    layer_idx * (param_.seq_len + 1) * m_num_hidden_units_x_batch_size +
+                                        seq_idx      * m_num_hidden_units_x_batch_size,
+                 ctx.is_train ?
+                   RE_CAST(m_linear_gates.dptr) +
+                     layer_idx * param_.seq_len * 4 * m_num_hidden_units_x_batch_size +
+                                        seq_idx * 4 * m_num_hidden_units_x_batch_size
+                   : NULL,
+                 RE_CAST(m_cell.dptr) +
+                   layer_idx * (param_.seq_len + 1) * m_num_hidden_units_x_batch_size +
+                                      (seq_idx + 1) * m_num_hidden_units_x_batch_size,
+                 RE_CAST(m_hidden.dptr) +
+                   layer_idx * (param_.seq_len + 1) * m_num_hidden_units_x_batch_size +
+                                      (seq_idx + 1) * m_num_hidden_units_x_batch_size,
+                 param_.num_hidden_units,
+                 param_.batch_size);
+          // record that we are computing m_hidden
+          if (layer_idx != param_.num_layers - 1)
+            CUDA_CALL(cudaEventRecord(m_event_h2h[layer_idx][seq_idx],
+                                      m_stream_h2h[layer_idx]));
+          // output the final hidden and cell states at the end of the sequence
+          if (param_.state_outputs && seq_idx == (param_.seq_len - 1)) {
+            transpose(m_cublas_handle,
+                      ptr_final_hidden +
+                        layer_idx * m_num_hidden_units_x_batch_size,
+                      RE_CAST(m_hidden.dptr) +
+                        layer_idx * (param_.seq_len + 1) * m_num_hidden_units_x_batch_size +
+                                           (seq_idx + 1) * m_num_hidden_units_x_batch_size,
+                      param_.num_hidden_units,
+                      param_.batch_size);
+            transpose(m_cublas_handle,
+                      ptr_final_cell +
+                        layer_idx * m_num_hidden_units_x_batch_size,
+                      RE_CAST(m_cell.dptr) +
+                        layer_idx * (param_.seq_len + 1) * m_num_hidden_units_x_batch_size +
+                                           (seq_idx + 1) * m_num_hidden_units_x_batch_size,
+                      param_.num_hidden_units,
+                      param_.batch_size);
+          }  // state_outputs
+        }  // seq_idx
+      }  // layer_idx
+    }  // schedule
+    transpose(m_cublas_handle,
+              concat_hidden_states.dptr_,
+              RE_CAST(m_hidden.dptr) +
+                (param_.num_layers - 1) * (param_.seq_len + 1) * m_num_hidden_units_x_batch_size +
+                                                             1 * m_num_hidden_units_x_batch_size,
+              param_.seq_len * param_.num_hidden_units,
+              param_.batch_size);
+    if (!param_.state_outputs) {
+      CUDA_CALL(cudaStreamSynchronize(m_stream_h2h[param_.num_layers - 1]));
+    } else {
+      for (unsigned layer_idx = 0; layer_idx < param_.num_layers; ++layer_idx)
+        CUDA_CALL(cudaStreamSynchronize(m_stream_h2h[layer_idx]));
+    }
+  }
+
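The forward pass above walks a precomputed ScheduleList and uses per-layer streams and events to realize wavefront parallelism over the (layer, time step) grid: cell (layer, t) only waits on (layer, t - 1) through the recurrent path and on (layer - 1, t) through the input path. A rough Python sketch of that wavefront order follows; it is illustrative only and is not the actual ScheduleList implementation.

def wavefront_order(num_layers, seq_len):
    # cells on the same anti-diagonal (layer + t == w) have no mutual
    # dependencies and may therefore be processed concurrently
    for w in range(num_layers + seq_len - 1):
        yield [(layer, w - layer)
               for layer in range(num_layers)
               if 0 <= w - layer < seq_len]

# num_layers=2, seq_len=3 yields the waves
# [(0, 0)], [(0, 1), (1, 0)], [(0, 2), (1, 1)], [(1, 2)]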
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+
+    std::size_t in_expected = 7, out_expected = param_.state_outputs ? 3 : 1;
+    //=========================================================================
+    // IO Data
+    //=========================================================================
+    CHECK_EQ(in_data.size(), in_expected);
+    CHECK_EQ(in_grad.size(), in_expected);
+    CHECK_EQ(req.size(), in_expected);
+    CHECK_EQ(out_data.size(), out_expected);
+    CHECK_EQ(out_grad.size(), out_expected);
+    Stream<gpu> *s = ctx.get_stream<gpu>();
+    Tensor<gpu, 3, float> data        = in_data[open_lstm_rnn_enum::kData]
+                                        .get<gpu, 3, float>(s);
+    Tensor<gpu, 3, float> init_hidden = in_data[open_lstm_rnn_enum::kInitHidden]
+                                        .get<gpu, 3, float>(s);
+    Tensor<gpu, 3, float> init_cell   = in_data[open_lstm_rnn_enum::kInitCell]
+                                        .get<gpu, 3, float>(s);
+    Tensor<gpu, 1, float> i2h_weight  = in_data[open_lstm_rnn_enum::ki2hWeight]
+                                        .get<gpu, 1, float>(s);
+    Tensor<gpu, 1, float> i2h_bias    = in_data[open_lstm_rnn_enum::ki2hBias]
+                                        .get<gpu, 1, float>(s);
+    Tensor<gpu, 1, float> h2h_weight  = in_data[open_lstm_rnn_enum::kh2hWeight]
+                                        .get<gpu, 1, float>(s);
+    Tensor<gpu, 1, float> h2h_bias    = in_data[open_lstm_rnn_enum::kh2hBias]
+                                        .get<gpu, 1, float>(s);
+    Tensor<gpu, 3, float> data_grad       = in_grad[open_lstm_rnn_enum::kData]
+                                            .get<gpu, 3, float>(s);
+    Tensor<gpu, 1, float> i2h_weight_grad = in_grad[open_lstm_rnn_enum::ki2hWeight]
+                                            .get<gpu, 1, float>(s);
+    Tensor<gpu, 1, float> i2h_bias_grad   = in_grad[open_lstm_rnn_enum::ki2hBias]
+                                            .get<gpu, 1, float>(s);
+    Tensor<gpu, 1, float> h2h_weight_grad = in_grad[open_lstm_rnn_enum::kh2hWeight]
+                                            .get<gpu, 1, float>(s);
+    Tensor<gpu, 1, float> h2h_bias_grad   = in_grad[open_lstm_rnn_enum::kh2hBias]
+                                            .get<gpu, 1, float>(s);
+    Tensor<gpu, 3, float> concat_hidden_states_grad = out_grad[open_lstm_rnn_enum::
+                                                               kConcatHiddenStates]
+                                                      .get<gpu, 3, float>(s);
+    CHECK_EQ(data       .CheckContiguous(), true);
+    CHECK_EQ(init_hidden.CheckContiguous(), true);
+    CHECK_EQ(init_cell  .CheckContiguous(), true);
+    CHECK_EQ(i2h_weight .CheckContiguous(), true);
+    CHECK_EQ(i2h_bias   .CheckContiguous(), true);
+    CHECK_EQ(h2h_weight .CheckContiguous(), true);
+    CHECK_EQ(h2h_bias   .CheckContiguous(), true);
+    CHECK_EQ(data_grad      .CheckContiguous(), true);
+    CHECK_EQ(i2h_weight_grad.CheckContiguous(), true);
+    CHECK_EQ(i2h_bias_grad  .CheckContiguous(), true);
+    CHECK_EQ(h2h_weight_grad.CheckContiguous(), true);
+    CHECK_EQ(h2h_bias_grad  .CheckContiguous(), true);
+    CHECK_EQ(concat_hidden_states_grad.CheckContiguous(), true);
+    float *ptr_final_hidden_grad = NULL, *ptr_final_cell_grad = NULL;
+    if (param_.state_outputs) {
+      Tensor<gpu, 3, float> final_hidden_grad = out_grad[open_lstm_rnn_enum::kFinalHidden]
+                                                .get<gpu, 3, float>(s);
+      Tensor<gpu, 3, float> final_cell_grad   = out_grad[open_lstm_rnn_enum::kFinalCell]
+                                                .get<gpu, 3, float>(s);
+      CHECK_EQ(final_hidden_grad.CheckContiguous(), true);
+      CHECK_EQ(final_cell_grad  .CheckContiguous(), true);
+      ptr_final_hidden_grad = final_hidden_grad.dptr_;
+      ptr_final_cell_grad   = final_cell_grad  .dptr_;
+    }
+    //=========================================================================
+    // Preparation
+    //=========================================================================
+    unsigned block_size;
+    switch (m_algo) {
+      case enumBackwardReduceAlgo:: _32_HIERARCHICAL: block_size =  32; break;
+      case enumBackwardReduceAlgo:: _64_HIERARCHICAL: block_size =  64; break;
+      case enumBackwardReduceAlgo::_128_HIERARCHICAL: block_size = 128; break;
+      default: block_size = param_.batch_size <= 1024 ? param_.batch_size : 128;
+    }
+    CUDA_CALL(cudaMemsetAsync(i2h_weight_grad.dptr_,
+                              0,
+                              i2h_weight_grad.shape_[0] * sizeof(float),
+                              m_stream_i2h[param_.num_layers - 1]));
+    CUDA_CALL(cudaMemsetAsync(h2h_weight_grad.dptr_,
+                              0,
+                              h2h_weight_grad.shape_[0] * sizeof(float),
+                              m_stream_h2h[param_.num_layers - 1]));
+    CUDA_CALL(cudaMemsetAsync(h2h_bias_grad  .dptr_,
+                              0,
+                              h2h_bias_grad  .shape_[0] * sizeof(float),
+                              m_stream_h2h[param_.num_layers - 1]));
+    // There is no need to clear the i2h_bias_grad, as we always directly
+    // copy h2h_bias_grad to i2h_bias_grad.
+    CUDA_CALL(cudaMemsetAsync(m_cell_grad.dptr,
+                              0,
+                              m_cell_grad.size,
+                              m_stream_h2h[param_.num_layers - 1]));
+    //=========================================================================
+    // Backward Pass
+    //=========================================================================
+    CUBLAS_CALL(cublasSetStream(m_cublas_handle, m_stream_h2h[param_.num_layers - 1]));
+    transpose(m_cublas_handle,
+              RE_CAST(m_i2h_grad_workspace.dptr),
+              concat_hidden_states_grad.dptr_,
+              param_.batch_size,
+              param_.seq_len * param_.num_hidden_units);
+    for (ScheduleList::iterator iter  = m_backward_schedule.begin();
+                                iter != m_backward_schedule.end(); ++iter) {
+      // obtain the precomputed schedule
+      int layer_rbegin = param_.num_layers - 1 - iter->m_layer_begin,
+          layer_rend   = param_.num_layers - 1 - iter->m_layer_end,
+          seq_rbegin = param_.seq_len - 1 - iter->m_seq_begin,
+          seq_rend   = param_.seq_len - 1 - iter->m_seq_end;
+      for (int layer_idx = layer_rbegin; layer_idx > layer_rend; --layer_idx) {
+        //=====================================================================
+        // Hidden -> Hidden
+        //=====================================================================
+        CUBLAS_CALL(cublasSetStream(m_cublas_handle, m_stream_h2h[layer_idx]));
+        for (int seq_idx = seq_rbegin; seq_idx > seq_rend; --seq_idx) {
+          if (layer_idx != static_cast < int > (param_.num_layers - 1))
+            // wait here until the data in m_i2h_grad_workspace is ready
+            // h2h of previous layer needs to wait for i2h of next layer
+            CUDA_CALL(cudaStreamWaitEvent(m_stream_h2h[layer_idx],
+                                          m_event_i2h[layer_idx + 1][seq_idx], 0));
+          if (seq_idx == static_cast < int > (param_.seq_len - 1)) {
+            if (param_.state_outputs) {
+              // If the final cell and hidden states carry gradients
+              // (e.g., when training with a stateful module), those
+              // gradients must also be propagated back through the network.
+              transpose(m_cublas_handle,
+                        RE_CAST(m_h2h_grad_workspace.dptr) +
+                          layer_idx * m_num_hidden_units_x_batch_size,
+                        ptr_final_hidden_grad +
+                          layer_idx * m_num_hidden_units_x_batch_size,
+                        param_.batch_size,
+                        param_.num_hidden_units);
+              transpose(m_cublas_handle,
+                        RE_CAST(m_cell_grad.dptr) +
+                          layer_idx * m_num_hidden_units_x_batch_size,
+                        ptr_final_cell_grad +
+                          layer_idx * m_num_hidden_units_x_batch_size,
+                        param_.batch_size,
+                        param_.num_hidden_units);
+            }
+            __cuda_fused_lstm_backward
+              <<< dim3(param_.num_hidden_units,
+                       (param_.batch_size - 1) / block_size + 1),
+                  dim3(block_size),
+                  (m_algo == enumBackwardReduceAlgo::PURE_ATOMICS ||
+                   m_algo == enumBackwardReduceAlgo::_32_HIERARCHICAL) ?
+                    0 : 4 * block_size * sizeof(float),
+                  m_stream_h2h[layer_idx]
+              >>> (RE_CAST(m_i2h_workspace.dptr) +
+                       seq_idx * 4 * m_num_hidden_units_x_batch_size,
+                   RE_CAST(m_h2h_workspace.dptr) +
+                     layer_idx * 4 * m_num_hidden_units_x_batch_size,
+                   h2h_bias_grad.dptr_ +
+                     layer_idx * 4 * param_.num_hidden_units,
+                   RE_CAST(m_cell_grad.dptr) +
+                     layer_idx * m_num_hidden_units_x_batch_size,
+                   RE_CAST(m_cell.dptr) +
+                     layer_idx * (param_.seq_len + 1) * m_num_hidden_units_x_batch_size +
+                                         seq_idx      * m_num_hidden_units_x_batch_size,
+                   RE_CAST(m_linear_gates.dptr) +
+                     layer_idx *  param_.seq_len * 4  * m_num_hidden_units_x_batch_size +
+                                         seq_idx * 4  * m_num_hidden_units_x_batch_size,
+                   RE_CAST(m_cell.dptr) +
+                     layer_idx * (param_.seq_len + 1) * m_num_hidden_units_x_batch_size +
+                                        (seq_idx + 1) * m_num_hidden_units_x_batch_size,
+                   RE_CAST(m_i2h_grad_workspace.dptr) +
+                       seq_idx * m_num_hidden_units_x_batch_size,
+                   param_.state_outputs ?
+                     RE_CAST(m_h2h_grad_workspace.dptr) +
+                       layer_idx * m_num_hidden_units_x_batch_size :
+                       NULL,
+                   param_.batch_size,
+                   m_algo);
+          } else {
+            __cuda_fused_lstm_backward
+              <<< dim3(param_.num_hidden_units,
+                       (param_.batch_size - 1) / block_size + 1),
+                  dim3(block_size),
+                  (m_algo == enumBackwardReduceAlgo::PURE_ATOMICS ||
+                   m_algo == enumBackwardReduceAlgo::_32_HIERARCHICAL) ?
+                    0 : 4 * block_size * sizeof(float),
+                  m_stream_h2h[layer_idx]
+              >>> (RE_CAST(m_i2h_workspace.dptr) +
+                       seq_idx * 4 * m_num_hidden_units_x_batch_size,
+                   RE_CAST(m_h2h_workspace.dptr) +
+                     layer_idx * 4 * m_num_hidden_units_x_batch_size,
+                   h2h_bias_grad.dptr_ +
+                     layer_idx * 4 * param_.num_hidden_units,
+                   RE_CAST(m_cell_grad.dptr) +
+                     layer_idx * m_num_hidden_units_x_batch_size,
+                   RE_CAST(m_cell.dptr) +
+                     layer_idx * (param_.seq_len + 1) * m_num_hidden_units_x_batch_size +
+                                         seq_idx      * m_num_hidden_units_x_batch_size,
+                   RE_CAST(m_linear_gates.dptr) +
+                     layer_idx *  param_.seq_len * 4  * m_num_hidden_units_x_batch_size +
+                                         seq_idx * 4  * m_num_hidden_units_x_batch_size,
+                   RE_CAST(m_cell.dptr) +
+                     layer_idx * (param_.seq_len + 1) * m_num_hidden_units_x_batch_size +
+                                        (seq_idx + 1) * m_num_hidden_units_x_batch_size,
+                   RE_CAST(m_i2h_grad_workspace.dptr) +
+                       seq_idx * m_num_hidden_units_x_batch_size,
+                   RE_CAST(m_h2h_grad_workspace.dptr) +
+                     layer_idx * m_num_hidden_units_x_batch_size,
+                   param_.batch_size,
+                   m_algo);
+          }  // if (seq_idx == static_cast < int > (param_.seq_len - 1))
+          // record that we are computing m_i2h_workspace
+          CUDA_CALL(cudaEventRecord(m_event_h2h[layer_idx][seq_idx],
+                                    m_stream_h2h[layer_idx]));
+          matmul_ex(m_cublas_handle,
+                    h2h_weight_grad.dptr_ +
+                      layer_idx * 4 * param_.num_hidden_units * param_.num_hidden_units,
+                    RE_CAST(m_h2h_workspace.dptr) +
+                      layer_idx * 4 * m_num_hidden_units_x_batch_size,
+                    RE_CAST(m_hidden.dptr) +
+                      layer_idx * (param_.seq_len + 1) * m_num_hidden_units_x_batch_size +
+                                          seq_idx      * m_num_hidden_units_x_batch_size,
+                    CUBLAS_OP_N, CUBLAS_OP_T,
+                    4 * param_.num_hidden_units, param_.batch_size,
+                        param_.num_hidden_units, param_.batch_size,
+                    1.0, 1.0);
+          if (seq_idx > 0) {
+            matmul_ex(m_cublas_handle,
+              RE_CAST(m_h2h_grad_workspace.dptr) +
+                layer_idx * m_num_hidden_units_x_batch_size,
+              h2h_weight.dptr_ +
+                layer_idx * 4 * param_.num_hidden_units * param_.num_hidden_units,
+              RE_CAST(m_h2h_workspace.dptr) +
+                layer_idx * 4 * m_num_hidden_units_x_batch_size,
+              CUBLAS_OP_T, CUBLAS_OP_N,
+              4 * param_.num_hidden_units, param_.num_hidden_units,
+              4 * param_.num_hidden_units, param_.batch_size,
+              1.0, 0.0);
+          }  // if (seq_idx > 0)
+        }  // seq_idx
+        //=====================================================================
+        // Input -> Hidden
+        //=====================================================================
+        CUBLAS_CALL(cublasSetStream(m_cublas_handle, m_stream_i2h[layer_idx]));
+        // wait here until the data in m_i2h_workspace is ready
+        // i2h needs to wait for h2h
+        for (int seq_idx = seq_rbegin; seq_idx > seq_rend; --seq_idx)
+          CUDA_CALL(cudaStreamWaitEvent(m_stream_i2h[layer_idx],
+                                        m_event_h2h[layer_idx][seq_idx], 0));
+        if (layer_idx > 0) {
+          for (int seq_idx = seq_rbegin; seq_idx > seq_rend; --seq_idx)
+            matmul_ex(m_cublas_handle,
+                      i2h_weight_grad.dptr_ +
+                        4 * param_.num_hidden_units * param_.embed_dim +
+                        (layer_idx - 1) * 4 * param_.num_hidden_units * param_.num_hidden_units,
+                      RE_CAST(m_i2h_workspace.dptr) +
+                        seq_idx * 4 * m_num_hidden_units_x_batch_size,
+                      param_.i_dp_prob != 0 ?
+                        RE_CAST(m_i_dp_workspace.dptr) +
+                          (layer_idx - 1) * param_.seq_len * m_num_hidden_units_x_batch_size +
+                                                   seq_idx * m_num_hidden_units_x_batch_size :
+                        RE_CAST(m_hidden.dptr) +
+                          (layer_idx - 1) * (param_.seq_len + 1) *
+                            m_num_hidden_units_x_batch_size +
+                                                   (seq_idx + 1) *
+                            m_num_hidden_units_x_batch_size,
+                        CUBLAS_OP_N, CUBLAS_OP_T,
+                        4 * param_.num_hidden_units, param_.batch_size,
+                            param_.num_hidden_units, param_.batch_size,
+                        1.0, 1.0);
+          matmul_stridedbatched_ex(m_cublas_handle,
+                                   RE_CAST(m_i2h_grad_workspace.dptr) +
+                                     (seq_rend + 1) * m_num_hidden_units_x_batch_size,
+                                   i2h_weight.dptr_ +
+                                     4 * param_.num_hidden_units * param_.embed_dim +
+                                     (layer_idx - 1) * 4 * param_.num_hidden_units *
+                                       param_.num_hidden_units,
+                                   RE_CAST(m_i2h_workspace.dptr) +
+                                     (seq_rend + 1) * 4 * m_num_hidden_units_x_batch_size,
+                                   CUBLAS_OP_T, CUBLAS_OP_N,
+                                   4 * param_.num_hidden_units, param_.num_hidden_units,
+                                   4 * param_.num_hidden_units, param_.batch_size,
+                                   m_num_hidden_units_x_batch_size,
+                                   0,
+                                   4 * m_num_hidden_units_x_batch_size,
+                                   seq_rbegin - seq_rend,
+                                   1.0, 0.0);
+          if (param_.i_dp_prob != 0) {
+            __cuda_dropout_backward
+              <<< (m_num_hidden_units_x_batch_size * (seq_rbegin - seq_rend) - 1) / 128 + 1,
+                  128,
+                  0,
+                  m_stream_i2h[layer_idx]
+              >>> (
+                RE_CAST(m_i2h_grad_workspace.dptr) +
+                  (seq_rend + 1) * m_num_hidden_units_x_batch_size,
+                RE_CAST(m_i2h_grad_workspace.dptr) +
+                  (seq_rend + 1) * m_num_hidden_units_x_batch_size,
+                RE_CAST(m_i_dp_uniform_rv.dptr) +
+                  (layer_idx - 1) * param_.seq_len * m_num_hidden_units_x_batch_size +
+                                    (seq_rend + 1) * m_num_hidden_units_x_batch_size,
+                param_.i_dp_prob, m_num_hidden_units_x_batch_size * (seq_rbegin - seq_rend));
+          }  // if (param_.i_dp_prob != 0)
+          // record that we are computing m_i2h_grad_workspace
+          for (int seq_idx = seq_rbegin; seq_idx > seq_rend; --seq_idx)
+            CUDA_CALL(cudaEventRecord(m_event_i2h[layer_idx][seq_idx],
+                                      m_stream_i2h[layer_idx]));
+        } else {
+          for (int seq_idx = seq_rbegin; seq_idx > seq_rend; --seq_idx)
+            matmul_ex(m_cublas_handle,
+                      i2h_weight_grad.dptr_,
+                      RE_CAST(m_i2h_workspace.dptr) +
+                        seq_idx * 4 * m_num_hidden_units_x_batch_size,
+                      RE_CAST(m_data_T_major.dptr) +
+                        seq_idx * param_.embed_dim * param_.batch_size,
+                      CUBLAS_OP_N, CUBLAS_OP_T,
+                      4 * param_.num_hidden_units, param_.batch_size,
+                          param_.embed_dim       , param_.batch_size,
+                      1.0, 1.0);
+          matmul_stridedbatched_ex(m_cublas_handle,
+                                   RE_CAST(m_data_T_major_grad.dptr) +
+                                     (seq_rend + 1) * param_.embed_dim * param_.batch_size,
+                                   i2h_weight.dptr_,
+                                   RE_CAST(m_i2h_workspace.dptr) +
+                                     (seq_rend + 1) * 4 * m_num_hidden_units_x_batch_size,
+                                   CUBLAS_OP_T, CUBLAS_OP_N,
+                                   4 * param_.num_hidden_units, param_.embed_dim,
+                                   4 * param_.num_hidden_units, param_.batch_size,
+                                   param_.embed_dim * param_.batch_size,
+                                   0,
+                                   4 * m_num_hidden_units_x_batch_size,
+                                   seq_rbegin - seq_rend,
+                                   1.0, 0.0);
+        }  // if (layer_idx > 0)
+      }  // layer_idx
+    }  // schedule
+    CUDA_CALL(cudaMemcpyAsync(i2h_bias_grad.dptr_,
+                              h2h_bias_grad.dptr_,
+                              param_.num_layers * 4 * param_.num_hidden_units * sizeof(float),
+                              cudaMemcpyDeviceToDevice,
+                              m_stream_h2h[0]));
+    transpose(m_cublas_handle,
+              data_grad.dptr_,
+              RE_CAST(m_data_T_major_grad.dptr),
+              param_.seq_len * param_.embed_dim,
+              param_.batch_size);
+    CUDA_CALL(cudaStreamSynchronize(m_stream_i2h[0]));
+    CUDA_CALL(cudaStreamSynchronize(m_stream_h2h[0]));
+  }
+
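As the preparation comments note, i2h_bias_grad is never accumulated on its own: the fused LSTM kernel adds i2h_bias and h2h_bias to the same gate pre-activations, so their gradients are identical and the backward pass simply copies h2h_bias_grad into i2h_bias_grad at the end. A small NumPy sketch of that identity, for illustration only:

import numpy as np

batch_size, num_hidden = 32, 200
# upstream gradient w.r.t. the gate pre-activations z = x*Wi^T + h*Wh^T + bi + bh
dz = np.random.randn(batch_size, 4 * num_hidden)

# dz/dbi == dz/dbh == 1, so both bias gradients are the batch sum of dz
d_bi = dz.sum(axis=0)
d_bh = dz.sum(axis=0)
assert np.allclose(d_bi, d_bh)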
+ private:
+  void Init(mshadow::Stream<gpu> *s,
+            const std::vector<TBlob> &in_data,
+            const std::vector<TBlob> &out_data) {
+    using namespace mshadow;
+
+    std::size_t in_expected = 7, out_expected = param_.state_outputs ? 3 : 1;
+
+    CHECK_EQ(in_data.size(), in_expected);
+    CHECK_EQ(out_data.size(), out_expected);
+
+    if (!initialized_) {
+      initialized_ = true;
+      Tensor<gpu, 3, float> data = in_data[open_lstm_rnn_enum::kData].get<gpu, 3, float>(s);
+      //=======================================================================
+      // infer the parameters from the input data
+      param_.batch_size = data.shape_[0];
+      param_.seq_len    = data.shape_[1];
+      param_.embed_dim  = data.shape_[2];
+      m_num_hidden_units_x_batch_size = param_.batch_size * param_.num_hidden_units;
+      //=======================================================================
+      // allocate workspace
+      m_data_T_major       = Storage::Get()->Alloc(param_.seq_len * param_.embed_dim *
+                                                     param_.batch_size * sizeof(float),
+                                                   Context::GPU());
+      m_data_T_major_grad  = Storage::Get()->Alloc(param_.seq_len * param_.embed_dim *
+                                                     param_.batch_size * sizeof(float),
+                                                   Context::GPU());
+      m_cell               = Storage::Get()->Alloc(param_.num_layers * (param_.seq_len + 1) *
+                                                     m_num_hidden_units_x_batch_size *
+                                                     sizeof(float),
+                                                   Context::GPU());
+      m_hidden             = Storage::Get()->Alloc(param_.num_layers * (param_.seq_len + 1) *
+                                                     m_num_hidden_units_x_batch_size *
+                                                     sizeof(float),
+                                                   Context::GPU());
+      m_cell_grad          = Storage::Get()->Alloc(param_.num_layers *
+                                                     m_num_hidden_units_x_batch_size *
+                                                     sizeof(float),
+                                                   Context::GPU());
+      m_i2h_workspace      = Storage::Get()->Alloc(param_.seq_len    * 4 *
+                                                     m_num_hidden_units_x_batch_size *
+                                                     sizeof(float),
+                                                   Context::GPU());
+      m_h2h_workspace      = Storage::Get()->Alloc(param_.num_layers * 4 *
+                                                     m_num_hidden_units_x_batch_size *
+                                                     sizeof(float),
+                                                   Context::GPU());
+      m_i2h_grad_workspace = Storage::Get()->Alloc(param_.seq_len *
+                                                     m_num_hidden_units_x_batch_size *
+                                                     sizeof(float),
+                                                   Context::GPU());
+      m_h2h_grad_workspace = Storage::Get()->Alloc(param_.num_layers *
+                                                     m_num_hidden_units_x_batch_size *
+                                                     sizeof(float),
+                                                   Context::GPU());
+      m_linear_gates       = Storage::Get()->Alloc(param_.num_layers * param_.seq_len * 4 *
+                                                     m_num_hidden_units_x_batch_size *
+                                                     sizeof(float),
+                                                   Context::GPU());
+      //=======================================================================
+      // initialize forward and backward compute schedule
+       m_forward_schedule = ScheduleList(param_.num_layers, param_.seq_len);
+      m_backward_schedule = ScheduleList(param_.num_layers, param_.seq_len);
+      m_forward_schedule.init(1); m_backward_schedule.init(1);
+      //=======================================================================
+      // initialize workers (cudaStream and cudaEvent)
+      CUBLAS_CALL(cublasCreate(&m_cublas_handle));
+      m_stream_i2h = new cudaStream_t[param_.num_layers];
+      m_stream_h2h = new cudaStream_t[param_.num_layers];
+      for (unsigned layer_idx = 0; layer_idx < param_.num_layers; ++layer_idx) {
+        CUDA_CALL(cudaStreamCreateWithPriority(&m_stream_i2h[layer_idx],
+                                               cudaStreamNonBlocking,
+                                               0));
+        CUDA_CALL(cudaStreamCreateWithPriority(&m_stream_h2h[layer_idx],
+                                               cudaStreamNonBlocking,
+                                               -1));
+      }
+      m_event_i2h = new cudaEvent_t*[param_.num_layers];
+      m_event_h2h = new cudaEvent_t*[param_.num_layers];
+      for (unsigned layer_idx = 0; layer_idx < param_.num_layers; ++layer_idx) {
+        m_event_i2h[layer_idx] = new cudaEvent_t[param_.seq_len];
+        m_event_h2h[layer_idx] = new cudaEvent_t[param_.seq_len];
+        for (unsigned seq_idx = 0; seq_idx < param_.seq_len; ++seq_idx) {
+          CUDA_CALL(cudaEventCreateWithFlags(&m_event_i2h[layer_idx][seq_idx],
+                                             cudaEventDisableTiming));
+          CUDA_CALL(cudaEventCreateWithFlags(&m_event_h2h[layer_idx][seq_idx],
+                                             cudaEventDisableTiming));
+        }
+      }
+      //=======================================================================
+      // determine the algorithm for backward pass
+      if ((param_.batch_size % 128) <= 32) {
+        m_algo = enumBackwardReduceAlgo::_32_HIERARCHICAL;
+      } else if ((param_.batch_size % 128) <= 64) {
+        m_algo = enumBackwardReduceAlgo::_64_HIERARCHICAL;
+      } else {
+        m_algo = enumBackwardReduceAlgo::_128_HIERARCHICAL;
+      }
+      //=======================================================================
+      // initialize input dropout random variable, if needed
+      if (param_.i_dp_prob != 0 && param_.num_layers > 1) {
+        CURAND_CALL(curandCreateGenerator(&m_rng,
+                                          CURAND_RNG_PSEUDO_DEFAULT));
+        CURAND_CALL(curandSetPseudoRandomGeneratorSeed(m_rng,
+                                                       time(nullptr)));
+        m_i_dp_uniform_rv = Storage::Get()->Alloc((param_.num_layers - 1) * param_.seq_len *
+                                                    m_num_hidden_units_x_batch_size *
+                                                    sizeof(float),
+                                                  Context::GPU());
+        m_i_dp_workspace  = Storage::Get()->Alloc((param_.num_layers - 1) * param_.seq_len *
+                                                    m_num_hidden_units_x_batch_size *
+                                                    sizeof(float),
+                                                  Context::GPU());
+      }  // if (param_.i_dp_prob != 0 && param_.num_layers > 1)
+    }  // if (!initialized_)
+  }
+
+  bool initialized_; OpenLSTMRNNParam param_;
+
+  struct Schedule {
+    unsigned m_layer_begin, m_layer_end, m_seq_begin, m_seq_end;
+  };
+  struct ScheduleList : public std::vector<Schedule> {
+   private:
+    unsigned m_num_layers, m_seq_len;
+
+   public:
+    ScheduleList() {}
+    ScheduleList(const unsigned & num_layers, const unsigned & seq_len) :
+      m_num_layers(num_layers), m_seq_len(seq_len)
+    {}
+    // Initialize the compute schedule, an essential part for wavefront parallelism.
+    void init(const unsigned & schedule_time_stride) {
+      for (int layer_begin = 0, layer_end = 0, seq_begin = 0, seq_end = 0; ;) {
+        if (layer_end == 0) {
+          layer_begin = 0; layer_end = 1; seq_begin = 0;
+        } else {
+          // move up and left
+          ++layer_begin; ++layer_end;
+          seq_begin -= schedule_time_stride;
+          // over the top or off the left, reset to layer 0
+          if (layer_end > static_cast < int > (m_num_layers) || seq_begin < 0) {
+            seq_begin += (layer_begin + 1) * schedule_time_stride;
+            layer_begin = 0; layer_end = 1;
+          }
+          while (seq_begin >= static_cast < int > (m_seq_len) &&
+                 layer_end <= static_cast < int > (m_num_layers)) {
+            ++layer_begin; ++layer_end;
+            seq_begin -= schedule_time_stride;
+          }
+          // over the top or off the left -> DONE!
+          if (layer_end > static_cast < int > (m_num_layers) || seq_begin < 0)
+            break;
+        }  // if (layer_end == 0)
+        seq_end = seq_begin + schedule_time_stride;
+        // prevent overflow
+        if (seq_end > static_cast < int > (m_seq_len))
+          seq_end = m_seq_len;
+        //==============================================================
+        // Record the schedule entry for this wavefront step
+        //==============================================================
+        Schedule schedule;
+        schedule.m_layer_begin = layer_begin; schedule.m_layer_end = layer_end;
+        schedule.m_seq_begin = seq_begin; schedule.m_seq_end = seq_end;
+        this->push_back(schedule);
+      }
+    }
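+    // For example (an illustrative trace, not part of the operator itself):
+    // with num_layers = 2, seq_len = 3 and schedule_time_stride = 1, init()
+    // would emit one entry per (layer, time step) cell, enumerated
+    // anti-diagonal by anti-diagonal:
+    //   (L0, t0) | (L0, t1), (L1, t0) | (L0, t2), (L1, t1) | (L1, t2)
+    // Cells on the same anti-diagonal have no data dependence on one another,
+    // which is what allows their i2h/h2h GEMMs to overlap on separate streams.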
+  } m_forward_schedule, m_backward_schedule;
+
+  unsigned m_num_hidden_units_x_batch_size;
+
+  Storage::Handle m_data_T_major, m_data_T_major_grad,
+                  m_cell, m_hidden,
+                  m_cell_grad,
+                  m_i2h_workspace,
+                  m_h2h_workspace,
+                  m_i2h_grad_workspace,
+                  m_h2h_grad_workspace,
+                  m_linear_gates;
+
+  curandGenerator_t m_rng;
+  Storage::Handle m_i_dp_uniform_rv,
+                  m_i_dp_workspace;
+
+  cudaStream_t *m_stream_i2h, *m_stream_h2h;
+  cudaEvent_t **m_event_i2h, **m_event_h2h;
+
+  cublasHandle_t m_cublas_handle;
+
+  enumBackwardReduceAlgo m_algo;
+};  // CUOpenLSTMRNNOp
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_CONTRIB_CU_OPEN_LSTM_RNN_INL_CUH_
diff --git a/src/operator/contrib/open_lstm_rnn-inl.h b/src/operator/contrib/open_lstm_rnn-inl.h
new file mode 100644
index 00000000000..20b59b70a99
--- /dev/null
+++ b/src/operator/contrib/open_lstm_rnn-inl.h
@@ -0,0 +1,253 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file open_lstm_rnn-inl.h
+ * \brief LSTM RNN Open-Source CUDA Implementation
+ * \author Bojian (Jack) Zheng, Gennady Pekhimenko
+ */
+#ifndef MXNET_OPERATOR_CONTRIB_OPEN_LSTM_RNN_INL_H_
+#define MXNET_OPERATOR_CONTRIB_OPEN_LSTM_RNN_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include <algorithm>
+#include "../operator_common.h"
+
+namespace mxnet {
+namespace op {
+
+namespace open_lstm_rnn_enum {
+  enum OpenLSTMRNNOpInputs {kData, kInitHidden, kInitCell,
+                            ki2hWeight, ki2hBias,
+                            kh2hWeight, kh2hBias};
+  enum OpenLSTMRNNOpOutputs {kConcatHiddenStates,
+                             kFinalHidden, kFinalCell};
+}
+
+struct OpenLSTMRNNParam : public dmlc::Parameter < OpenLSTMRNNParam > {
+  // parameters that determine RNN configurations
+  uint32_t num_layers, num_hidden_units;
+  float i_dp_prob;  // input dropout probability
+  // parameters that are inferred from input data
+  unsigned batch_size, seq_len, embed_dim;
+
+  bool state_outputs;  // whether to output the final hidden and cell state
+
+  DMLC_DECLARE_PARAMETER(OpenLSTMRNNParam) {
+    DMLC_DECLARE_FIELD(num_layers)
+    .describe("number of stacked layers");
+    DMLC_DECLARE_FIELD(num_hidden_units)
+    .describe("number of hidden units");
+    DMLC_DECLARE_FIELD(i_dp_prob).set_default(0.).set_range(0, 1)
+    .describe("input dropout probability");
+
+    DMLC_DECLARE_FIELD(state_outputs).set_default(false)
+    .describe("Whether to have final hidden and cell states as symbol outputs");
+  }
+};
+
+template < typename xpu, typename DType >
+class OpenLSTMRNNOp : public Operator {
+ public:
+  explicit OpenLSTMRNNOp(OpenLSTMRNNParam p)
+  {}
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    // using namespace mshadow;
+    // using namespace mshadow::expr;
+    // OpenLSTMRNN can only run on the GPU
+  }
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    // using namespace mshadow;
+    // using namespace mshadow::expr;
+    // OpenLSTMRNN can only run on the GPU
+  }
+};
+
+template < typename xpu >
+Operator * CreateOp(OpenLSTMRNNParam param, int dtype);
+
+#if DMLC_USE_CXX11
+class OpenLSTMRNNProp : public OperatorProperty {
+ public:
+  std::vector<std::string> ListArguments() const override {
+    return {"data", "init_hidden", "init_cell",
+            "i2h_weight", "i2h_bias",
+            "h2h_weight", "h2h_bias"};
+  }
+  std::vector<std::string> ListOutputs() const override {
+    if (param_.state_outputs)
+      return {"concat_hidden_states",
+              "final_hidden", "final_cell"};
+    else
+      return {"concat_hidden_states"};
+  }
+
+  int NumOutputs() const override {
+    if (param_.state_outputs)
+      return 3;
+    else
+      return 1;
+  }
+
+  void Init(const std::vector<std::pair<std::string, std::string>> &kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 7U) << "Input: [data, init_hidden, init_cell, "
+                                      "i2h_weight, i2h_bias, "
+                                      "h2h_weight, h2h_bias]";
+    const TShape & dshape = (*in_shape)[open_lstm_rnn_enum::kData];
+    CHECK_EQ(dshape.ndim(), 3U) << "Input data should be a rank-3 tensor of shape "
+                                   "[batch_size, seq_len, embed_dim]";
+    unsigned batch_size = dshape[0], embed_dim = dshape[2],
+             num_layers = param_.num_layers,
+             num_hidden_units = param_.num_hidden_units;
+    SHAPE_ASSIGN_CHECK(*in_shape, open_lstm_rnn_enum::kInitHidden,
+                       Shape3(num_layers, batch_size, num_hidden_units));
+    SHAPE_ASSIGN_CHECK(*in_shape, open_lstm_rnn_enum::kInitCell,
+                       Shape3(num_layers, batch_size, num_hidden_units));
+    SHAPE_ASSIGN_CHECK(*in_shape, open_lstm_rnn_enum::ki2hWeight,
+                       Shape1(4 * num_hidden_units * embed_dim +
+                              (num_layers-1) * 4 * num_hidden_units * num_hidden_units));
+    SHAPE_ASSIGN_CHECK(*in_shape, open_lstm_rnn_enum::ki2hBias,
+                       Shape1(num_layers * 4 * num_hidden_units));
+    SHAPE_ASSIGN_CHECK(*in_shape, open_lstm_rnn_enum::kh2hWeight,
+                       Shape1(num_layers * 4 * num_hidden_units * num_hidden_units));
+    SHAPE_ASSIGN_CHECK(*in_shape, open_lstm_rnn_enum::kh2hBias,
+                       Shape1(num_layers * 4 * num_hidden_units));
+    out_shape->clear();
+    // oshape: [batch_size x seq_len x num_hidden_units]
+    TShape oshape = dshape; oshape[2] = num_hidden_units;
+    out_shape->push_back(oshape);  // concatenated hidden states
+    if (param_.state_outputs) {
+      oshape[0] = num_layers;
+      oshape[1] = batch_size;
+      oshape[2] = num_hidden_units;
+      out_shape->push_back(oshape);  // final hidden state
+      out_shape->push_back(oshape);  // final cell state
+    }
+    return true;
+  }
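+  // Worked example of the flat parameter layout (illustrative numbers only):
+  // with num_layers = 2, num_hidden_units = 256 and embed_dim = 128,
+  //   i2h_weight holds 4*256*128 + 1*4*256*256 = 393216 elements
+  //   (layer 0 maps embed_dim -> 4H, every later layer maps H -> 4H),
+  //   h2h_weight holds 2*4*256*256 = 524288 elements, and
+  //   each bias vector holds num_layers * 4 * num_hidden_units = 2048 elements.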
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_GE(in_type->size(), 1U);
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "First input must have specified type";
+    for (std::size_t i = 0; i < in_type->size(); ++i) {
+      if ((*in_type)[i] == -1)
+        (*in_type)[i] = dtype;
+      else
+        CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. " <<
+                                          "Expected " << dtype << " vs. given " <<
+                                          (*in_type)[i] << " at " << ListArguments()[i];
+    }
+    out_type->clear();
+    out_type->push_back(dtype);  // concatenated hidden states
+    if (param_.state_outputs) {
+      out_type->push_back(dtype);
+      out_type->push_back(dtype);
+    }
+    return true;
+  }
+
+  OperatorProperty * Copy() const override {
+    auto ptr = new OpenLSTMRNNProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "OpenLSTMRNN";
+  }
+
+  std::vector<int> DeclareBackwardDependency(const std::vector<int> &out_grad,
+                                             const std::vector<int> &in_data,
+                                             const std::vector<int> &out_data) const override {
+    std::vector<int> dep = {in_data[open_lstm_rnn_enum::kData],
+                            in_data[open_lstm_rnn_enum::kInitHidden],
+                            in_data[open_lstm_rnn_enum::kInitCell],
+                            in_data[open_lstm_rnn_enum::ki2hWeight],
+                            in_data[open_lstm_rnn_enum::ki2hBias],
+                            in_data[open_lstm_rnn_enum::kh2hWeight],
+                            in_data[open_lstm_rnn_enum::kh2hBias],
+                            out_data[open_lstm_rnn_enum::kConcatHiddenStates],
+                            out_grad[open_lstm_rnn_enum::kConcatHiddenStates]};
+    if (param_.state_outputs) {
+      dep.push_back(out_data[open_lstm_rnn_enum::kFinalHidden]);
+      dep.push_back(out_grad[open_lstm_rnn_enum::kFinalHidden]);
+      dep.push_back(out_data[open_lstm_rnn_enum::kFinalCell]);
+      dep.push_back(out_grad[open_lstm_rnn_enum::kFinalCell]);
+    }
+    return dep;
+  }
+
+  std::vector<ResourceRequest> ForwardResource(
+    const std::vector<TShape> &in_shape) const override {
+    return {};
+  }
+
+  std::vector<ResourceRequest> BackwardResource(
+    const std::vector<TShape> &in_shape) const override {
+    return {};
+  }
+
+  Operator * CreateOperator(Context ctx) const override {
+    LOG(ERROR) << "OpenLSTMRNN can only run on the GPU.";
+    return NULL;
+  }
+
+  Operator * CreateOperatorEx(Context ctx,
+                              std::vector<TShape> *in_shape,
+                              std::vector<int> *in_type) const override;
+
+ private:
+  OpenLSTMRNNParam param_;
+};
+
+#endif
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_CONTRIB_OPEN_LSTM_RNN_INL_H_
diff --git a/src/operator/contrib/open_lstm_rnn.cc b/src/operator/contrib/open_lstm_rnn.cc
new file mode 100644
index 00000000000..ac2ab4a802d
--- /dev/null
+++ b/src/operator/contrib/open_lstm_rnn.cc
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file open_lstm_rnn.cc
+ * \brief LSTM RNN Open-Source CUDA Implementation
+ * \author Bojian (Jack) Zheng, Gennady Pekhimenko
+ */
+#include "./open_lstm_rnn-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template <>
+Operator *CreateOp<cpu>(OpenLSTMRNNParam param, int dtype) {
+  LOG(ERROR) << "OpenLSTMRNN is only available for gpu at the moment.";
+
+  Operator * op = nullptr;
+
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType,
+                           {op = new OpenLSTMRNNOp<cpu, DType>(param);});
+
+  return op;
+}
+
+Operator *OpenLSTMRNNProp::CreateOperatorEx(Context ctx,
+                                            std::vector<TShape> *in_shape,
+                                            std::vector<int> *in_type) const {
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
+}
+
+DMLC_REGISTER_PARAMETER(OpenLSTMRNNParam);
+
+MXNET_REGISTER_OP_PROPERTY(OpenLSTMRNN, OpenLSTMRNNProp)
+.describe(R"code(Applies a LSTM recurrent layer to input, with multi-layer BUT NOT bidirectional support.
+**LSTM**
+Long Short-Term Memory - Hochreiter, 1997.
+.. math::
+  \begin{array}{ll}
+            i_t = \mathrm{sigmoid}(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
+            f_t = \mathrm{sigmoid}(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
+            g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hc} h_{(t-1)} + b_{hg}) \\
+            o_t = \mathrm{sigmoid}(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
+            c_t = f_t * c_{(t-1)} + i_t * g_t \\
+            h_t = o_t * \tanh(c_t)
+            \end{array})code")
+.add_argument("data", "NDArray-or-Symbol", "Input data to EcoRNN")
+.add_argument("init_hidden", "NDArray-or-Symbol", "Initial Hidden State")
+.add_argument("init_cell"  , "NDArray-or-Symbol", "Initial Cell State")
+.add_argument("i2h_weight" , "NDArray-or-Symbol",  "Input-to-Hidden Weight")
+.add_argument("i2h_bias"   , "NDArray-or-Symbol",  "Input-to-Hidden Bias")
+.add_argument("h2h_weight" , "NDArray-or-Symbol", "Hidden-to-Hidden Weight")
+.add_argument("h2h_bias"   , "NDArray-or-Symbol", "Hidden-to-Hidden Bias")
+.add_arguments(OpenLSTMRNNParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/contrib/open_lstm_rnn.cu b/src/operator/contrib/open_lstm_rnn.cu
new file mode 100644
index 00000000000..52c304d973e
--- /dev/null
+++ b/src/operator/contrib/open_lstm_rnn.cu
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file open_lstm_rnn.cu
+ * \brief LSTM RNN Open-Source CUDA Implementation
+ * \author Bojian (Jack) Zheng, Gennady Pekhimenko
+ */
+#include "./cu_open_lstm_rnn-inl.cuh"
+
+namespace mxnet {
+namespace op {
+
+template <>
+Operator *CreateOp<gpu>(OpenLSTMRNNParam param, int dtype) {
+  Operator * op = NULL;
+
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType,
+                           {op = new CUOpenLSTMRNNOp(param);});
+
+  return op;
+}
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/contrib/open_lstm_rnn_include/cublas_matmul.h b/src/operator/contrib/open_lstm_rnn_include/cublas_matmul.h
new file mode 100644
index 00000000000..348984fcf3f
--- /dev/null
+++ b/src/operator/contrib/open_lstm_rnn_include/cublas_matmul.h
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file cublas_matmul.h
+ * \brief Matmul cuBLAS Implementation
+ * \author Bojian (Jack) Zheng, Gennady Pekhimenko
+ */
+#ifndef MXNET_OPERATOR_CONTRIB_OPEN_LSTM_RNN_INCLUDE_CUBLAS_MATMUL_H_
+#define MXNET_OPERATOR_CONTRIB_OPEN_LSTM_RNN_INCLUDE_CUBLAS_MATMUL_H_
+
+#include "../../../common/cuda_utils.h"
+
+// Matmul using cuBLAS approach.
+// @param1 cublas_handle: cuBLAS Handle
+// @param2 _o: Output
+// [_1_rows x _2_cols]
+// @param3 _1: 1st Operand
+// [_1_rows x _1_cols_2_rows]
+// @param4 _2: 2nd Operand
+// [_1_cols_2_rows x _2_cols]
+// @param5 _1_rows: Rows of 1st Operand
+// @param6 _2_cols: Cols of 2nd Operand
+// @param7 _1_cols_2_rows: Cols of 1st Operand (or Rows of 2nd Operand)
+inline void matmul
+  (
+    const cublasHandle_t cublas_handle,
+          float * const _o,
+    const float * const _1,
+    const float * const _2,
+    const unsigned & _1_rows, const unsigned & _2_cols,
+    const unsigned & _1_cols_2_rows
+  ) {
+  static const float alpha = 1.0, beta = 0.0;
+
+  // cuBLAS Matrix Multiply - C = alpha * op(A) * op(B) + beta * C
+  // @param1  handle: cuBLAS Handle
+  // @param2  transa: Transpose matrix A?
+  // @param3  transb: Transpose matrix B?
+  // @param4  m: Number of Rows of Matrix A and C
+  // @param5  n: Number of Cols of Matrix B and C
+  // @param6  k: Number of Cols of Matrix A and Rows of B
+  // @param7  alpha
+  // @param8  A
+  // @param9  lda: Leading Dimension of A
+  // @param10 B
+  // @param11 ldb: Leading Dimension of B
+  // @param12 beta
+  // @param13 C
+  // @param14 ldc: Leading Dimension of C
+  CUBLAS_CALL(cublasSgemm(cublas_handle,
+    CUBLAS_OP_N, CUBLAS_OP_N,
+    _2_cols, _1_rows, _1_cols_2_rows,
+    &alpha,
+    _2, _2_cols,
+    _1, _1_cols_2_rows,
+    &beta,
+    _o, _2_cols));
+}
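+// Note on the operand order above (a usage sketch assuming row-major inputs):
+// cuBLAS expects column-major storage, so to compute a row-major C = A * B the
+// wrapper hands the operands to cublasSgemm in reverse order.  Computing
+// B^T * A^T in column-major terms yields (A * B)^T in column-major layout,
+// which is exactly A * B laid out row-major.  For instance, multiplying a
+// row-major A[2 x 3] by B[3 x 4] into C[2 x 4] would be invoked as
+//   matmul(handle, C, A, B, /*_1_rows=*/2, /*_2_cols=*/4, /*_1_cols_2_rows=*/3);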
+// Matmul using cuBLAS approach (Extended Version).
+// @param1  cublas_handle: cuBLAS Handle
+// @param2  _o: Output
+// [_1_rows x _2_cols]
+// @param3  _1: 1st Operand
+// [_1_rows x _1_cols_2_rows]
+// @param4  _2: 2nd Operand
+// [_1_cols_2_rows x _2_cols]
+// @param5  _1_op: Operation on 1st Operand (Normal? Transpose?)
+// @param6  _2_op: Operation on 2nd Operand (Normal? Transpose?)
+// @param7  _1_rows: Rows of 1st Operand
+// @param8  _1_cols: Cols of 1st Operand
+// @param9  _2_rows: Rows of 2nd Operand
+// @param10 _2_cols: Cols of 2nd Operand
+// @param11 alpha: scaling factor applied to op(_1) * op(_2)
+// @param12  beta: scaling factor applied to the existing contents of _o
+inline void matmul_ex
+  (
+    const cublasHandle_t cublas_handle,
+          float * const _o,
+    const float * const _1,
+    const float * const _2,
+    const cublasOperation_t & _1_op,
+    const cublasOperation_t & _2_op,
+    const unsigned & _1_rows, const unsigned & _1_cols,
+    const unsigned & _2_rows, const unsigned & _2_cols,
+    const float & alpha, const float & beta
+  ) {
+  if (_1_op == CUBLAS_OP_N) {
+    if (_2_op == CUBLAS_OP_N) {
+      assert(_1_cols == _2_rows);
+    } else {
+      assert(_1_cols == _2_cols);
+    }
+  } else {
+    if (_2_op == CUBLAS_OP_N) {
+      assert(_1_rows == _2_rows);
+    } else {
+      assert(_1_rows == _2_cols);
+    }
+  }
+
+  // cuBLAS Matrix Multiply - C = alpha * op(A) * op(B) + beta * C
+  // @param1  handle: cuBLAS Handle
+  // @param2  transa: Transpose matrix A?
+  // @param3  transb: Transpose matrix B?
+  // @param4  m: Number of Rows of Matrix op(A) and C
+  // @param5  n: Number of Cols of Matrix op(B) and C
+  // @param6  k: Number of Cols of Matrix op(A) and Rows of op(B)
+  // @param7  alpha
+  // @param8  A
+  // @param9  lda: Leading Dimension of A
+  // @param10 B
+  // @param11 ldb: Leading Dimension of B
+  // @param12 beta
+  // @param13 C
+  // @param14 ldc: Leading Dimension of C
+  CUBLAS_CALL(cublasSgemm(cublas_handle,
+    _2_op, _1_op,
+    _2_op == CUBLAS_OP_N ? _2_cols : _2_rows,
+    _1_op == CUBLAS_OP_N ? _1_rows : _1_cols,
+    _1_op == CUBLAS_OP_N ? _1_cols : _1_rows,
+    &alpha,
+    _2, _2_cols,
+    _1, _1_cols,
+    &beta,
+    _o, _2_op == CUBLAS_OP_N ? _2_cols : _2_rows));
+}
+// Strided Batched Matmul using cuBLAS approach.
+// @param1  cublas_handle: cuBLAS Handle
+// @param2  _o: Output
+// [_1_rows x _2_cols]
+// @param3  _1: 1st Operand
+// [_1_rows x _1_cols_2_rows]
+// @param4  _2: 2nd Operand
+// [_1_cols_2_rows x _2_cols]
+// @param5  _1_rows: Rows of 1st Operand
+// @param6  _2_cols: Cols of 2nd Operand
+// @param7  _1_cols_2_rows: Cols of 1st Operand (or Rows of 2nd Operand)
+// @param8  stride_o: Output Stride
+// @param9  stride_1: Stride of 1st Operand
+// @param10 stride_2: Stride of 2nd Operand
+// @param11 batch_cnt: Batch Count
+inline void matmul_stridedbatched
+  (
+    const cublasHandle_t cublas_handle,
+          float * const _o,
+    const float * const _1,
+    const float * const _2,
+    const unsigned & _1_rows, const unsigned & _2_cols,
+    const unsigned & _1_cols_2_rows,
+    const unsigned & stride_o,
+    const unsigned & stride_1,
+    const unsigned & stride_2,
+    const unsigned & batch_cnt
+  ) {
+  static const float alpha = 1.0, beta = 0.0;
+
+  // Function argument list is very similar to the API above,
+  // except that the operation is now strided batched.
+  CUBLAS_CALL(cublasSgemmStridedBatched(cublas_handle,
+    CUBLAS_OP_N, CUBLAS_OP_N,
+    _2_cols, _1_rows, _1_cols_2_rows,
+    &alpha,
+    _2, _2_cols, stride_2,
+    _1, _1_cols_2_rows, stride_1,
+    &beta,
+    _o, _2_cols, stride_o,
+    batch_cnt));
+}
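+// Usage sketch (illustrative): the strided-batched GEMM above is equivalent to
+// a loop of `batch_cnt` independent matmul() calls over contiguous slices,
+//   for (unsigned b = 0; b < batch_cnt; ++b)
+//     matmul(cublas_handle, _o + b * stride_o, _1 + b * stride_1,
+//            _2 + b * stride_2, _1_rows, _2_cols, _1_cols_2_rows);
+// but is issued as a single cuBLAS call, avoiding per-call launch overhead
+// when the individual matrices are small.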
+// Strided Batched Matmul using cuBLAS approach (Extended Version).
+// @param1  cublas_handle: cuBLAS Handle
+// @param2  _o: Output
+// [_1_rows x _2_cols]
+// @param3  _1: 1st Operand
+// [_1_rows x _1_cols_2_rows]
+// @param4  _2: 2nd Operand
+// [_1_cols_2_rows x _2_cols]
+// @param5  _1_op: Operation on 1st Operand (Normal? Transpose?)
+// @param6  _2_op: Operation on 2nd Operand (Normal? Transpose?)
+// @param7  _1_rows: Rows of 1st Operand
+// @param8  _1_cols: Cols of 1st Operand
+// @param9  _2_rows: Rows of 2nd Operand
+// @param10 _2_cols: Cols of 2nd Operand
+// @param11 stride_o: Output Stride
+// @param12 stride_1: Stride of 1st Operand
+// @param13 stride_2: Stride of 2nd Operand
+// @param14 batch_cnt: Batch Count
+// @param15 alpha: refer to matmul_ex
+// @param16  beta: refer to matmul_ex
+inline void matmul_stridedbatched_ex(
+    const cublasHandle_t cublas_handle,
+          float * const _o,
+    const float * const _1,
+    const float * const _2,
+    const cublasOperation_t & _1_op,
+    const cublasOperation_t & _2_op,
+    const unsigned & _1_rows, const unsigned & _1_cols,
+    const unsigned & _2_rows, const unsigned & _2_cols,
+    const unsigned & stride_o,
+    const unsigned & stride_1,
+    const unsigned & stride_2,
+    const unsigned & batch_cnt,
+    const float & alpha, const float & beta
+  ) {
+  if (_1_op == CUBLAS_OP_N) {
+    if (_2_op == CUBLAS_OP_N) {
+      assert(_1_cols == _2_rows);
+    } else {
+      assert(_1_cols == _2_cols);
+    }
+  } else {
+    if (_2_op == CUBLAS_OP_N) {
+      assert(_1_rows == _2_rows);
+    } else {
+      assert(_1_rows == _2_cols);
+    }
+  }
+
+  // Function argument list is very similar to the API above,
+  // except that the operation is now strided batched.
+  CUBLAS_CALL(cublasSgemmStridedBatched(cublas_handle,
+    _2_op, _1_op,
+    _2_op == CUBLAS_OP_N ? _2_cols : _2_rows,
+    _1_op == CUBLAS_OP_N ? _1_rows : _1_cols,
+    _1_op == CUBLAS_OP_N ? _1_cols : _1_rows,
+    &alpha,
+    _2, _2_cols, stride_2,
+    _1, _1_cols, stride_1,
+    &beta,
+    _o, _2_op == CUBLAS_OP_N ? _2_cols : _2_rows, stride_o,
+    batch_cnt));
+}
+
+#endif  // MXNET_OPERATOR_CONTRIB_OPEN_LSTM_RNN_INCLUDE_CUBLAS_MATMUL_H_
diff --git a/src/operator/contrib/open_lstm_rnn_include/cublas_transpose.h b/src/operator/contrib/open_lstm_rnn_include/cublas_transpose.h
new file mode 100644
index 00000000000..2dd9e71c81d
--- /dev/null
+++ b/src/operator/contrib/open_lstm_rnn_include/cublas_transpose.h
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file cublas_transpose.h
+ * \brief Transpose cuBLAS Implementation
+ * \author Bojian (Jack) Zheng, Gennady Pekhimenko
+ */
+#ifndef MXNET_OPERATOR_CONTRIB_OPEN_LSTM_RNN_INCLUDE_CUBLAS_TRANSPOSE_H_
+#define MXNET_OPERATOR_CONTRIB_OPEN_LSTM_RNN_INCLUDE_CUBLAS_TRANSPOSE_H_
+
+#include "../../../common/cuda_utils.h"
+
+// Transpose using cuBLAS approach.
+// @param1 cublas_handle: cuBLAS Handle
+// @param2 _o: Output
+// [cols x rows]
+// @param3 _1: 1st Operand
+// [rows x cols]
+// @param4 rows: Number of Rows
+// @param5 cols: Number of Cols
+inline void transpose(
+  const cublasHandle_t cublas_handle,
+        float * const _o,
+  const float * const _1,
+  const unsigned & rows, const unsigned & cols
+  ) {
+  static const float alpha = 1.0, beta = 0.0;
+
+  // cuBLAS Matrix Transpose - C = alpha * op(A) + beta * op(B)
+  // @param1  handle: cuBLAS Handle
+  // @param2  transa: Transpose matrix A?
+  // @param3  transb: Transpose matrix B?
+  // @param4  m: Number of Rows of Matrix C
+  // @param5  n: Number of Cols of Matrix C
+  // @param6  alpha
+  // @param7  A
+  // @param8  lda: Leading Dimension of A
+  // @param9  beta
+  // @param10 B
+  // @param11 ldb: Leading Dimension of B
+  // @param12 C
+  // @param13 ldc: Leading Dimension of C
+  CUBLAS_CALL(cublasSgeam(cublas_handle,
+                          CUBLAS_OP_T, CUBLAS_OP_N,
+                          rows, cols,
+                          &alpha, _1, cols,
+                          &beta, nullptr, rows,
+                          _o, rows));
+}
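+// Usage sketch: since beta is fixed to 0, the call above reduces to
+// C = alpha * op(A) = A^T, i.e. a row-major matrix `_1` of shape [rows x cols]
+// is written to `_o` as its [cols x rows] transpose.  The operator uses this,
+// for example, to convert gradients from the internal time-major layout back
+// to the [batch_size, seq_len, embed_dim] layout of the input:
+//   transpose(handle, data_grad, data_T_major_grad,
+//             seq_len * embed_dim, batch_size);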
+
+#endif  // MXNET_OPERATOR_CONTRIB_OPEN_LSTM_RNN_INCLUDE_CUBLAS_TRANSPOSE_H_
+
diff --git a/src/operator/contrib/open_lstm_rnn_include/dropout.cuh b/src/operator/contrib/open_lstm_rnn_include/dropout.cuh
new file mode 100644
index 00000000000..94dffc65e59
--- /dev/null
+++ b/src/operator/contrib/open_lstm_rnn_include/dropout.cuh
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file dropout.cuh
+ * \brief Dropout CUDA Implementation
+ * \author Bojian (Jack) Zheng, Gennady Pekhimenko
+ */
+#ifndef MXNET_OPERATOR_CONTRIB_OPEN_LSTM_RNN_INCLUDE_DROPOUT_CUH_
+#define MXNET_OPERATOR_CONTRIB_OPEN_LSTM_RNN_INCLUDE_DROPOUT_CUH_
+
+//******************************************************************************
+// Forward
+//******************************************************************************
+
+// Dropout Layer (Forward Pass)
+// @param1 (output) hidden_state_o: Output Hidden State  (after Dropout)
+// [num_hidden_units x batch_size]
+// @param2  (input) hidden_state_i:  Input Hidden State (before Dropout)
+// [num_hidden_units x batch_size]
+// @param3  (input) uniform_rv: Uniform Random Variable
+// [num_hidden_units x batch_size]
+// @param4  (param) dp_prob: Dropout Probability
+// @param5  (param) hidden_state_size:
+// Number of Hidden Units x Batch Size x Number of Batches
+static __global__ void __cuda_dropout_forward(
+        float * const __restrict__ hidden_state_o,
+  const float * const __restrict__ hidden_state_i,
+  const float * const __restrict__ uniform_rv,
+  const float dp_prob, const unsigned hidden_state_size) {
+  const unsigned g_threadIdx = threadIdx.x + blockIdx.x * blockDim.x;
+
+  if (g_threadIdx >= hidden_state_size)
+    return;
+
+  if (uniform_rv[g_threadIdx] <= dp_prob)
+    hidden_state_o[g_threadIdx] = 0;
+  else
+    // scale the kept ones by 1 / (1 - dp_prob)
+    hidden_state_o[g_threadIdx] = hidden_state_i[g_threadIdx] / (1 - dp_prob);
+}
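+// The kernel above implements "inverted" dropout: an activation is dropped with
+// probability dp_prob and every survivor is scaled by 1 / (1 - dp_prob), so the
+// expected value of each element is unchanged and no rescaling is needed at
+// inference time.  With dp_prob = 0.25, for instance, each kept activation is
+// multiplied by 4/3.  A plausible launch (the block size of 128 is only an
+// assumption for illustration) would look like:
+//   __cuda_dropout_forward<<<(size + 127) / 128, 128, 0, stream>>>(
+//       out, in, uniform_rv, 0.25f, size);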
+
+//******************************************************************************
+// Backward
+//******************************************************************************
+
+// Dropout Layer (Backward Pass)
+// @param1  (input) hidden_state_o_grad:
+//   Output Hidden State Gradient  (after Dropout)
+// [num_hidden_units x batch_size]
+// @param2 (output) hidden_state_i_grad:
+//    Input Hidden State Gradient (before Dropout)
+// [num_hidden_units x batch_size]
+// @param3  (input) uniform_rv: Uniform Random Variable
+// [num_hidden_units x batch_size]
+// @param4  (param) dp_prob: Dropout Probability
+// @param5  (param) hidden_state_size:
+// Number of Hidden Units x Batch Size x Number of Batches
+static __global__ void __cuda_dropout_backward(
+  const float * const /* __restrict__ */ hidden_state_o_grad,
+        float * const /* __restrict__ */ hidden_state_i_grad,
+  const float * const /* __restrict__ */ uniform_rv,
+  const float dp_prob, const unsigned hidden_state_size) {
+  const unsigned g_threadIdx = threadIdx.x + blockIdx.x * blockDim.x;
+
+  if (g_threadIdx >= hidden_state_size)
+    return;
+
+  if (uniform_rv[g_threadIdx] <= dp_prob)
+    hidden_state_i_grad[g_threadIdx] = 0;
+  else
+    // scale the kept ones by 1 / (1 - dp_prob)
+    hidden_state_i_grad[g_threadIdx] = hidden_state_o_grad[g_threadIdx] /
+                                         (1 - dp_prob);
+}
+
+#endif  // MXNET_OPERATOR_CONTRIB_OPEN_LSTM_RNN_INCLUDE_DROPOUT_CUH_
+
diff --git a/src/operator/contrib/open_lstm_rnn_include/lstm_cell.cuh b/src/operator/contrib/open_lstm_rnn_include/lstm_cell.cuh
new file mode 100644
index 00000000000..8ff148b7252
--- /dev/null
+++ b/src/operator/contrib/open_lstm_rnn_include/lstm_cell.cuh
@@ -0,0 +1,475 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file lstm_cell.cuh
+ * \brief LSTM Cell CUDA Implementation
+ * \author Bojian (Jack) Zheng, Gennady Pekhimenko
+ */
+#ifndef MXNET_OPERATOR_CONTRIB_OPEN_LSTM_RNN_INCLUDE_LSTM_CELL_CUH_
+#define MXNET_OPERATOR_CONTRIB_OPEN_LSTM_RNN_INCLUDE_LSTM_CELL_CUH_
+
+//******************************************************************************
+// Forward
+//******************************************************************************
+
+static __forceinline__ __device__ float __cu_sigmoidf(float i) {
+  return 1.0 / (1.0 + expf(-i));
+}
+
+// Fused Implementation of LSTM Cell (Forward Pass)
+// @param1  (input) i2h_workspace: Input  -> Hidden Temporary Workspace
+// @param2  (input) h2h_workspace: Hidden -> Hidden Temporary Workspace
+// [4 x num_hidden_units x batch_size]
+// @param3  (input) i2h_bias: Input  -> Hidden Bias
+// @param4  (input) h2h_bias: Hidden -> Hidden Bias
+// [4 x num_hidden_units]
+// @param5  (input) prev_cell_state: Previous Cell State
+// [num_hidden_units x batch_size]
+// @param6 (output) linear_gates: Linear Gates (used for Backward Pass)
+// The idea is to replay computations in Forward Pass with minimum cost.
+// [4 x num_hidden_units x batch_size]
+// @param7 (output) curr_cell_state  : Current Cell   State
+// @param8 (output) curr_hidden_state: Current Hidden State
+// [num_hidden_units x batch_size]
+// @param9  (param) num_hidden_units: Number of Hidden Units
+// @param10 (param) batch_size: Batch Size
+static __global__ void __cuda_fused_lstm_forward(
+  const float * const __restrict__ i2h_workspace,
+  const float * const __restrict__ h2h_workspace,
+  const float * const __restrict__ i2h_bias,
+  const float * const __restrict__ h2h_bias,
+  const float * const __restrict__ prev_cell_state,
+        float * const __restrict__ linear_gates,
+        float * const __restrict__ curr_cell_state,
+        float * const __restrict__ curr_hidden_state,
+  const unsigned num_hidden_units, const unsigned batch_size) {
+  const unsigned g_threadIdx = threadIdx.x + blockIdx.x * blockDim.x,
+                 num_hidden_units_x_batch_size = num_hidden_units * batch_size;
+
+  if (g_threadIdx >= num_hidden_units_x_batch_size)
+    return;
+
+  const unsigned hidden_idx = g_threadIdx / batch_size;
+
+  float gate_input[4];
+
+#pragma unroll
+  for (unsigned i = 0; i < 4; ++i) {
+    gate_input[i] =
+      i2h_workspace[i * num_hidden_units_x_batch_size + g_threadIdx] +
+      h2h_workspace[i * num_hidden_units_x_batch_size + g_threadIdx] +
+      i2h_bias[i * num_hidden_units + hidden_idx] +
+      h2h_bias[i * num_hidden_units + hidden_idx];
+    if (linear_gates != NULL)
+      linear_gates[i * num_hidden_units_x_batch_size + g_threadIdx] =
+        gate_input[i];
+  }
+
+  float  input_gate = __cu_sigmoidf(gate_input[0]);
+  float forget_gate = __cu_sigmoidf(gate_input[1]);
+  float  input_actv =          tanh(gate_input[2]);
+  float output_gate = __cu_sigmoidf(gate_input[3]);
+
+  curr_cell_state[g_threadIdx] = forget_gate * prev_cell_state[g_threadIdx] +
+                                 input_gate * input_actv;
+  curr_hidden_state[g_threadIdx] = output_gate *
+                                   tanh(curr_cell_state[g_threadIdx]);
+}
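+// In LSTM terms, gate_input[0..3] above are the pre-activations z_i, z_f, z_g,
+// z_o of the input, forget, candidate and output gates, so each thread (one
+// (hidden unit, batch element) pair) computes
+//   i_t = sigmoid(z_i), f_t = sigmoid(z_f), g_t = tanh(z_g), o_t = sigmoid(z_o)
+//   c_t = f_t * c_{t-1} + i_t * g_t
+//   h_t = o_t * tanh(c_t)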
+
+//******************************************************************************
+// Backward
+//******************************************************************************
+
+#if CUDART_VERSION >= 9000
+#define WARP_FULL_MASK 0xffffffff
+#endif  // CUDART_VERSION
+
+enum class enumBackwardReduceAlgo {PURE_ATOMICS, _32_HIERARCHICAL,
+  _64_HIERARCHICAL, _128_HIERARCHICAL};
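+// PURE_ATOMICS accumulates the per-gate bias gradients with one atomicAdd per
+// thread, while the *_HIERARCHICAL variants (for thread blocks of 32/64/128
+// threads along the batch dimension) first reduce within the block using warp
+// shuffles (plus shared memory for the 64- and 128-thread versions), so that
+// only a few atomicAdd calls per block ever reach global memory.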
+
+// This is the most generic reduction subroutine.
+static __forceinline__ __device__ void __cu_bias_grad_reduce_generic(
+  float * const __restrict__ bias_grad,
+  float  input_gate_input_grad,
+  float forget_gate_input_grad,
+  float  input_actv_input_grad,
+  float output_gate_input_grad) {
+  if ( input_gate_input_grad != 0)
+    atomicAdd(&bias_grad[0 * gridDim.x + blockIdx.x],  input_gate_input_grad);
+  if (forget_gate_input_grad != 0)
+    atomicAdd(&bias_grad[1 * gridDim.x + blockIdx.x], forget_gate_input_grad);
+  if ( input_actv_input_grad != 0)
+    atomicAdd(&bias_grad[2 * gridDim.x + blockIdx.x],  input_actv_input_grad);
+  if (output_gate_input_grad != 0)
+    atomicAdd(&bias_grad[3 * gridDim.x + blockIdx.x], output_gate_input_grad);
+}
+
+template < unsigned block_size_T >
+static __forceinline__ __device__ void __cu_bias_grad_reduce(
+  volatile float * const __restrict__ svmem_fused_lstm_backward,
+           float * const __restrict__ bias_grad,
+           float  input_gate_input_grad,
+           float forget_gate_input_grad,
+           float  input_actv_input_grad,
+           float output_gate_input_grad);
+
+template <>
+__forceinline__ __device__ void __cu_bias_grad_reduce < 32 > (
+  volatile float * const __restrict__ svmem_fused_lstm_backward,
+           float * const __restrict__ bias_grad,
+           float  input_gate_input_grad,
+           float forget_gate_input_grad,
+           float  input_actv_input_grad,
+           float output_gate_input_grad) {
+#pragma unroll
+  for (unsigned stride = 16; stride != 0; stride /= 2) {
+#if CUDART_VERSION >= 9000
+     input_gate_input_grad +=  __shfl_down_sync(WARP_FULL_MASK, input_gate_input_grad, stride);
+    forget_gate_input_grad += __shfl_down_sync(WARP_FULL_MASK, forget_gate_input_grad, stride);
+     input_actv_input_grad +=  __shfl_down_sync(WARP_FULL_MASK, input_actv_input_grad, stride);
+    output_gate_input_grad += __shfl_down_sync(WARP_FULL_MASK, output_gate_input_grad, stride);
+#else  // CUDART_VERSION < 9000
+     input_gate_input_grad +=  __shfl_down(input_gate_input_grad, stride);
+    forget_gate_input_grad += __shfl_down(forget_gate_input_grad, stride);
+     input_actv_input_grad +=  __shfl_down(input_actv_input_grad, stride);
+    output_gate_input_grad += __shfl_down(output_gate_input_grad, stride);
+#endif  // CUDART_VERSION
+  }
+  if (threadIdx.x == 0) {
+    atomicAdd(&bias_grad[0 * gridDim.x + blockIdx.x],  input_gate_input_grad);
+    atomicAdd(&bias_grad[1 * gridDim.x + blockIdx.x], forget_gate_input_grad);
+    atomicAdd(&bias_grad[2 * gridDim.x + blockIdx.x],  input_actv_input_grad);
+    atomicAdd(&bias_grad[3 * gridDim.x + blockIdx.x], output_gate_input_grad);
+  }
+}
+
+template <>
+__forceinline__ __device__ void __cu_bias_grad_reduce < 64 > (
+  volatile float * const __restrict__ svmem_fused_lstm_backward,
+           float * const __restrict__ bias_grad,
+           float  input_gate_input_grad,
+           float forget_gate_input_grad,
+           float  input_actv_input_grad,
+           float output_gate_input_grad) {
+  volatile float * svmem__input_gate_input_grad = svmem_fused_lstm_backward +
+                                                  0 * blockDim.x;
+  volatile float * svmem_forget_gate_input_grad = svmem_fused_lstm_backward +
+                                                  1 * blockDim.x;
+  volatile float * svmem__input_actv_input_grad = svmem_fused_lstm_backward +
+                                                  2 * blockDim.x;
+  volatile float * svmem_output_gate_input_grad = svmem_fused_lstm_backward +
+                                                  3 * blockDim.x;
+
+  svmem__input_gate_input_grad[threadIdx.x] =  input_gate_input_grad;
+  svmem_forget_gate_input_grad[threadIdx.x] = forget_gate_input_grad;
+  svmem__input_actv_input_grad[threadIdx.x] =  input_actv_input_grad;
+  svmem_output_gate_input_grad[threadIdx.x] = output_gate_input_grad;
+  __syncthreads();
+  // Up to this point, shared memory is initialized with gate gradients.
+  //======================================================================
+  // Note that starting from this point, execution becomes warp-wide and
+  // therefore there is no further synchronization between threads.
+  // The 1st warp is responsible for the reduction of  input_gate, forget_gate.
+  // The 2nd warp is responsible for the reduction of  input_actv, output_gate.
+  if (threadIdx.x < 32) {  // [0, 32)
+     input_gate_input_grad += svmem__input_gate_input_grad[threadIdx.x + 32];
+    forget_gate_input_grad += svmem_forget_gate_input_grad[threadIdx.x + 32];
+#pragma unroll
+    for (unsigned stride = 16; stride != 0; stride /= 2) {
+#if CUDART_VERSION >= 9000
+       input_gate_input_grad +=  __shfl_down_sync(WARP_FULL_MASK, input_gate_input_grad, stride);
+      forget_gate_input_grad += __shfl_down_sync(WARP_FULL_MASK, forget_gate_input_grad, stride);
+#else  // CUDART_VERSION < 9000
+       input_gate_input_grad +=  __shfl_down(input_gate_input_grad, stride);
+      forget_gate_input_grad += __shfl_down(forget_gate_input_grad, stride);
+#endif  // CUDART_VERSION
+    }
+  } else {  // [32, 64)
+     input_actv_input_grad += svmem__input_actv_input_grad[threadIdx.x - 32];
+    output_gate_input_grad += svmem_output_gate_input_grad[threadIdx.x - 32];
+#pragma unroll
+    for (unsigned stride = 16; stride != 0; stride /= 2) {
+#if CUDART_VERSION >= 9000
+       input_actv_input_grad +=  __shfl_down_sync(WARP_FULL_MASK, input_actv_input_grad, stride);
+      output_gate_input_grad += __shfl_down_sync(WARP_FULL_MASK, output_gate_input_grad, stride);
+#else  // CUDART_VERSION < 9000
+       input_actv_input_grad +=  __shfl_down(input_actv_input_grad, stride);
+      output_gate_input_grad += __shfl_down(output_gate_input_grad, stride);
+#endif  // CUDART_VERSION
+    }
+  }
+  //======================================================================
+  // Reduction is complete. Update global memory using atomic functions.
+  if (threadIdx.x ==  0) {
+    atomicAdd(&bias_grad[0 * gridDim.x + blockIdx.x],  input_gate_input_grad);
+    atomicAdd(&bias_grad[1 * gridDim.x + blockIdx.x], forget_gate_input_grad);
+  }
+  if (threadIdx.x == 32) {
+    atomicAdd(&bias_grad[2 * gridDim.x + blockIdx.x],  input_actv_input_grad);
+    atomicAdd(&bias_grad[3 * gridDim.x + blockIdx.x], output_gate_input_grad);
+  }
+}
+
+template <>
+__forceinline__ __device__ void __cu_bias_grad_reduce < 128 > (
+  volatile float * const __restrict__ svmem_fused_lstm_backward,
+           float * const __restrict__ bias_grad,
+           float  input_gate_input_grad,
+           float forget_gate_input_grad,
+           float  input_actv_input_grad,
+           float output_gate_input_grad) {
+  volatile float * svmem__input_gate_input_grad = svmem_fused_lstm_backward +
+                                                  0 * blockDim.x;
+  volatile float * svmem_forget_gate_input_grad = svmem_fused_lstm_backward +
+                                                  1 * blockDim.x;
+  volatile float * svmem__input_actv_input_grad = svmem_fused_lstm_backward +
+                                                  2 * blockDim.x;
+  volatile float * svmem_output_gate_input_grad = svmem_fused_lstm_backward +
+                                                  3 * blockDim.x;
+
+  svmem__input_gate_input_grad[threadIdx.x] =  input_gate_input_grad;
+  svmem_forget_gate_input_grad[threadIdx.x] = forget_gate_input_grad;
+  svmem__input_actv_input_grad[threadIdx.x] =  input_actv_input_grad;
+  svmem_output_gate_input_grad[threadIdx.x] = output_gate_input_grad;
+  __syncthreads();
+  // Up to this point, shared memory is initialized with gate gradients.
+  //======================================================================
+  // apply interleaving parallel reduction
+  // The 1st 64 threads are responsible for reduction in  input_gate, forget_gate.
+  // The 2nd 64 threads are responsible for reduction in  input_actv, output_gate.
+  if (threadIdx.x < 64) {  // [0, 64)
+    svmem__input_gate_input_grad[threadIdx.x] =
+       input_gate_input_grad =
+       input_gate_input_grad + svmem__input_gate_input_grad[threadIdx.x + 64];
+    svmem_forget_gate_input_grad[threadIdx.x] =
+      forget_gate_input_grad =
+      forget_gate_input_grad + svmem_forget_gate_input_grad[threadIdx.x + 64];
+  } else {  // [64, 128)
+    svmem__input_actv_input_grad[threadIdx.x] =
+       input_actv_input_grad =
+       input_actv_input_grad + svmem__input_actv_input_grad[threadIdx.x - 64];
+    svmem_output_gate_input_grad[threadIdx.x] =
+      output_gate_input_grad =
+      output_gate_input_grad + svmem_output_gate_input_grad[threadIdx.x - 64];
+  }
+  __syncthreads();
+  // Note that starting from this point, execution becomes warp-wide
+  // and therefore there is no further synchronization between threads.
+  // The 1st warp ( 0 ~  31) is responsible for the reduction of  input_gate.
+  // The 2nd warp (32 ~  63) is responsible for the reduction of forget_gate.
+  // The 3rd warp (64 ~  95) is responsible for the reduction of  input_actv.
+  // The 4th warp (96 ~ 127) is responsible for the reduction of output_gate.
+  if (threadIdx.x < 32) {  // [0, 32)
+     input_gate_input_grad += svmem__input_gate_input_grad[threadIdx.x + 32];
+#pragma unroll
+    for (unsigned stride = 16; stride != 0; stride /= 2)
+#if CUDART_VERSION >= 9000
+       input_gate_input_grad +=  __shfl_down_sync(WARP_FULL_MASK, input_gate_input_grad, stride);
+#else  // CUDART_VERSION < 9000
+       input_gate_input_grad +=  __shfl_down(input_gate_input_grad, stride);
+#endif  // CUDART_VERSION
+  } else if (threadIdx.x < 64) {  // [32, 64)
+    forget_gate_input_grad += svmem_forget_gate_input_grad[threadIdx.x - 32];
+#pragma unroll
+    for (unsigned stride = 16; stride != 0; stride /= 2)
+#if CUDART_VERSION >= 9000
+      forget_gate_input_grad += __shfl_down_sync(WARP_FULL_MASK, forget_gate_input_grad, stride);
+#else  // CUDART_VERSION < 9000
+      forget_gate_input_grad += __shfl_down(forget_gate_input_grad, stride);
+#endif  // CUDART_VERSION
+  } else if (threadIdx.x < 96) {  // [64, 96)
+     input_actv_input_grad += svmem__input_actv_input_grad[threadIdx.x + 32];
+#pragma unroll
+    for (unsigned stride = 16; stride != 0; stride /= 2)
+#if CUDART_VERSION >= 9000
+       input_actv_input_grad +=  __shfl_down_sync(WARP_FULL_MASK, input_actv_input_grad, stride);
+#else  // CUDART_VERSION < 9000
+       input_actv_input_grad +=  __shfl_down(input_actv_input_grad, stride);
+#endif  // CUDART_VERSION
+  } else {  // [96, 128)
+    output_gate_input_grad += svmem_output_gate_input_grad[threadIdx.x - 32];
+#pragma unroll
+    for (unsigned stride = 16; stride != 0; stride /= 2)
+#if CUDART_VERSION >= 9000
+      output_gate_input_grad += __shfl_down_sync(WARP_FULL_MASK, output_gate_input_grad, stride);
+#else  // CUDART_VERSION < 9000
+      output_gate_input_grad += __shfl_down(output_gate_input_grad, stride);
+#endif  // CUDART_VERSION
+  }
+  //======================================================================
+  // Reduction is complete. Update global memory using atomic functions.
+  if (threadIdx.x ==  0)
+    atomicAdd(&bias_grad[0 * gridDim.x + blockIdx.x],  input_gate_input_grad);
+  if (threadIdx.x == 32)
+    atomicAdd(&bias_grad[1 * gridDim.x + blockIdx.x], forget_gate_input_grad);
+  if (threadIdx.x == 64)
+    atomicAdd(&bias_grad[2 * gridDim.x + blockIdx.x],  input_actv_input_grad);
+  if (threadIdx.x == 96)
+    atomicAdd(&bias_grad[3 * gridDim.x + blockIdx.x], output_gate_input_grad);
+}
+
+// Fused Implementation of LSTM Cell (Backward Pass)
+// This kernel should be launched with the following parameters:
+// <<< dim3(num_hidden_units,
+//          (batch_size - 1)/(128/64/32) + 1, 1),
+//          (128/64/32), ... >>>
+// where the choice of (128/64/32) is made at run time based on the batch size.
+// @param1 (output) i2h_workspace: Input  -> Hidden Temporary Workspace
+// @param2 (output) h2h_workspace: Hidden -> Hidden Temporary Workspace
+// [4 x num_hidden_units x batch_size]
+// @param3 (output) bias_grad: Bias Gradient
+// [4 x num_hidden_units]
+// We can only keep one copy of bias gradient because i2h_bias and h2h_bias
+// always share the same gradient.
+// @param4  (inout) cell_grad: Cell Gradient
+// Gradient of current cell state will be computed and propagated back
+// to the previous cell in the time sequence.
+// @param5  (input) prev_cell_state: Previous Cell State
+// [num_hidden_units x batch_size]
+// @param6  (input) linear_gates: Linear Gates
+// The idea is to replay computations in Forward Pass with minimum cost.
+// [4 x num_hidden_units x batch_size]
+// @param7  (input) curr_cell_state: Current  Cell State
+// [num_hidden_units x batch_size]
+// @param8  (input) i2h_grad_workspace: Input  ->
+//   Hidden Gradient Temporary Workspace
+// @param9  (input) h2h_grad_workspace: Hidden ->
+//   Hidden Gradient Temporary Workspace
+// [num_hidden_units x batch_size]
+// @param10 (param) batch_size: Batch Size
+// @param11 (param) algo: Backward Pass Reduction Algorithm
+static __global__ void __cuda_fused_lstm_backward(
+        float * const __restrict__ i2h_workspace,
+        float * const __restrict__ h2h_workspace,
+        float * const __restrict__ bias_grad,
+        float * const __restrict__ cell_grad,
+  const float * const __restrict__ prev_cell_state,
+  const float * const __restrict__ linear_gates,
+  const float * const __restrict__ curr_cell_state,
+  const float * const __restrict__ i2h_grad_workspace,
+  const float * const __restrict__ h2h_grad_workspace,
+  const unsigned batch_size, const enumBackwardReduceAlgo algo) {
+  extern __shared__ volatile float svmem_fused_lstm_backward[];
+
+  const unsigned batch_idx = threadIdx.x + blockIdx.y * blockDim.x;
+
+  float  input_gate_input_grad = 0;
+  float forget_gate_input_grad = 0;
+  float  input_actv_input_grad = 0;
+  float output_gate_input_grad = 0;
+
+  // If this thread maps to a real batch entry (i.e., it is not a padding thread), ...
+  if (batch_idx < batch_size) {
+    const unsigned g_threadIdx = batch_idx + blockIdx.x * batch_size,
+      num_hidden_units_x_batch_size = gridDim.x * batch_size;
+
+    //==============================================================
+    // Forward Pass Replay
+    //==============================================================
+    // replay the computation we just did for the forward pass
+    float  input_gate = __cu_sigmoidf(linear_gates[
+      0 * num_hidden_units_x_batch_size + g_threadIdx]);
+    float forget_gate = __cu_sigmoidf(linear_gates[
+      1 * num_hidden_units_x_batch_size + g_threadIdx]);
+    float  input_actv =          tanh(linear_gates[
+      2 * num_hidden_units_x_batch_size + g_threadIdx]);
+    float output_gate = __cu_sigmoidf(linear_gates[
+      3 * num_hidden_units_x_batch_size + g_threadIdx]);
+
+    float curr_cell_state_actv = tanh(curr_cell_state[g_threadIdx]);
+
+    //==============================================================
+    // Backward Computation
+    //==============================================================
+    // The gradient computation mainly relies on
+    // the following two identities:
+    // 1. tanh'(x) = 1 - tanh(x) * tanh(x)
+    // 2. sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))
+    // where the symbol ' denotes the derivative.
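+    // For example, for the input gate i = sigmoid(z_i) with pre-activation z_i,
+    // the chain rule gives
+    //   dL/dz_i = dL/di * sigmoid'(z_i) = input_gate_grad * input_gate * (1 - input_gate),
+    // which is exactly how input_gate_input_grad is formed below.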
+    float curr_hidden_state_grad = 0;
+    curr_hidden_state_grad += i2h_grad_workspace == NULL ?
+      0 : i2h_grad_workspace[g_threadIdx];
+    curr_hidden_state_grad += h2h_grad_workspace == NULL ?
+      0 : h2h_grad_workspace[g_threadIdx];
+
+    float curr_cell_state_grad =
+      curr_hidden_state_grad * output_gate *
+        (1 - curr_cell_state_actv * curr_cell_state_actv) +
+      cell_grad[g_threadIdx];
+    float     output_gate_grad = curr_hidden_state_grad * curr_cell_state_actv;
+
+    float forget_gate_grad = curr_cell_state_grad *
+                             prev_cell_state[g_threadIdx];
+    float  input_gate_grad = curr_cell_state_grad * input_actv;
+    float  input_actv_grad = curr_cell_state_grad * input_gate;
+
+    cell_grad[g_threadIdx] = curr_cell_state_grad * forget_gate;
+
+     input_gate_input_grad =  input_gate_grad *  input_gate * (1 -  input_gate);
+    forget_gate_input_grad = forget_gate_grad * forget_gate * (1 - forget_gate);
+     input_actv_input_grad =  input_actv_grad * (1 - input_actv * input_actv);
+    output_gate_input_grad = output_gate_grad * output_gate * (1 - output_gate);
+
+    i2h_workspace[0 * num_hidden_units_x_batch_size + g_threadIdx] =
+    h2h_workspace[0 * num_hidden_units_x_batch_size + g_threadIdx] =
+       input_gate_input_grad;
+    i2h_workspace[1 * num_hidden_units_x_batch_size + g_threadIdx] =
+    h2h_workspace[1 * num_hidden_units_x_batch_size + g_threadIdx] =
+      forget_gate_input_grad;
+    i2h_workspace[2 * num_hidden_units_x_batch_size + g_threadIdx] =
+    h2h_workspace[2 * num_hidden_units_x_batch_size + g_threadIdx] =
+       input_actv_input_grad;
+    i2h_workspace[3 * num_hidden_units_x_batch_size + g_threadIdx] =
+    h2h_workspace[3 * num_hidden_units_x_batch_size + g_threadIdx] =
+      output_gate_input_grad;
+  }
+  //======================================================================
+  // Bias Gradients Reduction
+  //======================================================================
+  // According to the GTC presentation on atomic memory operations:
+  // http://on-demand.gputechconf.com/gtc/2013/presentations/S3101-Atomic-Memory-Operations.pdf
+  // the best performance is achieved by combining a CTA (block)-wide
+  // reduction with atomics.
+  switch (algo) {
+    case enumBackwardReduceAlgo:: _32_HIERARCHICAL:
+      __cu_bias_grad_reduce < 32 >  (
+        svmem_fused_lstm_backward, bias_grad,
+        input_gate_input_grad, forget_gate_input_grad,
+        input_actv_input_grad, output_gate_input_grad); break;
+    case enumBackwardReduceAlgo:: _64_HIERARCHICAL:
+      __cu_bias_grad_reduce < 64 >  (
+        svmem_fused_lstm_backward, bias_grad,
+        input_gate_input_grad, forget_gate_input_grad,
+        input_actv_input_grad, output_gate_input_grad); break;
+    case enumBackwardReduceAlgo::_128_HIERARCHICAL:
+      __cu_bias_grad_reduce < 128 > (
+        svmem_fused_lstm_backward, bias_grad,
+        input_gate_input_grad, forget_gate_input_grad,
+        input_actv_input_grad, output_gate_input_grad); break;
+    default:
+      __cu_bias_grad_reduce_generic(
+        bias_grad,
+        input_gate_input_grad, forget_gate_input_grad,
+        input_actv_input_grad, output_gate_input_grad);
+  }
+}
+
+#endif  // MXNET_OPERATOR_CONTRIB_OPEN_LSTM_RNN_INCLUDE_LSTM_CELL_CUH_
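
Editor's note: the launch-configuration comment on __cuda_fused_lstm_backward above says that the block size (128, 64, or 32 threads) and, with it, the reduction variant are chosen dynamically from the batch size. The host-side sketch below shows one plausible way to do that and to launch the kernel. The wrapper name, the selection thresholds, and the dynamic shared-memory size are assumptions made for illustration; they are not taken from the pull request.

// Hypothetical host-side wrapper (not from the PR): picks a block size and
// reduction variant from the batch size and launches the fused backward kernel.
// Assumes the header defining __cuda_fused_lstm_backward and
// enumBackwardReduceAlgo is included into the same translation unit (the
// kernel is declared static) and that this file is compiled with nvcc.
#include <cuda_runtime.h>

void launch_fused_lstm_backward(
        float *i2h_workspace,      float *h2h_workspace,
        float *bias_grad,          float *cell_grad,
  const float *prev_cell_state,    const float *linear_gates,
  const float *curr_cell_state,
  const float *i2h_grad_workspace, const float *h2h_grad_workspace,
  const unsigned num_hidden_units, const unsigned batch_size,
  cudaStream_t stream) {
  // One plausible policy: use the widest block that the batch can fill.
  unsigned block_size;
  enumBackwardReduceAlgo algo;
  if (batch_size >= 128) {
    block_size = 128; algo = enumBackwardReduceAlgo::_128_HIERARCHICAL;
  } else if (batch_size >= 64) {
    block_size = 64;  algo = enumBackwardReduceAlgo::_64_HIERARCHICAL;
  } else {
    block_size = 32;  algo = enumBackwardReduceAlgo::_32_HIERARCHICAL;
  }
  // One block column per hidden unit, enough block rows to cover the batch.
  dim3 grid(num_hidden_units, (batch_size - 1) / block_size + 1, 1);
  // Placeholder: the dynamic shared-memory size must match the layout that
  // the hierarchical reduction expects, which is not visible in this excerpt.
  const size_t smem_bytes = 4 * block_size * sizeof(float);
  __cuda_fused_lstm_backward<<<grid, block_size, smem_bytes, stream>>>(
    i2h_workspace, h2h_workspace, bias_grad, cell_grad,
    prev_cell_state, linear_gates, curr_cell_state,
    i2h_grad_workspace, h2h_grad_workspace, batch_size, algo);
}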


 
