Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2017/11/21 05:49:26 UTC

[GitHub] madjam closed pull request #8654: Adds security best practice doc.

madjam closed pull request #8654: Adds security best practice doc.
URL: https://github.com/apache/incubator-mxnet/pull/8654

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

diff --git a/amalgamation/dmlc-minimum0.cc b/amalgamation/dmlc-minimum0.cc
index a24ca21859..be1793a51d 100644
--- a/amalgamation/dmlc-minimum0.cc
+++ b/amalgamation/dmlc-minimum0.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright 2015 by Contributors.
  * \brief Mininum DMLC library Amalgamation, used for easy plugin of dmlc lib.
  *  Normally this is not needed.
  */
diff --git a/cmake/Modules/FindJeMalloc.cmake b/cmake/Modules/FindJeMalloc.cmake
index 859bd481ed..f3ca06faa3 100644
--- a/cmake/Modules/FindJeMalloc.cmake
+++ b/cmake/Modules/FindJeMalloc.cmake
@@ -14,8 +14,8 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-#
-#----
+
+
 # Copyright (c)      2014 Thomas Heller
 # Copyright (c) 2007-2012 Hartmut Kaiser
 # Copyright (c) 2010-2011 Matt Anderson
@@ -24,21 +24,21 @@
 #----
 # Distributed under the Boost Software License, Version 1.0.
 # Boost Software License - Version 1.0 - August 17th, 2003
-# 
+#
 # Permission is hereby granted, free of charge, to any person or organization
 # obtaining a copy of the software and accompanying documentation covered by
 # this license (the "Software") to use, reproduce, display, distribute,
 # execute, and transmit the Software, and to prepare derivative works of the
 # Software, and to permit third-parties to whom the Software is furnished to
 # do so, all subject to the following:
-# 
+#
 # The copyright notices in the Software and this entire statement, including
 # the above license grant, this restriction and the following disclaimer,
 # must be included in all copies of the Software, in whole or in part, and
 # all derivative works of the Software, unless such copies or derivative
 # works are solely in the form of machine-executable object code generated by
 # a source language processor.
-# 
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
diff --git a/cpp-package/include/mxnet-cpp/MxNetCpp.h b/cpp-package/include/mxnet-cpp/MxNetCpp.h
index 882bbead51..7ac039dd88 100644
--- a/cpp-package/include/mxnet-cpp/MxNetCpp.h
+++ b/cpp-package/include/mxnet-cpp/MxNetCpp.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file MxNetCpp.h
  * \brief meta include file for mxnet.cpp
  * \author Chuntao Hong, Zhang Chen
diff --git a/cpp-package/include/mxnet-cpp/base.h b/cpp-package/include/mxnet-cpp/base.h
index 19375c0f81..d0f1bea15f 100644
--- a/cpp-package/include/mxnet-cpp/base.h
+++ b/cpp-package/include/mxnet-cpp/base.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+*  Copyright (c) 2016 by Contributors
 * \file base.h
 * \brief base definitions for mxnetcpp
 * \author Chuntao Hong, Zhang Chen
diff --git a/cpp-package/include/mxnet-cpp/executor.h b/cpp-package/include/mxnet-cpp/executor.h
index 7e45ef56ab..4cb28819de 100644
--- a/cpp-package/include/mxnet-cpp/executor.h
+++ b/cpp-package/include/mxnet-cpp/executor.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+*  Copyright (c) 2016 by Contributors
 * \file executor.h
 * \brief executor definition
 * \author Chuntao Hong, Zhang Chen
diff --git a/cpp-package/include/mxnet-cpp/initializer.h b/cpp-package/include/mxnet-cpp/initializer.h
index e5bfa4da8e..61e95469b7 100644
--- a/cpp-package/include/mxnet-cpp/initializer.h
+++ b/cpp-package/include/mxnet-cpp/initializer.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file initializer.h
  * \brief random initializer
  * \author Zhang Chen
diff --git a/cpp-package/include/mxnet-cpp/io.h b/cpp-package/include/mxnet-cpp/io.h
index 7281416ae3..7099d7d46f 100644
--- a/cpp-package/include/mxnet-cpp/io.h
+++ b/cpp-package/include/mxnet-cpp/io.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+*  Copyright (c) 2016 by Contributors
 * \file operator.h
 * \brief definition of io, such as DataIter
 * \author Zhang Chen
diff --git a/cpp-package/include/mxnet-cpp/kvstore.h b/cpp-package/include/mxnet-cpp/kvstore.h
index 9c3c81f37f..d5aa1509a8 100644
--- a/cpp-package/include/mxnet-cpp/kvstore.h
+++ b/cpp-package/include/mxnet-cpp/kvstore.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+*  Copyright (c) 2016 by Contributors
 * \file kvstore.h
 * \brief definition of kvstore
 * \author Chuntao Hong
diff --git a/cpp-package/include/mxnet-cpp/lr_scheduler.h b/cpp-package/include/mxnet-cpp/lr_scheduler.h
index b9381a830a..cffd1c7576 100644
--- a/cpp-package/include/mxnet-cpp/lr_scheduler.h
+++ b/cpp-package/include/mxnet-cpp/lr_scheduler.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+*  Copyright (c) 2017 by Contributors
 * \file lr_scheduler.h
 * \brief Scheduling learning rate
 */
diff --git a/cpp-package/include/mxnet-cpp/metric.h b/cpp-package/include/mxnet-cpp/metric.h
index 6dbb197dae..d015d8b4ac 100644
--- a/cpp-package/include/mxnet-cpp/metric.h
+++ b/cpp-package/include/mxnet-cpp/metric.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+*  Copyright (c) 2016 by Contributors
 * \file base.h
 * \brief metrics defined
 * \author Zhang Chen
diff --git a/cpp-package/include/mxnet-cpp/model.h b/cpp-package/include/mxnet-cpp/model.h
index c8af6a476a..b3a0a9dbef 100644
--- a/cpp-package/include/mxnet-cpp/model.h
+++ b/cpp-package/include/mxnet-cpp/model.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+*  Copyright (c) 2016 by Contributors
 * \file model.h
 * \brief MXNET.cpp model module
 * \author Zhang Chen
diff --git a/cpp-package/include/mxnet-cpp/monitor.h b/cpp-package/include/mxnet-cpp/monitor.h
index 33ef4855c1..c1494d0bd0 100644
--- a/cpp-package/include/mxnet-cpp/monitor.h
+++ b/cpp-package/include/mxnet-cpp/monitor.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+*  Copyright (c) 2017 by Contributors
 * \file monitor.h
 * \brief monitor definition
 * \author Xin Li
diff --git a/cpp-package/include/mxnet-cpp/ndarray.h b/cpp-package/include/mxnet-cpp/ndarray.h
index 9e196d0730..082c06981c 100644
--- a/cpp-package/include/mxnet-cpp/ndarray.h
+++ b/cpp-package/include/mxnet-cpp/ndarray.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+*  Copyright (c) 2016 by Contributors
 * \file ndarray.h
 * \brief definition of ndarray
 * \author Chuntao Hong, Zhang Chen
diff --git a/cpp-package/include/mxnet-cpp/op_map.h b/cpp-package/include/mxnet-cpp/op_map.h
index b54cc0ae2c..17746d1fa5 100644
--- a/cpp-package/include/mxnet-cpp/op_map.h
+++ b/cpp-package/include/mxnet-cpp/op_map.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+*  Copyright (c) 2016 by Contributors
 * \file op_map.h
 * \brief definition of OpMap
 * \author Chuntao Hong
diff --git a/cpp-package/include/mxnet-cpp/op_suppl.h b/cpp-package/include/mxnet-cpp/op_suppl.h
index 52cdae772a..4f3011c17c 100644
--- a/cpp-package/include/mxnet-cpp/op_suppl.h
+++ b/cpp-package/include/mxnet-cpp/op_suppl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+*  Copyright (c) 2016 by Contributors
 * \file op_suppl.h
 * \brief A supplement and amendment of the operators from op.h
 * \author Zhang Chen, zhubuntu, Xin Li
diff --git a/cpp-package/include/mxnet-cpp/op_util.h b/cpp-package/include/mxnet-cpp/op_util.h
index 20e06a8518..b2b442fd8a 100644
--- a/cpp-package/include/mxnet-cpp/op_util.h
+++ b/cpp-package/include/mxnet-cpp/op_util.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+*  Copyright (c) 2017 by Contributors
 * \file op_util.h
 * \brief operator helper functions
 * \author Chris Olivier
diff --git a/cpp-package/include/mxnet-cpp/operator.h b/cpp-package/include/mxnet-cpp/operator.h
index 02bd21ebe8..4d4bedac8f 100644
--- a/cpp-package/include/mxnet-cpp/operator.h
+++ b/cpp-package/include/mxnet-cpp/operator.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+*  Copyright (c) 2016 by Contributors
 * \file operator.h
 * \brief definition of operator
 * \author Chuntao Hong, Zhang Chen
diff --git a/cpp-package/include/mxnet-cpp/optimizer.h b/cpp-package/include/mxnet-cpp/optimizer.h
index e57da5d95c..4aebb55c50 100644
--- a/cpp-package/include/mxnet-cpp/optimizer.h
+++ b/cpp-package/include/mxnet-cpp/optimizer.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+*  Copyright (c) 2016 by Contributors
 * \file optimizer.h
 * \brief definition of optimizer
 * \author Chuntao Hong, Zhang Chen
diff --git a/cpp-package/include/mxnet-cpp/shape.h b/cpp-package/include/mxnet-cpp/shape.h
index 2793e436c0..01ee476363 100644
--- a/cpp-package/include/mxnet-cpp/shape.h
+++ b/cpp-package/include/mxnet-cpp/shape.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+*  Copyright (c) 2016 by Contributors
 * \file shape.h
 * \brief definition of shape
 * \author Chuntao Hong, Zhang Chen
diff --git a/cpp-package/include/mxnet-cpp/symbol.h b/cpp-package/include/mxnet-cpp/symbol.h
index 888aebd6f3..127ef156eb 100644
--- a/cpp-package/include/mxnet-cpp/symbol.h
+++ b/cpp-package/include/mxnet-cpp/symbol.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+*  Copyright (c) 2016 by Contributors
 * \file symbol.h
 * \brief definition of symbol
 * \author Chuntao Hong, Zhang Chen
diff --git a/cub b/cub
index 05eb57faa0..89de7ab201 160000
--- a/cub
+++ b/cub
@@ -1 +1 @@
-Subproject commit 05eb57faa0a4cac37c2a86fdf4b4dc865a95a1a3
+Subproject commit 89de7ab20167909bc2c4f8acd397671c47cf3c0d
diff --git a/dmlc-core b/dmlc-core
index 595d02c0e8..fcf831a323 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit 595d02c0e87be8a0846700462b6f45f1b1031e39
+Subproject commit fcf831a3239249588b014c3e6cd2bdb7366547e1
diff --git a/docker/install/scala.sh b/docker/install/scala.sh
index c1d2de6c75..b1bfe28074 100755
--- a/docker/install/scala.sh
+++ b/docker/install/scala.sh
@@ -27,7 +27,7 @@ echo "oracle-java8-installer shared/accepted-oracle-license-v1-1 select true" |
 apt-get install -y oracle-java8-installer
 apt-get install -y oracle-java8-set-default
 
-apt-get install -y maven 
+apt-get install -y maven
 
 wget http://downloads.lightbend.com/scala/2.11.8/scala-2.11.8.deb
 dpkg -i scala-2.11.8.deb
diff --git a/docs/api/python/autograd/autograd.md b/docs/api/python/autograd/autograd.md
index de8188446b..410d6a94e2 100644
--- a/docs/api/python/autograd/autograd.md
+++ b/docs/api/python/autograd/autograd.md
@@ -1,14 +1,9 @@
 # Autograd Package
 
-
 ```eval_rst
 .. currentmodule:: mxnet.autograd
 ```
 
-```eval_rst
-.. warning:: This package is currently experimental and may change in the near future.
-```
-
 ## Overview
 
 The `autograd` package enables automatic
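
For context on the page being edited above, a minimal sketch of the `autograd` workflow it documents (using the standard `attach_grad`/`record`/`backward` calls):

```python
import mxnet as mx
from mxnet import autograd

x = mx.nd.array([1.0, 2.0, 3.0])
x.attach_grad()              # allocate storage for x.grad
with autograd.record():      # record the forward pass
    y = (x * x).sum()
y.backward()                 # compute dy/dx = 2x
print(x.grad.asnumpy())      # [2. 4. 6.]
```
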
diff --git a/docs/api/python/gluon/data.md b/docs/api/python/gluon/data.md
index f72f3cd03f..0b5f959e32 100644
--- a/docs/api/python/gluon/data.md
+++ b/docs/api/python/gluon/data.md
@@ -15,10 +15,6 @@ This document lists the data APIs in Gluon:
 The `Gluon Data` API, defined in the `gluon.data` package, provides useful dataset loading
 and processing tools, as well as common public datasets.
 
-```eval_rst
-.. warning:: This package contains experimental APIs and may change in the near future.
-```
-
 In the rest of this document, we list routines provided by the `gluon.data` package.
 
 ## Data
diff --git a/docs/api/python/gluon/gluon.md b/docs/api/python/gluon/gluon.md
index 0ef6dbed0e..2ae766fdcb 100644
--- a/docs/api/python/gluon/gluon.md
+++ b/docs/api/python/gluon/gluon.md
@@ -5,10 +5,6 @@
 .. currentmodule:: mxnet.gluon
 ```
 
-```eval_rst
-.. warning:: This package is currently experimental and may change in the near future.
-```
-
 <script type="text/javascript" src='../../_static/js/auto_module_index.js'></script>
 
 ## Overview
diff --git a/docs/api/python/gluon/model_zoo.md b/docs/api/python/gluon/model_zoo.md
index 18d9ae2cb8..8310461242 100644
--- a/docs/api/python/gluon/model_zoo.md
+++ b/docs/api/python/gluon/model_zoo.md
@@ -13,15 +13,12 @@ This document lists the model APIs in Gluon:
     :nosignatures:
 
     mxnet.gluon.model_zoo
+    mxnet.gluon.model_zoo.vision
 ```
 
 The `Gluon Model Zoo` API, defined in the `gluon.model_zoo` package, provides pre-defined
 and pre-trained models to help bootstrap machine learning applications.
 
-```eval_rst
-.. warning:: This package contains experimental APIs and may change in the near future.
-```
-
 In the rest of this document, we list routines provided by the `gluon.model_zoo` package.
 
 ### Vision
@@ -186,6 +183,8 @@ In the rest of this document, we list routines provided by the `gluon.model_zoo`
 
 ```eval_rst
 
+.. automodule:: mxnet.gluon.model_zoo
+
 .. automodule:: mxnet.gluon.model_zoo.vision
     :members:
     :imported-members:
diff --git a/docs/api/python/ndarray/ndarray.md b/docs/api/python/ndarray/ndarray.md
index 09564c2f20..59ca4a612e 100644
--- a/docs/api/python/ndarray/ndarray.md
+++ b/docs/api/python/ndarray/ndarray.md
@@ -559,13 +559,13 @@ The `ndarray` package provides several classes:
 .. autosummary::
     :nosignatures:
 
-    sample_uniform
-    sample_normal
-    sample_gamma
-    sample_exponential
-    sample_poisson
-    sample_negative_binomial
-    sample_generalized_negative_binomial
+    mxnet.ndarray.random.uniform
+    mxnet.ndarray.random.normal
+    mxnet.ndarray.random.gamma
+    mxnet.ndarray.random.exponential
+    mxnet.ndarray.random.poisson
+    mxnet.ndarray.random.negative_binomial
+    mxnet.ndarray.random.generalized_negative_binomial
     mxnet.random.seed
 ```
 
@@ -580,7 +580,6 @@ The `ndarray` package provides several classes:
     argsort
     argmax
     argmin
-    argmax_channel
 ```
 
 ### Sequence operation
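
A short sketch of the namespaced sampling routines that replace the `sample_*` entries in the listing above (these names are taken directly from the new autosummary):

```python
import mxnet as mx

# draw samples with the mxnet.ndarray.random routines
u = mx.nd.random.uniform(low=0, high=1, shape=(2, 2))
n = mx.nd.random.normal(loc=0, scale=1, shape=(2, 2))

mx.random.seed(42)           # fix the global seed for reproducibility
```
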
diff --git a/docs/api/python/ndarray/sparse.md b/docs/api/python/ndarray/sparse.md
index 9b742f4fc5..dd0286d092 100644
--- a/docs/api/python/ndarray/sparse.md
+++ b/docs/api/python/ndarray/sparse.md
@@ -123,13 +123,22 @@ We summarize the interface for each class in the following sections.
     CSRNDArray.copy
     CSRNDArray.copyto
     CSRNDArray.as_in_context
-    CSRNDArray.asnumpy
     CSRNDArray.asscipy
+    CSRNDArray.asnumpy
     CSRNDArray.asscalar
     CSRNDArray.astype
     CSRNDArray.tostype
 ```
 
+### Array inspection
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    CSRNDArray.check_format
+```
+
 ### Array creation
 
 ```eval_rst
@@ -139,6 +148,25 @@ We summarize the interface for each class in the following sections.
     CSRNDArray.zeros_like
 ```
 
+### Array reduction
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    CSRNDArray.sum
+    CSRNDArray.mean
+```
+
+### Powers
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    CSRNDArray.square
+```
+
 ### Indexing
 
 ```eval_rst
@@ -190,6 +218,15 @@ We summarize the interface for each class in the following sections.
     RowSparseNDArray.tostype
 ```
 
+### Array inspection
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    RowSparseNDArray.check_format
+```
+
 ### Array creation
 
 ```eval_rst
@@ -213,6 +250,52 @@ We summarize the interface for each class in the following sections.
     RowSparseNDArray.trunc
 ```
 
+### Trigonometric functions
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    RowSparseNDArray.sin
+    RowSparseNDArray.tan
+    RowSparseNDArray.arcsin
+    RowSparseNDArray.arctan
+    RowSparseNDArray.degrees
+    RowSparseNDArray.radians
+```
+
+### Hyperbolic functions
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    RowSparseNDArray.sinh
+    RowSparseNDArray.tanh
+    RowSparseNDArray.arcsinh
+    RowSparseNDArray.arctanh
+```
+
+### Exponents and logarithms
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    RowSparseNDArray.expm1
+    RowSparseNDArray.log1p
+```
+
+### Powers
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    RowSparseNDArray.sqrt
+    RowSparseNDArray.square
+```
+
 ### Indexing
 
 ```eval_rst
@@ -221,6 +304,7 @@ We summarize the interface for each class in the following sections.
 
     RowSparseNDArray.__getitem__
     RowSparseNDArray.__setitem__
+    RowSparseNDArray.retain
 ```
 
 ### Lazy evaluation
@@ -232,6 +316,16 @@ We summarize the interface for each class in the following sections.
     RowSparseNDArray.wait_to_read
 ```
 
+### Miscellaneous
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    RowSparseNDArray.clip
+    RowSparseNDArray.sign
+```
+
 ## Array creation routines
 
 ```eval_rst
@@ -311,6 +405,16 @@ We summarize the interface for each class in the following sections.
     arctanh
 ```
 
+### Reduce functions
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    sum
+    mean
+```
+
 ### Rounding
 
 ```eval_rst
@@ -355,6 +459,20 @@ We summarize the interface for each class in the following sections.
     sign
 ```
 
+## Neural network
+
+### Updater
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    sgd_update
+    sgd_mom_update
+    adam_update
+    ftrl_update
+```
+
 ### More
 
 ```eval_rst
@@ -363,6 +481,7 @@ We summarize the interface for each class in the following sections.
 
     make_loss
     stop_gradient
+    mxnet.ndarray.contrib.SparseEmbedding
 ```
 
 ## API Reference
@@ -372,10 +491,10 @@ We summarize the interface for each class in the following sections.
 ```eval_rst
 
 .. autoclass:: mxnet.ndarray.sparse.CSRNDArray
-    :members: shape, context, dtype, stype, data, indices, indptr, copy, copyto, as_in_context, asnumpy, asscalar, astype, tostype, slice, wait_to_read, zeros_like, __getitem__, __setitem__
+    :members: shape, context, dtype, stype, data, indices, indptr, copy, copyto, as_in_context, asscipy, asnumpy, asscalar, astype, tostype, slice, wait_to_read, zeros_like, __neg__, sum, mean, square, __getitem__, __setitem__, check_format
 
 .. autoclass:: mxnet.ndarray.sparse.RowSparseNDArray
-    :members: shape, context, dtype, stype, data, indices, copy, copyto, as_in_context, asnumpy, asscalar, astype, tostype, wait_to_read, zeros_like, round, rint, fix, floor, ceil, trunc, __getitem__, __setitem__
+    :members: shape, context, dtype, stype, data, indices, copy, copyto, as_in_context, asnumpy, asscalar, astype, tostype, wait_to_read, zeros_like, round, rint, fix, floor, ceil, trunc, sin, tan, arcsin, arctan, degrees, radians, sinh, tanh, arcsinh, arctanh, expm1, log1p, sqrt, square, __negative__, __getitem__, __setitem__, check_format, retain, clip, sign
 
 .. automodule:: mxnet.ndarray.sparse
     :members:
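
To illustrate the newly documented array-inspection and reduction members, a minimal sketch (assuming the scipy-style `csr_matrix((data, indices, indptr), shape=...)` constructor):

```python
import mxnet as mx

# 3x4 CSR matrix with one stored value per row
data, indices, indptr = [1.0, 2.0, 3.0], [1, 0, 2], [0, 1, 2, 3]
csr = mx.nd.sparse.csr_matrix((data, indices, indptr), shape=(3, 4))

csr.check_format(full_check=True)   # O(N) validation of the CSR structure
print(csr.sum().asscalar())         # reduce over all stored elements
```
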
diff --git a/docs/api/python/symbol/sparse.md b/docs/api/python/symbol/sparse.md
index 5ebbfcd057..b40276b9f1 100644
--- a/docs/api/python/symbol/sparse.md
+++ b/docs/api/python/symbol/sparse.md
@@ -95,10 +95,107 @@ In the rest of this document, we list sparse related routines provided by the
     :nosignatures:
 
     elemwise_add
+    elemwise_sub
+    elemwise_mul
+    negative
     dot
     add_n
 ```
 
+### Trigonometric functions
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    sin
+    tan
+    arcsin
+    arctan
+    degrees
+    radians
+```
+
+### Hyperbolic functions
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    sinh
+    tanh
+    arcsinh
+    arctanh
+```
+
+### Reduce functions
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    sum
+    mean
+```
+
+### Rounding
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    round
+    rint
+    fix
+    floor
+    ceil
+    trunc
+```
+
+### Exponents and logarithms
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    expm1
+    log1p
+```
+
+### Powers
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    sqrt
+    square
+```
+
+### Miscellaneous
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    clip
+    abs
+    sign
+```
+
+## Neural network
+
+### More
+
+```eval_rst
+.. autosummary::
+    :nosignatures:
+
+    make_loss
+    stop_gradient
+    mxnet.symbol.contrib.SparseEmbedding
+```
+
 ## API Reference
 
 <script type="text/javascript" src='../../../_static/js/auto_module_index.js'></script>
diff --git a/docs/api/python/symbol/symbol.md b/docs/api/python/symbol/symbol.md
index e93976d603..e383597236 100644
--- a/docs/api/python/symbol/symbol.md
+++ b/docs/api/python/symbol/symbol.md
@@ -558,13 +558,13 @@ Composite multiple symbols into a new one by an operator.
 .. autosummary::
     :nosignatures:
 
-    sample_uniform
-    sample_normal
-    sample_gamma
-    sample_exponential
-    sample_poisson
-    sample_negative_binomial
-    sample_generalized_negative_binomial
+    mxnet.symbol.random.uniform
+    mxnet.symbol.random.normal
+    mxnet.symbol.random.gamma
+    mxnet.symbol.random.exponential
+    mxnet.symbol.random.poisson
+    mxnet.symbol.random.negative_binomial
+    mxnet.symbol.random.generalized_negative_binomial
     mxnet.random.seed
 ```
 
@@ -579,7 +579,6 @@ Composite multiple symbols into a new one by an operator.
     argsort
     argmax
     argmin
-    argmax_channel
 ```
 
 ### Sequence operation
diff --git a/docs/conf.py b/docs/conf.py
index ad51323f01..d018408d45 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -59,6 +59,7 @@
     'sphinx.ext.autosummary',
     'sphinx.ext.napoleon',
     'sphinx.ext.mathjax',
+    'sphinx.ext.viewcode',
     'breathe',
     'mxdoc'
 ]
diff --git a/docs/faq/index.md b/docs/faq/index.md
index 1bfaea4a7f..e29bda0b68 100644
--- a/docs/faq/index.md
+++ b/docs/faq/index.md
@@ -40,6 +40,9 @@ and full working examples, visit the [tutorials section](../tutorials/index.md).
 
 * [How to convert MXNet models to Apple CoreML format?](https://github.com/apache/incubator-mxnet/tree/master/tools/coreml)
 
+## Security
+* [How to run MXNet securely?](http://mxnet.io/how_to/security.md)
+
 ## Extend and Contribute to MXNet
 
 * [How do I join the MXNet development discussion?](http://mxnet.io/community/mxnet_channels.html)
diff --git a/docs/how_to/security.md b/docs/how_to/security.md
new file mode 100644
index 0000000000..3434cf5c97
--- /dev/null
+++ b/docs/how_to/security.md
@@ -0,0 +1,22 @@
+# MXNet Security best practices
+
+The MXNet framework has no built-in security protections. It assumes that the MXNet entities involved in model training and inferencing (hosting) are fully trusted. It also assumes that their communications cannot be eavesdropped on or tampered with. MXNet consumers must therefore ensure that the above assumptions are met.
+
+In particular, the following threat vectors exist when training with MXNet:
+
+* When running distributed training using MXNet, there is no built-in support for authenticating the cluster nodes participating in the training job.
+* Data exchange between cluster nodes happens in plain text.
+* Using `kvstore.set_optimizer`, one can supply a custom optimizer to combine gradients. This optimizer code is sent to the server nodes as a pickle file. A server performs no further validation of the pickle file and simply executes the code, trusting the sender (worker).
+* Since there is no authentication between nodes, a malicious actor running on the same network can launch a Denial of Service (DoS) attack by sending data that can overwhelm or crash the scheduler or other server nodes.
+
+It is highly recommended that the following best practices be followed when using MXNet:
+
+* Run MXNet with least privilege, i.e. not as root.
+* Run MXNet training jobs inside a secure and isolated environment. If you are using a cloud provider like Amazon AWS, running your training job inside a [private VPC](https://aws.amazon.com/vpc/) is a good way to accomplish this. Additionally, configure your network security settings so as to only allow connections that the cluster nodes require.
+* Make sure no unauthorized actors have physical or remote access to the nodes participating in MXNet training.
+* During training, one can configure MXNet to periodically save model checkpoints. To protect these model checkpoints from unauthorized access, make sure the checkpoints are written out to an encrypted storage volume, and have a provision to delete checkpoints that are no longer needed.
+* When sharing trained models, or when receiving trained models from other parties, ensure that model artifacts are authenticated and integrity protected using cryptographic signatures, thus ensuring that the data received comes from trusted sources and has not been maliciously (or accidentally) modified in transit.
+* By default, mx.random uses a static and fixed seed value. The random utilities in MXNet should therefore never be used to implement any type of security critical functionality where cryptographically secure pseudorandom number generation is required.
+
+## Deployment Considerations
+* When deploying high-value, proprietary models for inference, care should be taken to prevent an adversary from stealing the model. The research paper [Stealing Machine Learning Models via Prediction APIs](https://arxiv.org/pdf/1609.02943.pdf) outlines experiments performed to show how an attacker can use a prediction API to leak the ML model or construct a nearly identical replica. A simple way to thwart such an attack is to not expose the prediction probabilities to a high degree of precision in the API response.
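
To make the `kvstore.set_optimizer` threat vector above concrete, here is a minimal sketch (the `'local'` kvstore is used so it runs on one machine; with a `dist_*` kvstore the pickled optimizer actually crosses the network to the servers):

```python
import mxnet as mx

# 'local' keeps everything in-process; in distributed mode ('dist_sync',
# 'dist_async') the optimizer below is pickled on the worker and shipped
# to the server nodes, which unpickle and execute it without validation --
# an actor who can reach a server can therefore run arbitrary code
kv = mx.kvstore.create('local')
kv.set_optimizer(mx.optimizer.SGD(learning_rate=0.1))

# mx.random is not a cryptographically secure PRNG; never use it where
# security-critical randomness is required
mx.random.seed(42)
```
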
diff --git a/docs/mxdoc.py b/docs/mxdoc.py
index 26e4c9e265..caf135680d 100644
--- a/docs/mxdoc.py
+++ b/docs/mxdoc.py
@@ -62,8 +62,12 @@ def generate_doxygen(app):
 
 def build_mxnet(app):
     """Build mxnet .so lib"""
-    _run_cmd("cd %s/.. && cp make/config.mk config.mk && make -j$(nproc) DEBUG=1" %
-            app.builder.srcdir)
+    if not os.path.exists(os.path.join(app.builder.srcdir, '..', 'config.mk')):
+        _run_cmd("cd %s/.. && cp make/config.mk config.mk && make -j$(nproc) DEBUG=1" %
+                app.builder.srcdir)
+    else:
+        _run_cmd("cd %s/.. && make -j$(nproc) DEBUG=1" %
+                app.builder.srcdir)
 
 def build_r_docs(app):
     """build r pdf"""
diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md
index aaaa40c381..6429dfb31b 100644
--- a/docs/tutorials/index.md
+++ b/docs/tutorials/index.md
@@ -2,13 +2,37 @@
 
 These tutorials introduce a few fundamental concepts in deep learning and how to implement them in _MXNet_. The _Basics_ section contains tutorials on manipulating arrays, building networks, loading/preprocessing data, etc. The _Training and Inference_ section talks about implementing Linear Regression, training a Handwritten digit classifier using MLP and CNN, running inferences using a pre-trained model, and lastly, efficiently training a large scale image classifier.
 
-```eval_rst
-.. Note:: We are working on a set of tutorials for the new imperative interface called Gluon. A preview version is hosted at http://gluon.mxnet.io.
-```
 
-## Python
+## Gluon
+
+Gluon is the high-level interface for MXNet. It is more intuitive and easier to use than the lower-level interface.
+Gluon supports dynamic (define-by-run) graphs with JIT-compilation to achieve both flexibility and efficiency.
+This is a selected subset of Gluon tutorials. For the comprehensive tutorial on Gluon,
+please see [gluon.mxnet.io](http://gluon.mxnet.io).
+
+### Basics
+
+- [Manipulate data the MXNet way with ndarray](http://gluon.mxnet.io/chapter01_crashcourse/ndarray.html)
+- [Automatic differentiation with autograd](http://gluon.mxnet.io/chapter01_crashcourse/autograd.html)
+- [Linear regression with gluon](http://gluon.mxnet.io/chapter02_supervised-learning/linear-regression-gluon.html)
+- [Serialization - saving, loading and checkpointing](http://gluon.mxnet.io/chapter03_deep-neural-networks/serialization.html)
+
+### Neural Networks
+
+- [Multilayer perceptrons in gluon](http://gluon.mxnet.io/chapter03_deep-neural-networks/mlp-gluon.html)
+- [Convolutional Neural Networks in gluon](http://gluon.mxnet.io/chapter04_convolutional-neural-networks/cnn-gluon.html)
+- [Recurrent Neural Networks with gluon](http://gluon.mxnet.io/chapter05_recurrent-neural-networks/rnns-gluon.html)
+
+### Advanced
+
+- [Plumbing: A look under the hood of gluon](http://gluon.mxnet.io/chapter03_deep-neural-networks/plumbing.html)
+- [Designing a custom layer with gluon](http://gluon.mxnet.io/chapter03_deep-neural-networks/custom-layer.html)
+- [Fast, portable neural networks with Gluon HybridBlocks](http://gluon.mxnet.io/chapter07_distributed-learning/hybridize.html)
+- [Training on multiple GPUs with gluon](http://gluon.mxnet.io/chapter07_distributed-learning/multiple-gpus-gluon.html)
+
+## MXNet
 
-### Basic
+### Basics
 
 ```eval_rst
 .. toctree::
diff --git a/docs/tutorials/sparse/row_sparse.md b/docs/tutorials/sparse/row_sparse.md
index 6a69341da9..55f8a7d1c5 100644
--- a/docs/tutorials/sparse/row_sparse.md
+++ b/docs/tutorials/sparse/row_sparse.md
@@ -6,7 +6,7 @@
 Many real world datasets deal with high dimensional sparse feature vectors. When learning
 the weights of models with sparse datasets, the derived gradients of the weights could be sparse.
 
-Let's say we perform a matrix multiplication of ``X``  and ``W``, where ``X`` is a 2x2 matrix, and ``W`` is a 2x1 matrix. Let ``Y`` be the matrix multiplication of the two matrices:
+Let's say we perform a matrix multiplication of ``X``  and ``W``, where ``X`` is a 1x2 matrix, and ``W`` is a 2x3 matrix. Let ``Y`` be the matrix multiplication of the two matrices:
 
 ```python
 import mxnet as mx
diff --git a/example/gluon/data.py b/example/gluon/data.py
index 30c1a8c59b..67519e6a20 100644
--- a/example/gluon/data.py
+++ b/example/gluon/data.py
@@ -115,7 +115,7 @@ def imagenet_iterator(train_data, val_data, batch_size, data_shape, resize=-1):
 
 
 class DummyIter(mx.io.DataIter):
-    def __init__(self, batch_size, data_shape, batches = 5):
+    def __init__(self, batch_size, data_shape, batches = 100):
         super(DummyIter, self).__init__(batch_size)
         self.data_shape = (batch_size,) + data_shape
         self.label_shape = (batch_size,)
diff --git a/example/image-classification/symbols/vgg.py b/example/image-classification/symbols/vgg.py
index 94e8962c64..8ae48a0a28 100644
--- a/example/image-classification/symbols/vgg.py
+++ b/example/image-classification/symbols/vgg.py
@@ -29,10 +29,10 @@ def get_feature(internel_layer, layers, filters, batch_norm = False, **kwargs):
         for j in range(num):
             internel_layer = mx.sym.Convolution(data = internel_layer, kernel=(3, 3), pad=(1, 1), num_filter=filters[i], name="conv%s_%s" %(i + 1, j + 1))
             if batch_norm:
-                internel_layer = mx.symbol.BatchNorm(data=internel_layer, name="bn%s_%s" %(i + 1, j + 1)) 
+                internel_layer = mx.symbol.BatchNorm(data=internel_layer, name="bn%s_%s" %(i + 1, j + 1))
             internel_layer = mx.sym.Activation(data=internel_layer, act_type="relu", name="relu%s_%s" %(i + 1, j + 1))
         internel_layer = mx.sym.Pooling(data=internel_layer, pool_type="max", kernel=(2, 2), stride=(2,2), name="pool%s" %(i + 1))
-    return internel_layer    
+    return internel_layer
 
 def get_classifier(input_data, num_classes, **kwargs):
     flatten = mx.sym.Flatten(data=input_data, name="flatten")
@@ -43,7 +43,7 @@ def get_classifier(input_data, num_classes, **kwargs):
     relu7 = mx.sym.Activation(data=fc7, act_type="relu", name="relu7")
     drop7 = mx.sym.Dropout(data=relu7, p=0.5, name="drop7")
     fc8 = mx.sym.FullyConnected(data=drop7, num_hidden=num_classes, name="fc8")
-    return fc8  
+    return fc8
 
 def get_symbol(num_classes, num_layers=11, batch_norm=False, dtype='float32', **kwargs):
     """
@@ -54,23 +54,23 @@ def get_symbol(num_classes, num_layers=11, batch_norm=False, dtype='float32', **
     num_layers : int
         Number of layers for the variant of densenet. Options are 11, 13, 16, 19.
     batch_norm : bool, default False
-        Use batch normalization. 
+        Use batch normalization.
     dtype: str, float32 or float16
-        Data precision.   
+        Data precision.
     """
     vgg_spec = {11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]),
                 13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]),
                 16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
                 19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])}
-    if not vgg_spec.has_key(num_layers):        
+    if not vgg_spec.has_key(num_layers):
         raise ValueError("Invalide num_layers {}. Possible choices are 11,13,16,19.".format(num_layers))
-    layers, filters = vgg_spec[num_layers] 
+    layers, filters = vgg_spec[num_layers]
     data = mx.sym.Variable(name="data")
     if dtype == 'float16':
         data = mx.sym.Cast(data=data, dtype=np.float16)
     feature = get_feature(data, layers, filters, batch_norm)
     classifier = get_classifier(feature, num_classes)
     if dtype == 'float16':
-        classifier = mx.sym.Cast(data=classifier, dtype=np.float32)  
+        classifier = mx.sym.Cast(data=classifier, dtype=np.float32)
     symbol = mx.sym.SoftmaxOutput(data=classifier, name='softmax')
     return symbol
diff --git a/example/sparse/linear_classification.py b/example/sparse/linear_classification.py
index 70f896386c..1d63c55b11 100644
--- a/example/sparse/linear_classification.py
+++ b/example/sparse/linear_classification.py
@@ -126,8 +126,8 @@
         # evaluate metric on validation dataset
         score = mod.score(eval_data, ['nll_loss'])
         logging.info('epoch %d, eval nll = %s ' % (epoch, score[0][1]))
-        save_optimizer_states = 'dist' not in kv.type
-        mod.save_checkpoint("checkpoint", epoch, save_optimizer_states=False)
+        save_optimizer_states = 'dist' not in kv.type if kv else True
+        mod.save_checkpoint("checkpoint", epoch, save_optimizer_states=save_optimizer_states)
         # reset the iterator for next pass of data
         data_iter.reset()
     logging.info('Training completed.')
diff --git a/example/sparse/matrix_factorization.py b/example/sparse/matrix_factorization.py
index cdb61643d3..3387706665 100644
--- a/example/sparse/matrix_factorization.py
+++ b/example/sparse/matrix_factorization.py
@@ -22,6 +22,8 @@
 import numpy as np
 from get_data import get_movielens_iter, get_movielens_data
 from matrix_fact_model import matrix_fact_net
+
+
 logging.basicConfig(level=logging.DEBUG)
 
 parser = argparse.ArgumentParser(description="Run matrix factorization with sparse embedding",
@@ -36,6 +38,8 @@
                     help="the factor size of the embedding operation")
 parser.add_argument('--use-dense', action='store_true',
                     help="use the dense embedding operator")
+parser.add_argument('--use-gpu', action='store_true',
+                    help="use gpu")
 parser.add_argument('--dummy-iter', action='store_true',
                     help="use the dummy data iterator for speed test")
 
@@ -63,7 +67,7 @@
     print_every = args.print_every
 
     momentum = 0.9
-    ctx = mx.cpu(0)
+    ctx = mx.gpu(0) if args.use_gpu else mx.cpu(0)
     learning_rate = 0.1
 
     # prepare dataset and iterators
@@ -75,7 +79,6 @@
 
     # construct the model
     net = matrix_fact_net(factor_size, factor_size, max_user, max_movies, sparse_embed=use_sparse)
-    a = time.time()
 
     # initialize the module
     mod = mx.module.Module(symbol=net, context=ctx, data_names=['user', 'item'],
diff --git a/include/mxnet/base.h b/include/mxnet/base.h
index cceee70ffd..7c136a6470 100644
--- a/include/mxnet/base.h
+++ b/include/mxnet/base.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file base.h
  * \brief configuation of mxnet as well as basic data structure.
  */
@@ -143,7 +144,8 @@ struct Context {
   enum DeviceType {
     kCPU = cpu::kDevMask,
     kGPU = gpu::kDevMask,
-    kCPUPinned = 3
+    kCPUPinned = 3,
+    kCPUShared = 5,
   };
   /*! \brief the device type we run the op on */
   DeviceType dev_type;
@@ -155,10 +157,17 @@ struct Context {
    * \brief Get corresponding device mask
    * \return cpu::kDevMask or gpu::kDevMask
    */
-  inline int dev_mask() const {
-    if (dev_type == kCPUPinned) return cpu::kDevMask;
+  inline DeviceType dev_mask() const {
+    if (dev_type == kCPUPinned || dev_type == kCPUShared) return kCPU;
     return dev_type;
   }
+  /*!
+   * \brief Returns dev_id for kGPU, 0 otherwise
+   */
+  inline int real_dev_id() const {
+    if (dev_type == kGPU) return dev_id;
+    return 0;
+  }
   /*!
    * \brief Comparator, used to enable Context as std::map key.
    * \param b another context to compare
@@ -200,7 +209,7 @@ struct Context {
     return true;
   }
   /*! \brief the maximal device type */
-  static const int32_t kMaxDevType = 4;
+  static const int32_t kMaxDevType = 6;
   /*! \brief the maximal device index */
   static const int32_t kMaxDevID = 16;
   /*!
@@ -223,6 +232,12 @@ struct Context {
    * \return Pinned CPU context. -1 for current GPU.
    */
   inline static Context CPUPinned(int32_t dev_id = -1);
+  /*!
+   * Create a CPU shared memory context.
+   * \param dev_id dummy device id.
+   * \return CPU shared memory context.
+   */
+  inline static Context CPUShared(int32_t dev_id = 0);
   /*!
    * Create a context from string of the format [cpu|gpu|cpu_pinned](n)
    * \param str the string pattern
@@ -273,7 +288,7 @@ inline Context Context::Create(DeviceType dev_type, int32_t dev_id) {
   ctx.dev_type = dev_type;
   if (dev_id < 0) {
     ctx.dev_id = 0;
-    if (dev_type != kCPU) {
+    if (dev_type & kGPU) {
 #if MXNET_USE_CUDA
       CHECK_EQ(cudaGetDevice(&ctx.dev_id), cudaSuccess);
 #else
@@ -293,6 +308,10 @@ inline Context Context::CPUPinned(int32_t dev_id) {
   return Create(kCPUPinned, dev_id);
 }
 
+inline Context Context::CPUShared(int32_t dev_id) {
+  return Create(kCPUShared, dev_id);
+}
+
 inline Context Context::GPU(int32_t dev_id) {
   return Create(kGPU, dev_id);
 }
@@ -313,6 +332,8 @@ inline Context Context::FromString(std::string str) {
       ret = GPU(id);
     } else if (type == "cpu_pinned") {
       ret = CPUPinned(id);
+    } else if (type == "cpu_shared") {
+      ret = CPUShared(id);
     } else {
       LOG(FATAL) << "Invalid context string " << str;
     }
@@ -329,6 +350,8 @@ inline std::ostream& operator<<(std::ostream &out, const Context &ctx) {
     out << "gpu(";
   } else if (ctx.dev_type == Context::kCPUPinned) {
     out << "cpu_pinned(";
+  } else if (ctx.dev_type == Context::kCPUShared) {
+    out << "cpu_shared(";
   } else {
     out << "unknown(";
   }
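
On the frontend side, the new device type would be addressed like any other context string. A minimal sketch, assuming the Python frontend registers the `cpu_shared` device type introduced above (the C++ side parses it via `Context::FromString`):

```python
import mxnet as mx

# hypothetical frontend usage of the new device type; 'cpu' and
# 'cpu_pinned' are addressed the same way today
ctx = mx.Context('cpu_shared', 0)
print(ctx)   # cpu_shared(0)
```
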
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 55b840dd2c..77fc6a5f50 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file c_api.h
  * \brief C API of mxnet
  */
@@ -146,7 +147,9 @@ enum CustomOpPropCallbacks {
   kCustomOpPropInferShape,
   kCustomOpPropDeclareBackwardDependency,
   kCustomOpPropCreateOperator,
-  kCustomOpPropInferType
+  kCustomOpPropInferType,
+  kCustomOpPropInferStorageType,
+  kCustomOpPropBackwardInferStorageType
 };
 
 
@@ -158,6 +161,10 @@ typedef int (*CustomOpListFunc)(char*** /*args*/, void* /*state*/);
 typedef int (*CustomOpInferShapeFunc)(int /*num_input*/, int* /*ndims*/,
                                       unsigned** /*shapes*/, void* /*state*/);
 typedef int (*CustomOpInferTypeFunc)(int /*num_input*/, int* /*types*/, void* /*state*/);
+typedef int (*CustomOpInferStorageTypeFunc)(int /*num_input*/, int* /*stypes*/, void* /*state*/);
+typedef int (*CustomOpBackwardInferStorageTypeFunc)(int /*num_input*/,
+                                                    int * /*stypes*/,
+                                                    void * /*state*/);
 typedef int (*CustomOpBwdDepFunc)(const int* /*out_grad*/, const int* /*in_data*/,
                                   const int* /*out_data*/, int* /*num_deps*/,
                                   int** /*rdeps*/, void* /*state*/);
@@ -232,6 +239,13 @@ MXNET_DLL int MXDumpProfile();
 /*! \brief Set the number of OMP threads to use */
 MXNET_DLL int MXSetNumOMPThreads(int thread_num);
 
+/*!
+ * \brief set bulk execution limit
+ * \param bulk_size new bulk_size
+ * \param prev_bulk_size previous bulk_size
+ */
+MXNET_DLL int MXEngineSetBulkSize(int bulk_size, int* prev_bulk_size);
+
 /*!
  * \brief get the MXNet library version as an integer
  * \param pointer to the integer holding the version number
@@ -404,6 +418,12 @@ MXNET_DLL int MXNDArraySyncCopyFromNDArray(NDArrayHandle handle_dst,
                                            const NDArrayHandle handle_src,
                                            const int i);
 
+/*!
+ * \brief check whether the NDArray format is valid
+ * \param full_check if `True`, rigorous check, O(N) operations
+ *    Otherwise basic check, O(1) operations
+ */
+MXNET_DLL int MXNDArraySyncCheckFormat(NDArrayHandle handle, const bool full_check);
 /*!
  * \brief Wait until all the pending writes with respect NDArray are finished.
  *  Always call this before read data out synchronizely.
@@ -1994,6 +2014,26 @@ MXNET_DLL int MXRtcCudaKernelCall(CudaKernelHandle handle, int dev_id, void** ar
                                   mx_uint grid_dim_z, mx_uint block_dim_x,
                                   mx_uint block_dim_y, mx_uint block_dim_z,
                                   mx_uint shared_mem);
+/*!
+ * \brief Get shared memory handle from NDArray
+ * \param handle NDArray handle.
+ * \param shared_pid output PID
+ * \param shared_id output shared memory id.
+ */
+MXNET_DLL int MXNDArrayGetSharedMemHandle(NDArrayHandle handle, int* shared_pid,
+                                          int* shared_id);
+/*!
+ * \brief Reconstruct NDArray from shared memory handle
+ * \param shared_pid shared PID
+ * \param shared_id shared memory id
+ * \param shape pointer to NDArray dimensions
+ * \param ndim number of NDArray dimensions
+ * \param dtype data type of NDArray
+ * \param out constructed NDArray
+ */
+MXNET_DLL int MXNDArrayCreateFromSharedMem(int shared_pid, int shared_id, const mx_uint *shape,
+                                           mx_uint ndim, int dtype, NDArrayHandle *out);
+
 
 #ifdef __cplusplus
 }
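
A minimal ctypes sketch of the new shared-memory and bulk-size entry points declared above, using only the signatures shown in this header; it assumes an NDArray backed by shared memory (e.g. allocated in the new `cpu_shared` context):

```python
import ctypes
import mxnet as mx
from mxnet.base import _LIB, NDArrayHandle, check_call

# assumption: allocating in the cpu_shared context yields a shared-memory
# backed NDArray that the handle APIs below can export
arr = mx.nd.zeros((2, 3), ctx=mx.Context('cpu_shared', 0))

pid, sid = ctypes.c_int(), ctypes.c_int()
check_call(_LIB.MXNDArrayGetSharedMemHandle(
    arr.handle, ctypes.byref(pid), ctypes.byref(sid)))

# reconstruct the array (e.g. in another process) from (pid, sid, shape, dtype)
shape = (ctypes.c_uint * 2)(2, 3)
out = NDArrayHandle()
check_call(_LIB.MXNDArrayCreateFromSharedMem(
    pid, sid, shape, 2, 0, ctypes.byref(out)))   # dtype 0 == float32
arr2 = mx.nd.NDArray(out)

# raise the engine's operator-bulking limit, capturing the previous value
prev = ctypes.c_int()
check_call(_LIB.MXEngineSetBulkSize(16, ctypes.byref(prev)))
```
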
diff --git a/include/mxnet/c_predict_api.h b/include/mxnet/c_predict_api.h
index 8cf153e7ca..e4bfb398d5 100644
--- a/include/mxnet/c_predict_api.h
+++ b/include/mxnet/c_predict_api.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file c_predict_api.h
  * \brief C predict API of mxnet, contains a minimum API to run prediction.
  *  This file is self-contained, and do not dependent on any other files.
diff --git a/include/mxnet/engine.h b/include/mxnet/engine.h
index 4c2314e176..366a6b61b3 100644
--- a/include/mxnet/engine.h
+++ b/include/mxnet/engine.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file engine.h
  * \brief Engine that schedules all the operations according to dependency.
  */
@@ -112,6 +113,18 @@ class MXNET_API Engine {
    * \return 0 when success, -1 when failure happens.
    */
   virtual void NotifyShutdown() = 0;
+  /*!
+   *\brief Stop all workers in the engine
+   */
+  virtual void Stop() {
+    LOG(FATAL) << "Engine cannot be stopped";
+  }
+  /*!
+   * \brief Restart all workers in the engine
+   */
+  virtual void Start() {
+    LOG(FATAL) << "Engine cannot be restarted";
+  }
   /*!
    * \brief Allocate a new variable, the variable can then
    *        be used to schedule the operation concurrently via dependency
@@ -221,12 +234,12 @@ class MXNET_API Engine {
    * \param opr_name The operator name.
    * \tparam SyncFn the synchronous function to be pushed.
    */
-  inline void PushSync(SyncFn exec_fn, Context exec_ctx,
-                       std::vector<VarHandle> const& const_vars,
-                       std::vector<VarHandle> const& mutable_vars,
-                       FnProperty prop = FnProperty::kNormal,
-                       int priority = 0,
-                       const char* opr_name = nullptr) {
+  virtual void PushSync(SyncFn exec_fn, Context exec_ctx,
+                        std::vector<VarHandle> const& const_vars,
+                        std::vector<VarHandle> const& mutable_vars,
+                        FnProperty prop = FnProperty::kNormal,
+                        int priority = 0,
+                        const char* opr_name = nullptr) {
     this->PushAsync([exec_fn](RunContext ctx, CallbackOnComplete on_complete) {
         exec_fn(ctx);
         on_complete();
@@ -267,6 +280,14 @@ class MXNET_API Engine {
     }
     read_vars->resize(rtop - read_vars->begin());
   }
+  /*! \brief query current limit for bulk size */
+  virtual int bulk_size() const {
+    return 0;
+  }
+  /*! \brief set maximum limit for bulk size */
+  virtual int set_bulk_size(int) {
+    return 0;
+  }
 };  // class Engine
 #endif  // DMLC_USE_CXX11
 }  // namespace mxnet
diff --git a/include/mxnet/executor.h b/include/mxnet/executor.h
index 85d34778dd..d749100f5d 100644
--- a/include/mxnet/executor.h
+++ b/include/mxnet/executor.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file executor.h
  * \brief Symbolic executor interface of mxnet.
  * \author Min Lin, Bing Xu
diff --git a/include/mxnet/io.h b/include/mxnet/io.h
index 68c1ede65a..3c806d85d5 100644
--- a/include/mxnet/io.h
+++ b/include/mxnet/io.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file io.h
  * \brief mxnet io data structure and data iterator
  */
diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h
index ddaa207dab..1649c43680 100644
--- a/include/mxnet/kvstore.h
+++ b/include/mxnet/kvstore.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file kvstore.h
  * \brief key-value store interface for mxnet
  */
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index 84ee9fa5e4..fa0c367052 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file ndarray.h
  * \brief NDArray interface that handles array arithematics.
  */
@@ -62,6 +63,15 @@ enum NDArrayStorageType {
   kCSRStorage,             // csr
 };
 
+enum NDArrayFormatErr {
+  kNormalErr,     // normal
+  kCSRShapeErr,   // shape mismatch for csr
+  kCSRIndPtrErr,  // indptr error for csr
+  kCSRIdxErr,     // idx error for csr
+  kRSPShapeErr,   // shape mismatch for row sparse
+  kRSPIdxErr,     // indices error for row sparse
+};
+
 
 /*!
  * \brief ndarray interface
@@ -151,6 +161,14 @@ class NDArray {
     Mkl_mem_ = std::make_shared<MKLMemHolder>();
 #endif
   }
+  /*! \brief create ndarray from shared memory */
+  NDArray(int shared_pid, int shared_id, const TShape& shape, int dtype)
+      : ptr_(std::make_shared<Chunk>(shared_pid, shared_id, shape, dtype)), shape_(shape),
+        dtype_(dtype), storage_type_(kDefaultStorage), entry_({nullptr, 0, 0}) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = std::make_shared<MKLMemHolder>();
+#endif
+  }
 
   /*!
    * \brief constructing a static NDArray of non-default storage that shares data with TBlob
@@ -308,6 +326,13 @@ class NDArray {
     }
     return true;
   }
+  /*! \brief get storage handle */
+  inline Storage::Handle storage_handle() const {
+    CHECK(!is_none());
+    CHECK_EQ(storage_type(), kDefaultStorage);
+    CheckAndAlloc();
+    return ptr_->shandle;
+  }
   /*!
    * \brief Block until all the pending write operations with respect
    *    to current NDArray are finished, and read can be performed.
@@ -326,7 +351,10 @@ class NDArray {
      * Push an empty mutable function to flush all preceding reads to the
      * variable.
      */
-    Engine::Get()->PushSync([](RunContext) {}, Context{}, {}, {ptr_->var});
+    Engine::Get()->PushAsync(
+      [](RunContext, Engine::CallbackOnComplete on_complete) {
+        on_complete();
+      }, Context{}, {}, {ptr_->var});
     Engine::Get()->WaitForVar(ptr_->var);
   }
   /*! \return the associated variable of the ndarray.*/
@@ -446,6 +474,12 @@ class NDArray {
    * \param size the memory size we want to copy into, in sizeof(DType) not raw btyes.
    */
   void SyncCopyToCPU(void *data, size_t size) const;
+  /*!
+  * \brief check whether the NDArray format is valid
+  * \param full_check if `True`, rigorous check, O(N) operations
+  *    Otherwise basic check, O(1) operations
+  */
+  void SyncCheckFormat(const bool full_check) const;
   /*!
    * \brief Slice a NDArray
    * \param begin begin index in first dim (inclusive)
@@ -664,6 +698,18 @@ class NDArray {
       shandle.size = data.shape_.Size() * mshadow::mshadow_sizeof(data.type_flag_);
       storage_shape = data.shape_;
     }
+
+    Chunk(int shared_pid, int shared_id, const TShape& shape, int dtype)
+        : static_data(false), delay_alloc(false) {
+      var = Engine::Get()->NewVariable();
+      ctx = Context::CPUShared(0);
+      shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
+      shandle.ctx = ctx;
+      shandle.shared_pid = shared_pid;
+      shandle.shared_id = shared_id;
+      Storage::Get()->Alloc(&shandle);
+      storage_shape = shape;
+    }
     // Constructor for a non-default storage chunk
     Chunk(NDArrayStorageType storage_type_, const TShape &storage_shape_, Context ctx_,
           bool delay_alloc_, int dtype, const std::vector<int> &aux_types_,
diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h
index 9c512eedd5..8cb8a99b46 100644
--- a/include/mxnet/op_attr_types.h
+++ b/include/mxnet/op_attr_types.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file op_attr_types.h
  * \brief Additional operator attributes
  *  beside the ones provided by NNVM
diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h
index 2245db0dbb..cfa1627804 100644
--- a/include/mxnet/operator.h
+++ b/include/mxnet/operator.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file operator.h
  * \brief Operator interface of mxnet.
  * \author Naiyan Wang
diff --git a/include/mxnet/operator_util.h b/include/mxnet/operator_util.h
index 92ef2ecc58..bebe3f13ae 100644
--- a/include/mxnet/operator_util.h
+++ b/include/mxnet/operator_util.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file operator_util.h
  * \brief Utility functions and registries to help quickly build new operators.
  *  [Deprecated]
diff --git a/include/mxnet/resource.h b/include/mxnet/resource.h
index 1ca1fc6fa7..7d2e6caf85 100644
--- a/include/mxnet/resource.h
+++ b/include/mxnet/resource.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file resource.h
  * \brief Global resource allocation handling.
  */
diff --git a/include/mxnet/storage.h b/include/mxnet/storage.h
index 7e3af8eeca..d137540513 100644
--- a/include/mxnet/storage.h
+++ b/include/mxnet/storage.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file storage.h
  * \brief Storage manager across multiple devices.
  */
@@ -50,6 +51,11 @@ class Storage {
      * \brief Context information about device and ID.
      */
     Context ctx;
+    /*!
+     * \brief Id for IPC shared memory
+     */
+    int shared_pid{-1};
+    int shared_id{-1};
   };
   /*!
    * \brief Allocate a new contiguous memory for a given size.
@@ -57,7 +63,23 @@ class Storage {
    * \param ctx Context information about the device and ID.
    * \return Handle struct.
    */
-  virtual Handle Alloc(size_t size, Context ctx) = 0;
+  Handle Alloc(size_t size, Context ctx) {
+    Handle hd;
+    hd.size = size;
+    hd.ctx = ctx;
+    this->Alloc(&hd);
+    return hd;
+  }
+  /*!
+   * \brief Allocate a new contiguous memory for a given size.
+   * \param handle handle initialized with size and ctx
+   */
+  virtual void Alloc(Handle* handle) = 0;
+  /*!
+   * \brief Increase ref counter on shared memory.
+   * \param handle handle to shared memory.
+   */
+  virtual void SharedIncrementRefCount(Handle handle) = 0;
   /*!
    * \brief Free storage.
    * \param handle Handle struect.
diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h
index 18bf4fa780..b65cd2b434 100755
--- a/include/mxnet/tensor_blob.h
+++ b/include/mxnet/tensor_blob.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2014 by Contributors
  * \file tensor_blob.h
  * \brief TBlob class that holds common representation of
  *  arbirary dimension tensor, can be used to transformed
diff --git a/mshadow b/mshadow
index 2d7780c3f2..7ff9a4e08d 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit 2d7780c3f2eefe4453fa419862d1b2089bedb8d5
+Subproject commit 7ff9a4e08dd6d0a3c462fc4169de590708080ccc
diff --git a/nnvm b/nnvm
index 8d79cfd0b4..b279286304 160000
--- a/nnvm
+++ b/nnvm
@@ -1 +1 @@
-Subproject commit 8d79cfd0b42fbe9f6ad75886d495065d5500b9dd
+Subproject commit b279286304ac954098d94a2695bca599e832effb
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Types.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Types.pm
index e9b1e5522e..ea17f19554 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Types.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Types.pm
@@ -77,4 +77,4 @@ subtype "SymbolOrArrayOfSymbols" => as "AI::MXNet::Symbol|ArrayRef[AI::MXNet::Sy
 subtype "NameShapeOrDataDesc" => as "NameShape|AI::MXNet::DataDesc";
 subtype "AdvancedSlice"   => as "ArrayRef[ArrayRef|PDL|PDL::Matrix|AI::MXNet::NDArray]";
 
-1;
\ No newline at end of file
+1;
diff --git a/plugin/caffe/caffe_blob.cc b/plugin/caffe/caffe_blob.cc
index 697efbfa99..4d655f32dd 100644
--- a/plugin/caffe/caffe_blob.cc
+++ b/plugin/caffe/caffe_blob.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file caffe_blob.cc
  * \brief Implementations of SetDataGradToBlob given various device/dimension
  * \author Haoran Wang
diff --git a/plugin/caffe/caffe_blob.h b/plugin/caffe/caffe_blob.h
index 666d269fda..a54c5c81ff 100644
--- a/plugin/caffe/caffe_blob.h
+++ b/plugin/caffe/caffe_blob.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file caffe_blob.h
  * \brief conversion between tensor and caffeBlob
  * \author Haoran Wang
diff --git a/plugin/caffe/caffe_common.cc b/plugin/caffe/caffe_common.cc
index 53513a17d6..dd445efbd6 100644
--- a/plugin/caffe/caffe_common.cc
+++ b/plugin/caffe/caffe_common.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file caffe_common.h
  * \brief Common functions for caffeOp and caffeLoss symbols
  * \author Haoran Wang
diff --git a/plugin/caffe/caffe_common.h b/plugin/caffe/caffe_common.h
index 8565d9e2e2..ba7b9ad7b8 100644
--- a/plugin/caffe/caffe_common.h
+++ b/plugin/caffe/caffe_common.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file caffe_common.h
  * \brief Common functions for caffeOp and caffeLoss symbols
  * \author Haoran Wang
diff --git a/plugin/caffe/caffe_data_iter.cc b/plugin/caffe/caffe_data_iter.cc
index 2682298b4b..cc96c3898e 100644
--- a/plugin/caffe/caffe_data_iter.cc
+++ b/plugin/caffe/caffe_data_iter.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file caffe_data_iter.cc
  * \brief register mnist iterator
 */
diff --git a/plugin/caffe/caffe_fieldentry.h b/plugin/caffe/caffe_fieldentry.h
index 47d246f443..f97b76519e 100644
--- a/plugin/caffe/caffe_fieldentry.h
+++ b/plugin/caffe/caffe_fieldentry.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file caffe_fieldentry.h
  * \brief Implement FieldEntry<caffe::LayerParameter>
  * \author Haoran Wang
diff --git a/plugin/caffe/caffe_loss-inl.h b/plugin/caffe/caffe_loss-inl.h
index 37bfcf06be..60b03b1d92 100644
--- a/plugin/caffe/caffe_loss-inl.h
+++ b/plugin/caffe/caffe_loss-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file caffe_loss-inl.h
  * \brief Caffe Operator
  * \author Haoran Wang
diff --git a/plugin/caffe/caffe_loss.cc b/plugin/caffe/caffe_loss.cc
index ce697d6c8f..5ce8bb247e 100644
--- a/plugin/caffe/caffe_loss.cc
+++ b/plugin/caffe/caffe_loss.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file caffe_loss.cc
  * \brief caffe loss
  * \author Haoran Wang
diff --git a/plugin/caffe/caffe_loss.cu b/plugin/caffe/caffe_loss.cu
index 2002cf2c66..698dbe1f1b 100644
--- a/plugin/caffe/caffe_loss.cu
+++ b/plugin/caffe/caffe_loss.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file caffe_loss_gpu.cc
  * \brief caffe loss
  * \author Haoran Wang
diff --git a/plugin/caffe/caffe_op-inl.h b/plugin/caffe/caffe_op-inl.h
index 43b9b5a091..2c1c9bac17 100644
--- a/plugin/caffe/caffe_op-inl.h
+++ b/plugin/caffe/caffe_op-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file caffe_op-inl.h
  * \brief Caffe Operator
  * \author Haoran Wang
diff --git a/plugin/caffe/caffe_op.cc b/plugin/caffe/caffe_op.cc
index 5198ccaac7..9db9df0406 100644
--- a/plugin/caffe/caffe_op.cc
+++ b/plugin/caffe/caffe_op.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file caffe_op.cc
  * \brief caffe operator
  * \author Haoran Wang
diff --git a/plugin/caffe/caffe_op.cu b/plugin/caffe/caffe_op.cu
index be6c20a108..0802b61313 100644
--- a/plugin/caffe/caffe_op.cu
+++ b/plugin/caffe/caffe_op.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file caffe_operator_gpu.cc
  * \brief caffe operator
  * \author Haoran Wang
diff --git a/plugin/caffe/caffe_stream.cc b/plugin/caffe/caffe_stream.cc
index 03badda65c..823948a8aa 100644
--- a/plugin/caffe/caffe_stream.cc
+++ b/plugin/caffe/caffe_stream.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file caffe_stream.cc
  * \brief define stream operators >> and <<
  * \author Haoran Wang
diff --git a/plugin/caffe/caffe_stream.h b/plugin/caffe/caffe_stream.h
index b9a08d028f..228e3727da 100644
--- a/plugin/caffe/caffe_stream.h
+++ b/plugin/caffe/caffe_stream.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file caffe_stream.h
  * \brief define stream operators >> and <<
  * \author Haoran Wang
diff --git a/plugin/opencv/cv_api.cc b/plugin/opencv/cv_api.cc
index b0bcbbce20..1508de376d 100644
--- a/plugin/opencv/cv_api.cc
+++ b/plugin/opencv/cv_api.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file cv_api.h
  * \brief C API for opencv
  * \author Junyuan Xie
diff --git a/plugin/opencv/cv_api.h b/plugin/opencv/cv_api.h
index e04357bf30..b318041eb6 100644
--- a/plugin/opencv/cv_api.h
+++ b/plugin/opencv/cv_api.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file cv_api.h
  * \brief C API for opencv
  * \author Junyuan Xie
diff --git a/plugin/sframe/iter_sframe.cc b/plugin/sframe/iter_sframe.cc
index 2a987e2b10..9f09916b81 100644
--- a/plugin/sframe/iter_sframe.cc
+++ b/plugin/sframe/iter_sframe.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file iter_sframe_image.cc
  * \brief
  * \author Bing Xu
diff --git a/plugin/torch/torch_base.cc b/plugin/torch/torch_base.cc
index 89f832ccdf..8a9d85b064 100644
--- a/plugin/torch/torch_base.cc
+++ b/plugin/torch/torch_base.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file torch_base.cc
  * \brief torch_state
  * \author Junyuan Xie
diff --git a/plugin/torch/torch_base.h b/plugin/torch/torch_base.h
index 3aaaa2f139..04bee24974 100644
--- a/plugin/torch/torch_base.h
+++ b/plugin/torch/torch_base.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file torch_base.h
  * \brief Torch interface.
  * \author Junyuan Xie
diff --git a/plugin/torch/torch_criterion-inl.h b/plugin/torch/torch_criterion-inl.h
index 7f592f1562..e0687ab39b 100644
--- a/plugin/torch/torch_criterion-inl.h
+++ b/plugin/torch/torch_criterion-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file torch_module-inl.h
  * \brief torch module operator
  * \author Min Lin
diff --git a/plugin/torch/torch_criterion.cc b/plugin/torch/torch_criterion.cc
index bdfb2f42e6..110a58156a 100644
--- a/plugin/torch/torch_criterion.cc
+++ b/plugin/torch/torch_criterion.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file activation.cc
  * \brief activation op
  * \author Junyuan Xie
diff --git a/plugin/torch/torch_criterion.cu b/plugin/torch/torch_criterion.cu
index 68c519c7c9..ccb7145f36 100644
--- a/plugin/torch/torch_criterion.cu
+++ b/plugin/torch/torch_criterion.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file activation.cc
  * \brief activation op
  * \author Bing Xu
diff --git a/plugin/torch/torch_function.cc b/plugin/torch/torch_function.cc
index a1c5ff578d..3ec9a000ac 100644
--- a/plugin/torch/torch_function.cc
+++ b/plugin/torch/torch_function.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file torch_base.cc
  * \brief torch_state
  * \author Junyuan Xie
diff --git a/plugin/torch/torch_function.h b/plugin/torch/torch_function.h
index 8fb2ccfde4..f6f760231b 100644
--- a/plugin/torch/torch_function.h
+++ b/plugin/torch/torch_function.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file torch_function.h
  * \brief Torch interface.
  * \author Junyuan Xie
diff --git a/plugin/torch/torch_module-inl.h b/plugin/torch/torch_module-inl.h
index 15b569fbbe..7fb0440aa5 100644
--- a/plugin/torch/torch_module-inl.h
+++ b/plugin/torch/torch_module-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file torch_module-inl.h
  * \brief torch module operator
  * \author Min Lin
diff --git a/plugin/torch/torch_module.cc b/plugin/torch/torch_module.cc
index 658669fb41..4ab792c4dd 100644
--- a/plugin/torch/torch_module.cc
+++ b/plugin/torch/torch_module.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file activation.cc
  * \brief activation op
  * \author Bing Xu
diff --git a/plugin/torch/torch_module.cu b/plugin/torch/torch_module.cu
index caf9eb1991..d743da5fd9 100644
--- a/plugin/torch/torch_module.cu
+++ b/plugin/torch/torch_module.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file activation.cc
  * \brief activation op
  * \author Bing Xu
diff --git a/plugin/warpctc/warpctc-inl.h b/plugin/warpctc/warpctc-inl.h
index d492656b2f..5a540c5794 100644
--- a/plugin/warpctc/warpctc-inl.h
+++ b/plugin/warpctc/warpctc-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file warpctc-inl.h
  * \brief warpctc operator
  * \author Liang Xiang
diff --git a/plugin/warpctc/warpctc.cc b/plugin/warpctc/warpctc.cc
index 0ff61be758..055a6d645d 100644
--- a/plugin/warpctc/warpctc.cc
+++ b/plugin/warpctc/warpctc.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file warpctc.cc
  * \brief warpctc op
  * \author Liang Xiang
diff --git a/plugin/warpctc/warpctc.cu b/plugin/warpctc/warpctc.cu
index 7562a12a3c..3ee20fc9d3 100644
--- a/plugin/warpctc/warpctc.cu
+++ b/plugin/warpctc/warpctc.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file warpctc.cc
  * \brief warpctc op
  * \author Liang Xiang
diff --git a/ps-lite b/ps-lite
index bdd4c67e9e..acdb698fa3 160000
--- a/ps-lite
+++ b/ps-lite
@@ -1 +1 @@
-Subproject commit bdd4c67e9e34dc0b8350ce306b0caa737eb31c83
+Subproject commit acdb698fa3bb80929ef83bb37c705f025e119b82
diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py
index cf0ba37ab9..4e2c4f0134 100644
--- a/python/mxnet/__init__.py
+++ b/python/mxnet/__init__.py
@@ -22,6 +22,7 @@
 from __future__ import absolute_import
 
 from .context import Context, current_context, cpu, gpu
+from . import engine
 from .base import MXNetError
 from . import base
 from . import contrib
diff --git a/python/mxnet/_ctypes/ndarray.py b/python/mxnet/_ctypes/ndarray.py
index 0d02c049e3..a0c01a6e06 100644
--- a/python/mxnet/_ctypes/ndarray.py
+++ b/python/mxnet/_ctypes/ndarray.py
@@ -24,7 +24,7 @@
 import ctypes
 
 from ..base import _LIB
-from ..base import c_array, c_str
+from ..base import c_str_array, c_handle_array
 from ..base import NDArrayHandle, CachedOpHandle
 from ..base import check_call
 
@@ -69,7 +69,7 @@ def _imperative_invoke(handle, ndargs, keys, vals, out):
         if isinstance(out, NDArrayBase):
             out = (out,)
         num_output = ctypes.c_int(len(out))
-        output_vars = c_array(NDArrayHandle, [i.handle for i in out])
+        output_vars = c_handle_array(out)
         output_vars = ctypes.cast(output_vars, ctypes.POINTER(NDArrayHandle))
     else:
         original_output = None
@@ -83,12 +83,12 @@ def _imperative_invoke(handle, ndargs, keys, vals, out):
     check_call(_LIB.MXImperativeInvokeEx(
         ctypes.c_void_p(handle),
         ctypes.c_int(len(ndargs)),
-        c_array(NDArrayHandle, [arr.handle for arr in ndargs]),
+        c_handle_array(ndargs),
         ctypes.byref(num_output),
         ctypes.byref(output_vars),
         ctypes.c_int(len(keys)),
-        c_array(ctypes.c_char_p, [c_str(key) for key in keys]),
-        c_array(ctypes.c_char_p, [c_str(str(val)) for val in vals]),
+        c_str_array(keys),
+        c_str_array([str(s) for s in vals]),
         ctypes.byref(out_stypes)))
 
     if original_output is not None:
@@ -122,7 +122,7 @@ def __call__(self, *args, **kwargs):
             if isinstance(out, NDArrayBase):
                 out = (out,)
             num_output = ctypes.c_int(len(out))
-            output_vars = c_array(NDArrayHandle, [i.handle for i in out])
+            output_vars = c_handle_array(out)
             output_vars = ctypes.cast(output_vars, ctypes.POINTER(NDArrayHandle))
         else:
             original_output = None
@@ -140,7 +140,7 @@ def __call__(self, *args, **kwargs):
         check_call(_LIB.MXInvokeCachedOpEx(
             self.handle,
             ctypes.c_int(len(args)),
-            c_array(NDArrayHandle, [arr.handle for arr in args]),
+            c_handle_array(args),
             ctypes.byref(num_output),
             ctypes.byref(output_vars),
             ctypes.byref(out_stypes)))
diff --git a/python/mxnet/_ctypes/symbol.py b/python/mxnet/_ctypes/symbol.py
index 3ec2ddcdc5..fe4cb950ed 100644
--- a/python/mxnet/_ctypes/symbol.py
+++ b/python/mxnet/_ctypes/symbol.py
@@ -22,7 +22,7 @@
 
 import ctypes
 from ..base import _LIB
-from ..base import c_array, c_str, mx_uint
+from ..base import c_str_array, c_handle_array, c_str, mx_uint
 from ..base import SymbolHandle
 from ..base import check_call
 
@@ -79,11 +79,11 @@ def _compose(self, *args, **kwargs):
 
         num_args = len(args) + len(kwargs)
         if len(kwargs) != 0:
-            keys = c_array(ctypes.c_char_p, [c_str(key) for key in kwargs])
-            args = c_array(SymbolHandle, [s.handle for s in kwargs.values()])
+            keys = c_str_array(kwargs.keys())
+            args = c_handle_array(kwargs.values())
         else:
             keys = None
-            args = c_array(SymbolHandle, [s.handle for s in args])
+            args = c_handle_array(args)
         check_call(_LIB.NNSymbolCompose(
             self.handle, name, num_args, keys, args))
 
@@ -95,10 +95,8 @@ def _set_attr(self, **kwargs):
         **kwargs
             The attributes to set
         """
-        keys = c_array(ctypes.c_char_p,
-                       [c_str(key) for key in kwargs])
-        vals = c_array(ctypes.c_char_p,
-                       [c_str(str(val)) for val in kwargs.values()])
+        keys = c_str_array(kwargs.keys())
+        vals = c_str_array([str(s) for s in kwargs.values()])
         num_args = mx_uint(len(kwargs))
         check_call(_LIB.MXSymbolSetAttrs(
             self.handle, num_args, keys, vals))
@@ -122,8 +120,8 @@ def _symbol_creator(handle, args, kwargs, keys, vals, name):
     check_call(_LIB.MXSymbolCreateAtomicSymbol(
         ctypes.c_void_p(handle),
         mx_uint(len(keys)),
-        c_array(ctypes.c_char_p, [c_str(i) for i in keys]),
-        c_array(ctypes.c_char_p, [c_str(str(i)) for i in vals]),
+        c_str_array(keys),
+        c_str_array([str(v) for v in vals]),
         ctypes.byref(sym_handle)))
 
     if args and kwargs:
diff --git a/python/mxnet/autograd.py b/python/mxnet/autograd.py
index dc81fbedbf..340a9e66f4 100644
--- a/python/mxnet/autograd.py
+++ b/python/mxnet/autograd.py
@@ -20,12 +20,13 @@
 from __future__ import absolute_import
 from __future__ import division
 
+from array import array
 from threading import Lock
 import traceback
 import ctypes
 from ctypes import c_int, c_void_p, CFUNCTYPE, POINTER, cast
-from .base import _LIB, check_call, string_types
-from .base import mx_uint, NDArrayHandle, c_array, MXCallbackList, SymbolHandle
+from .base import _LIB, check_call, string_types, mx_uint
+from .base import NDArrayHandle, c_array, c_handle_array, c_array_buf, MXCallbackList, SymbolHandle
 from .ndarray import NDArray, _ndarray_cls
 from .ndarray import _GRAD_REQ_MAP
 from .symbol import Symbol
@@ -207,21 +208,16 @@ def mark_variables(variables, gradients, grad_reqs='write'):
         variables = [variables]
         gradients = [gradients]
 
-    variable_handles = []
-    gradient_handles = []
-    for var, gradvar in zip(variables, gradients):
-        variable_handles.append(var.handle)
-        gradient_handles.append(gradvar.handle)
     if isinstance(grad_reqs, string_types):
         grad_reqs = [_GRAD_REQ_MAP[grad_reqs]]*len(variables)
     else:
         grad_reqs = [_GRAD_REQ_MAP[i] for i in grad_reqs]
 
     check_call(_LIB.MXAutogradMarkVariables(
-        len(variable_handles),
-        c_array(NDArrayHandle, variable_handles),
-        c_array(mx_uint, grad_reqs),
-        c_array(NDArrayHandle, gradient_handles)))
+        len(variables),
+        c_handle_array(variables),
+        c_array_buf(mx_uint, array('I', grad_reqs)),
+        c_handle_array(gradients)))
 
 
 def _parse_head(heads, head_grads):
@@ -231,7 +227,7 @@ def _parse_head(heads, head_grads):
     if isinstance(head_grads, NDArray):
         head_grads = [head_grads]
 
-    head_handles = c_array(NDArrayHandle, [i.handle for i in heads])
+    head_handles = c_handle_array(heads)
 
     if head_grads is None:
         hgrad_handles = ctypes.c_void_p(0)
@@ -318,11 +314,10 @@ def grad(heads, variables, head_grads=None, retain_graph=None, create_graph=Fals
     head_handles, hgrad_handles = _parse_head(heads, head_grads)
 
     if isinstance(variables, NDArray):
-        var_handles = [variables.handle]
+        variables = [variables]
     else:
         assert len(variables), "variables cannot be an empty list."
-        var_handles = [i.handle for i in variables]
-    var_handles = c_array(NDArrayHandle, var_handles)
+    var_handles = c_handle_array(variables)
 
     retain_graph = retain_graph if retain_graph is not None else create_graph
     grad_vars = ctypes.POINTER(NDArrayHandle)()
@@ -474,8 +469,6 @@ def delete_entry(_):
                 return False
             return True
 
-        input_handles = [x.handle for x in inputs]
-        output_handles = [x.handle for x in outputs]
         callbacks = [Function._bwd_functype(backward_entry),
                      Function._del_functype(delete_entry)]
         callbacks = [cast(i, CFUNCTYPE(c_int)) for i in callbacks]
@@ -486,9 +479,9 @@ def delete_entry(_):
                                       POINTER(c_void_p)))
         check_call(_LIB.MXCustomFunctionRecord(
             c_int(len(inputs)),
-            c_array(NDArrayHandle, input_handles),
+            c_handle_array(inputs),
             c_int(len(outputs)),
-            c_array(NDArrayHandle, output_handles),
+            c_handle_array(outputs),
             ctypes.byref(context)))
 
         Function._registry.ref_holder[key] = context
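
For illustration, a minimal sketch of the `mark_variables` flow that these handle-array helpers now serve (the names and shapes here are hypothetical, not part of the diff):

    import mxnet as mx
    from mxnet import autograd

    x = mx.nd.ones((2, 3))
    gx = mx.nd.zeros_like(x)                      # buffer that will receive the gradient
    autograd.mark_variables([x], [gx], grad_reqs='write')
    with autograd.record():
        y = (x * 2).sum()
    y.backward()
    print(gx)                                     # all 2s: d(sum(2x))/dx
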
diff --git a/python/mxnet/base.py b/python/mxnet/base.py
index 5882a50921..80fc9011db 100644
--- a/python/mxnet/base.py
+++ b/python/mxnet/base.py
@@ -145,6 +145,7 @@ def check_call(ret):
     if ret != 0:
         raise MXNetError(py_str(_LIB.MXGetLastError()))
 
+
 if sys.version_info[0] < 3:
     def c_str(string):
         """Create ctypes char * from a Python string.
@@ -166,6 +167,24 @@ def c_str(string):
         Hello, World
         """
         return ctypes.c_char_p(string)
+
+    def c_str_array(strings):
+        """Create ctypes const char ** from a list of Python strings.
+
+        Parameters
+        ----------
+        strings : list of string
+            Python strings.
+
+        Returns
+        -------
+        (ctypes.c_char_p * len(strings))
+            A const char ** pointer that can be passed to C API.
+        """
+        arr = (ctypes.c_char_p * len(strings))()
+        arr[:] = strings
+        return arr
+
 else:
     def c_str(string):
         """Create ctypes char * from a Python string.
@@ -188,6 +207,22 @@ def c_str(string):
         """
         return ctypes.c_char_p(string.encode('utf-8'))
 
+    def c_str_array(strings):
+        """Create ctypes const char ** from a list of Python strings.
+
+        Parameters
+        ----------
+        strings : list of string
+            Python strings.
+
+        Returns
+        -------
+        (ctypes.c_char_p * len(strings))
+            A const char ** pointer that can be passed to C API.
+        """
+        arr = (ctypes.c_char_p * len(strings))()
+        arr[:] = [s.encode('utf-8') for s in strings]
+        return arr
 
 def c_array(ctype, values):
     """Create ctypes array from a Python array.
@@ -213,7 +248,55 @@ def c_array(ctype, values):
     >>> x[1]
     2.0
     """
-    return (ctype * len(values))(*values)
+    out = (ctype * len(values))()
+    out[:] = values
+    return out
+
+
+def c_array_buf(ctype, buf):
+    """Create ctypes array from a Python buffer.
+    For primitive types, using the buffer created with array.array is faster
+    than a c_array call.
+
+    Parameters
+    ----------
+    ctype : ctypes data type
+        Data type of the array we want to convert to, such as mx_float.
+
+    buf : buffer type
+        Data content.
+
+    Returns
+    -------
+    out : ctypes array
+        Created ctypes array.
+
+    Examples
+    --------
+    >>> x = mx.base.c_array_buf(mx.base.mx_float, array.array('i', [1, 2, 3]))
+    >>> print len(x)
+    3
+    >>> x[1]
+    2.0
+    """
+    return (ctype * len(buf)).from_buffer(buf)
+
+def c_handle_array(objs):
+    """Create ctypes const void ** from a list of MXNet objects with handles.
+
+    Parameters
+    ----------
+    objs : list of NDArray/Symbol.
+        MXNet objects.
+
+    Returns
+    -------
+    (ctypes.c_void_p * len(objs))
+        A void ** pointer that can be passed to C API.
+    """
+    arr = (ctypes.c_void_p * len(objs))()
+    arr[:] = [o.handle for o in objs]
+    return arr
 
 def ctypes2buffer(cptr, length):
     """Convert ctypes pointer to buffer type.
diff --git a/python/mxnet/context.py b/python/mxnet/context.py
index 9798b480d2..beccaebcef 100644
--- a/python/mxnet/context.py
+++ b/python/mxnet/context.py
@@ -62,8 +62,8 @@ class Context(object):
     """
     # static class variable
     default_ctx = None
-    devtype2str = {1: 'cpu', 2: 'gpu', 3: 'cpu_pinned'}
-    devstr2type = {'cpu': 1, 'gpu': 2, 'cpu_pinned': 3}
+    devtype2str = {1: 'cpu', 2: 'gpu', 3: 'cpu_pinned', 5: 'cpu_shared'}
+    devstr2type = {'cpu': 1, 'gpu': 2, 'cpu_pinned': 3, 'cpu_shared': 5}
     def __init__(self, device_type, device_id=0):
         if isinstance(device_type, Context):
             self.device_typeid = device_type.device_typeid
@@ -128,14 +128,13 @@ def cpu(device_id=0):
 
     Examples
     ----------
-    >>> with mx.Context('cpu', 1):
+    >>> with mx.cpu():
     ...     cpu_array = mx.nd.ones((2, 3))
     >>> cpu_array.context
-    cpu(1)
-    >>> with mx.cpu(1):
-    ...    cpu_array = mx.nd.ones((2, 3))
+    cpu(0)
+    >>> cpu_array = mx.nd.ones((2, 3), ctx=mx.cpu())
     >>> cpu_array.context
-    cpu(1)
+    cpu(0)
 
     Parameters
     ----------
@@ -151,6 +150,36 @@ def cpu(device_id=0):
     return Context('cpu', device_id)
 
 
+def cpu_pinned(device_id=0):
+    """Returns a CPU pinned memory context. Copying from CPU pinned memory to GPU
+    is faster than from normal CPU memory.
+
+    This function is a shortcut for ``Context('cpu_pinned', device_id)``.
+
+    Examples
+    ----------
+    >>> with mx.cpu_pinned():
+    ...     cpu_array = mx.nd.ones((2, 3))
+    >>> cpu_array.context
+    cpu_pinned(0)
+    >>> cpu_array = mx.nd.ones((2, 3), ctx=mx.cpu_pinned())
+    >>> cpu_array.context
+    cpu_pinned(0)
+
+    Parameters
+    ----------
+    device_id : int, optional
+        The device id of the device. `device_id` is not needed for CPU.
+        This is included to make interface compatible with GPU.
+
+    Returns
+    -------
+    context : Context
+        The corresponding CPU pinned memory context.
+    """
+    return Context('cpu_pinned', device_id)
+
+
 def gpu(device_id=0):
     """Returns a GPU context.
 
@@ -159,12 +188,14 @@ def gpu(device_id=0):
 
     Examples
     ----------
-    >>> with mx.Context('gpu', 1):
+    >>> cpu_array = mx.nd.ones((2, 3))
+    >>> cpu_array.context
+    cpu(0)
+    >>> with mx.gpu(1):
     ...     gpu_array = mx.nd.ones((2, 3))
     >>> gpu_array.context
     gpu(1)
-    >>> with mx.gpu(1):
-    ...    gpu_array = mx.nd.ones((2, 3))
+    >>> gpu_array = mx.nd.ones((2, 3), ctx=mx.gpu(1))
     >>> gpu_array.context
     gpu(1)
 
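
The new device types can be exercised directly; a short sketch (pinned memory requires a CUDA-enabled build, and `cpu_shared` backs the multiprocessing DataLoader changes further below):

    import mxnet as mx
    from mxnet.context import Context, cpu_pinned

    pinned = mx.nd.ones((2, 2), ctx=cpu_pinned())         # faster host-to-GPU copies
    shared = mx.nd.zeros((2, 2), ctx=Context('cpu_shared', 0))
    print(pinned.context, shared.context)                 # cpu_pinned(0) cpu_shared(0)
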
diff --git a/python/mxnet/contrib/autograd.py b/python/mxnet/contrib/autograd.py
index 68ce31bb05..c5c9c027ee 100644
--- a/python/mxnet/contrib/autograd.py
+++ b/python/mxnet/contrib/autograd.py
@@ -20,10 +20,11 @@
 from __future__ import absolute_import
 from __future__ import division
 
+from array import array
 import ctypes
 import functools
 from ..base import _LIB, check_call, string_types
-from ..base import mx_uint, NDArrayHandle, c_array
+from ..base import mx_uint, NDArrayHandle, c_array, c_array_buf, c_handle_array
 # pylint: disable= unused-import
 from ..ndarray import NDArray, zeros_like, _GRAD_REQ_MAP
 
@@ -107,21 +108,16 @@ def mark_variables(variables, gradients, grad_reqs='write'):
     gradients: list of NDArray
     grad_reqs: list of string
     """
-    variable_handles = []
-    gradient_handles = []
-    for var, gradvar in zip(variables, gradients):
-        variable_handles.append(var.handle)
-        gradient_handles.append(gradvar.handle)
     if isinstance(grad_reqs, string_types):
         grad_reqs = [_GRAD_REQ_MAP[grad_reqs]]*len(variables)
     else:
         grad_reqs = [_GRAD_REQ_MAP[i] for i in grad_reqs]
 
     check_call(_LIB.MXAutogradMarkVariables(
-        len(variable_handles),
-        c_array(NDArrayHandle, variable_handles),
-        c_array(mx_uint, grad_reqs),
-        c_array(NDArrayHandle, gradient_handles)))
+        len(variables),
+        c_handle_array(variables),
+        c_array_buf(mx_uint, array('I', grad_reqs)),
+        c_handle_array(gradients)))
 
 
 def backward(outputs, out_grads=None, retain_graph=False):
@@ -134,14 +130,11 @@ def backward(outputs, out_grads=None, retain_graph=False):
     """
     assert isinstance(outputs, (list, tuple)), \
         "outputs must be a list or tuple of NDArrays"
-    output_handles = []
-    for arr in outputs:
-        output_handles.append(arr.handle)
 
     if out_grads is None:
         check_call(_LIB.MXAutogradBackward(
-            len(output_handles),
-            c_array(NDArrayHandle, output_handles),
+            len(outputs),
+            c_handle_array(outputs),
             ctypes.c_void_p(0),
             ctypes.c_int(retain_graph)))
         return
@@ -152,12 +145,12 @@ def backward(outputs, out_grads=None, retain_graph=False):
             ograd_handles.append(arr.handle)
         else:
             ograd_handles.append(NDArrayHandle(0))
-    assert len(ograd_handles) == len(output_handles), \
+    assert len(ograd_handles) == len(outputs), \
         "outputs and out_grads must have the same length"
 
     check_call(_LIB.MXAutogradBackward(
-        len(output_handles),
-        c_array(NDArrayHandle, output_handles),
+        len(outputs),
+        c_handle_array(outputs),
         c_array(NDArrayHandle, ograd_handles),
         ctypes.c_int(retain_graph)))
 
diff --git a/python/mxnet/engine.py b/python/mxnet/engine.py
new file mode 100644
index 0000000000..d4d38f1f29
--- /dev/null
+++ b/python/mxnet/engine.py
@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+"""Engine properties management."""
+from __future__ import absolute_import
+
+import ctypes
+from .base import _LIB, check_call
+
+
+def set_bulk_size(size):
+    """Set size limit on bulk execution.
+
+    Bulk execution bundles many operators to run together.
+    This can improve performance when running a lot of small
+    operators sequentially.
+
+    Parameters
+    ----------
+    size : int
+        Maximum number of operators that can be bundled in a bulk.
+
+    Returns
+    -------
+    int
+        Previous bulk size.
+    """
+    prev = ctypes.c_int()
+    check_call(_LIB.MXEngineSetBulkSize(
+        ctypes.c_int(size), ctypes.byref(prev)))
+    return prev.value
+
+
+class _BulkScope(object):
+    """Scope object for bulk execution."""
+    def __init__(self, size):
+        self._size = size
+        self._old_size = None
+
+    def __enter__(self):
+        self._old_size = set_bulk_size(self._size)
+        return self
+
+    def __exit__(self, ptype, value, trace):
+        set_bulk_size(self._old_size)
+
+
+def bulk(size):
+    """Bulk execution bundles many operators to run together.
+    This can improve performance when running a lot of small
+    operators sequentially.
+
+    Returns a scope for managing bulk size::
+
+        with mx.engine.bulk(10):
+            x = mx.nd.zeros((1,))
+            for i in range(100):
+                x += 1
+    """
+    return _BulkScope(size)
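
Since `bulk` is just a scope around `set_bulk_size`, the manual form looks like this; a sketch assuming only the calls defined above:

    import mxnet as mx

    prev = mx.engine.set_bulk_size(16)    # returns the previous limit
    try:
        x = mx.nd.zeros((1,))
        for _ in range(100):
            x += 1                        # small ops may be bundled, up to 16 at a time
    finally:
        mx.engine.set_bulk_size(prev)     # restore; `with mx.engine.bulk(16):` does this for you
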
diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py
index 5cc94a5e80..579e6d3e35 100644
--- a/python/mxnet/executor.py
+++ b/python/mxnet/executor.py
@@ -25,7 +25,7 @@
 import numpy as np
 from .base import _LIB
 from .base import mx_uint, NDArrayHandle, ExecutorHandle
-from .base import check_call, c_array, py_str
+from .base import check_call, c_handle_array, py_str
 from .ndarray import NDArray
 from .ndarray import _ndarray_cls
 from . import ndarray as nd
@@ -226,7 +226,7 @@ def backward(self, out_grads=None, is_train=True):
         for obj in out_grads:
             if not isinstance(obj, NDArray):
                 raise TypeError("inputs must be NDArray")
-        ndarray = c_array(NDArrayHandle, [item.handle for item in out_grads])
+        ndarray = c_handle_array(out_grads)
         check_call(_LIB.MXExecutorBackwardEx(
             self.handle,
             mx_uint(len(out_grads)),
diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py
index 4f029bf409..beb228ec24 100644
--- a/python/mxnet/gluon/data/dataloader.py
+++ b/python/mxnet/gluon/data/dataloader.py
@@ -20,24 +20,107 @@
 """Dataset generator."""
 __all__ = ['DataLoader']
 
+import multiprocessing
+import multiprocessing.queues
+from multiprocessing.reduction import ForkingPickler
+import pickle
+import io
+import os
+import sys
+import warnings
 import numpy as np
 
 from . import sampler as _sampler
-from ... import nd
+from ... import nd, context
 
 
-def _batchify(data):
+def rebuild_ndarray(*args):
+    """Rebuild ndarray from pickled shared memory"""
+    # pylint: disable=no-value-for-parameter
+    return nd.NDArray(nd.ndarray._new_from_shared_mem(*args))
+
+
+def reduce_ndarray(data):
+    """Reduce ndarray to shared memory handle"""
+    return rebuild_ndarray, data._to_shared_mem()
+
+ForkingPickler.register(nd.NDArray, reduce_ndarray)
+
+
+class ConnectionWrapper(object):
+    """Connection wrapper for multiprocessing that supports sending
+    NDArray via shared memory."""
+
+    def __init__(self, conn):
+        self.conn = conn
+
+    def send(self, obj):
+        """Send object"""
+        buf = io.BytesIO()
+        ForkingPickler(buf, pickle.HIGHEST_PROTOCOL).dump(obj)
+        self.send_bytes(buf.getvalue())
+
+    def recv(self):
+        """Receive object"""
+        buf = self.recv_bytes()
+        return pickle.loads(buf)
+
+    def __getattr__(self, name):
+        """Emmulate conn"""
+        return getattr(self.conn, name)
+
+
+class Queue(multiprocessing.queues.Queue):
+    """Wrapper for multiprocessing queue that dumps NDArray with shared memory."""
+    def __init__(self, *args, **kwargs):
+        if sys.version_info[0] <= 2:
+            super(Queue, self).__init__(*args, **kwargs)
+        else:
+            super(Queue, self).__init__(*args, ctx=multiprocessing.get_context(),
+                                        **kwargs)
+        self._reader = ConnectionWrapper(self._reader)
+        self._writer = ConnectionWrapper(self._writer)
+        self._send = self._writer.send
+        self._recv = self._reader.recv
+
+
+def default_batchify_fn(data):
     """Collate data into batch."""
     if isinstance(data[0], nd.NDArray):
         return nd.stack(*data)
     elif isinstance(data[0], tuple):
         data = zip(*data)
-        return [_batchify(i) for i in data]
+        return [default_batchify_fn(i) for i in data]
     else:
         data = np.asarray(data)
         return nd.array(data, dtype=data.dtype)
 
 
+def default_mp_batchify_fn(data):
+    """Collate data into batch. Use shared memory for stacking."""
+    if isinstance(data[0], nd.NDArray):
+        out = nd.empty((len(data),) + data[0].shape, dtype=data[0].dtype,
+                       ctx=context.Context('cpu_shared', 0))
+        return nd.stack(*data, out=out)
+    elif isinstance(data[0], tuple):
+        data = zip(*data)
+        return [default_mp_batchify_fn(i) for i in data]
+    else:
+        data = np.asarray(data)
+        return nd.array(data, dtype=data.dtype,
+                        ctx=context.Context('cpu_shared', 0))
+
+
+def worker_loop(dataset, key_queue, data_queue, batchify_fn):
+    """Worker loop for multiprocessing DataLoader."""
+    while True:
+        idx, samples = key_queue.get()
+        if idx is None:
+            break
+        batch = batchify_fn([dataset[i] for i in samples])
+        data_queue.put((idx, batch))
+
+
 class DataLoader(object):
     """Loads data from a dataset and returns mini-batches of data.
 
@@ -62,9 +145,27 @@ class DataLoader(object):
     batch_sampler : Sampler
         A sampler that returns mini-batches. Do not specify batch_size,
         shuffle, sampler, and last_batch if batch_sampler is specified.
+    batchify_fn : callable
+        Callback function to allow users to specify how to merge samples
+        into a batch. Defaults to `default_batchify_fn`::
+
+            def default_batchify_fn(data):
+                if isinstance(data[0], nd.NDArray):
+                    return nd.stack(*data)
+                elif isinstance(data[0], tuple):
+                    data = zip(*data)
+                    return [default_batchify_fn(i) for i in data]
+                else:
+                    data = np.asarray(data)
+                    return nd.array(data, dtype=data.dtype)
+
+    num_workers : int, default 0
+        The number of multiprocessing workers to use for data preprocessing.
+        `num_workers > 0` is not supported on Windows yet.
     """
     def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None,
-                 last_batch=None, batch_sampler=None):
+                 last_batch=None, batch_sampler=None, batchify_fn=None,
+                 num_workers=0):
         self._dataset = dataset
 
         if batch_sampler is None:
@@ -87,10 +188,53 @@ def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None,
                              "not be specified if batch_sampler is specified.")
 
         self._batch_sampler = batch_sampler
+        if num_workers > 0 and os.name == 'nt':
+            warnings.warn("DataLoader does not support num_workers > 0 on Windows yet.")
+            num_workers = 0
+        self._num_workers = num_workers
+        if batchify_fn is None:
+            if num_workers > 0:
+                self._batchify_fn = default_mp_batchify_fn
+            else:
+                self._batchify_fn = default_batchify_fn
+        else:
+            self._batchify_fn = batchify_fn
 
     def __iter__(self):
-        for batch in self._batch_sampler:
-            yield _batchify([self._dataset[idx] for idx in batch])
+        if self._num_workers == 0:
+            for batch in self._batch_sampler:
+                yield self._batchify_fn([self._dataset[idx] for idx in batch])
+            return
+
+        key_queue = Queue()
+        data_queue = Queue(2*self._num_workers)
+
+        workers = []
+        for _ in range(self._num_workers):
+            worker = multiprocessing.Process(
+                target=worker_loop,
+                args=(self._dataset, key_queue, data_queue, self._batchify_fn))
+            worker.daemon = True
+            worker.start()
+            workers.append(worker)
+
+        for idx, batch in enumerate(self._batch_sampler):
+            key_queue.put((idx, batch))
+
+        data_buffer = {}
+        curr_idx = 0
+        for _ in range(len(self._batch_sampler)):
+            idx, batch = data_queue.get()
+            data_buffer[idx] = batch
+            while curr_idx in data_buffer:
+                yield data_buffer.pop(curr_idx)
+                curr_idx += 1
+
+        for _ in range(self._num_workers):
+            key_queue.put((None, None))
+
+        for worker in workers:
+            worker.join()
 
     def __len__(self):
         return len(self._batch_sampler)
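
A usage sketch of the extended DataLoader; `ArrayDataset` and the toy shapes are illustrative only:

    import mxnet as mx
    from mxnet.gluon.data import ArrayDataset, DataLoader

    dataset = ArrayDataset(mx.nd.random.uniform(shape=(100, 3)), mx.nd.arange(100))

    def batchify(samples):
        data, label = zip(*samples)        # custom merge instead of default_batchify_fn
        return mx.nd.stack(*data), mx.nd.stack(*label)

    loader = DataLoader(dataset, batch_size=8, shuffle=True,
                        batchify_fn=batchify, num_workers=2)   # 2 worker processes
    for data, label in loader:
        pass                               # batches are yielded in sampler order
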
diff --git a/python/mxnet/gluon/model_zoo/vision/__init__.py b/python/mxnet/gluon/model_zoo/vision/__init__.py
index a9a539bf20..619711e71d 100644
--- a/python/mxnet/gluon/model_zoo/vision/__init__.py
+++ b/python/mxnet/gluon/model_zoo/vision/__init__.py
@@ -30,6 +30,7 @@
 -  `MobileNet`_
 
 You can construct a model with random weights by calling its constructor:
+
 .. code::
 
     from mxnet.gluon.model_zoo import vision
@@ -39,8 +40,8 @@
     densenet = vision.densenet_161()
 
 We provide pre-trained models for all the models except ResNet V2.
-These can constructed by passing
-``pretrained=True``:
+These can be constructed by passing ``pretrained=True``:
+
 .. code::
 
     from mxnet.gluon.model_zoo import vision
diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py
index 8034ab8415..15c8285bc9 100644
--- a/python/mxnet/gluon/nn/basic_layers.py
+++ b/python/mxnet/gluon/nn/basic_layers.py
@@ -490,8 +490,8 @@ class Lambda(Block):
     Output:
         - ** *outputs **: one or more output data. Their shapes depend on the function.
     """
-    def __init__(self, function):
-        super(Lambda, self).__init__()
+    def __init__(self, function, prefix=None):
+        super(Lambda, self).__init__(prefix=prefix)
         if isinstance(function, str):
             assert hasattr(nd, function), \
                    "Function name %s is not found in ndarray." % function
@@ -534,14 +534,17 @@ class HybridLambda(HybridBlock):
     Output:
         - ** *outputs **: one or more output data. Their shapes depend on the function.
     """
-    def __init__(self, function):
-        super(HybridLambda, self).__init__()
+    def __init__(self, function, prefix=None):
+        super(HybridLambda, self).__init__(prefix=prefix)
         if isinstance(function, str):
             assert hasattr(nd, function) and hasattr(sym, function), \
                    "Function name %s is not found in symbol/ndarray." % function
-            self._func = lambda F, *args: getattr(F, function)(*args)
+            func_dict = {sym: getattr(sym, function), nd: getattr(nd, function)}
+            self._func = lambda F, *args: func_dict[F](*args)
+            self._func_name = function
         elif callable(function):
             self._func = function
+            self._func_name = function.__name__
         else:
             raise ValueError(
                 "Unrecognized function in lambda: {} of type {}"
@@ -552,4 +555,4 @@ def hybrid_forward(self, F, x, *args):
 
     def __repr__(self):
         return '{name}({function})'.format(name=self.__class__.__name__,
-                                           function=self._func_impl.__name__)
+                                           function=self._func_name)
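
A sketch of the new `prefix` argument and the repr fix:

    from mxnet.gluon import nn

    net = nn.HybridSequential()
    net.add(nn.HybridLambda('tanh', prefix='act1_'),                   # looked up in both nd and sym
            nn.HybridLambda(lambda F, x: F.relu(x), prefix='act2_'))
    print(net)    # repr now reports 'tanh' / '<lambda>' via _func_name
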
diff --git a/python/mxnet/gluon/rnn/rnn_layer.py b/python/mxnet/gluon/rnn/rnn_layer.py
index 3a4f712fb8..204f3c9bd5 100644
--- a/python/mxnet/gluon/rnn/rnn_layer.py
+++ b/python/mxnet/gluon/rnn/rnn_layer.py
@@ -299,11 +299,11 @@ class RNN(_RNNLayer):
     --------
     >>> layer = mx.gluon.rnn.RNN(100, 3)
     >>> layer.initialize()
-    >>> input = mx.nd.random_uniform(shape=(5, 3, 10))
+    >>> input = mx.nd.random.uniform(shape=(5, 3, 10))
     >>> # by default zeros are used as begin state
     >>> output = layer(input)
     >>> # manually specify begin state.
-    >>> h0 = mx.nd.random_uniform(shape=(3, 3, 100))
+    >>> h0 = mx.nd.random.uniform(shape=(3, 3, 100))
     >>> output, hn = layer(input, h0)
     """
     def __init__(self, hidden_size, num_layers=1, activation='relu',
@@ -399,12 +399,12 @@ class LSTM(_RNNLayer):
     --------
     >>> layer = mx.gluon.rnn.LSTM(100, 3)
     >>> layer.initialize()
-    >>> input = mx.nd.random_uniform(shape=(5, 3, 10))
+    >>> input = mx.nd.random.uniform(shape=(5, 3, 10))
     >>> # by default zeros are used as begin state
     >>> output = layer(input)
     >>> # manually specify begin state.
-    >>> h0 = mx.nd.random_uniform(shape=(3, 3, 100))
-    >>> c0 = mx.nd.random_uniform(shape=(3, 3, 100))
+    >>> h0 = mx.nd.random.uniform(shape=(3, 3, 100))
+    >>> c0 = mx.nd.random.uniform(shape=(3, 3, 100))
     >>> output, hn = layer(input, [h0, c0])
     """
     def __init__(self, hidden_size, num_layers=1, layout='TNC',
@@ -496,11 +496,11 @@ class GRU(_RNNLayer):
     --------
     >>> layer = mx.gluon.rnn.GRU(100, 3)
     >>> layer.initialize()
-    >>> input = mx.nd.random_uniform(shape=(5, 3, 10))
+    >>> input = mx.nd.random.uniform(shape=(5, 3, 10))
     >>> # by default zeros are used as begin state
     >>> output = layer(input)
     >>> # manually specify begin state.
-    >>> h0 = mx.nd.random_uniform(shape=(3, 3, 100))
+    >>> h0 = mx.nd.random.uniform(shape=(3, 3, 100))
     >>> output, hn = layer(input, h0)
     """
     def __init__(self, hidden_size, num_layers=1, layout='TNC',
diff --git a/python/mxnet/io.py b/python/mxnet/io.py
index ef2f647eec..25a95be787 100644
--- a/python/mxnet/io.py
+++ b/python/mxnet/io.py
@@ -29,7 +29,7 @@
     h5py = None
 import numpy as np
 from .base import _LIB
-from .base import c_array, c_str, mx_uint, py_str
+from .base import c_str_array, mx_uint, py_str
 from .base import DataIterHandle, NDArrayHandle
 from .base import mx_real_t
 from .base import check_call, build_param_doc as _build_param_doc
@@ -919,11 +919,11 @@ def creator(*args, **kwargs):
         param_vals = []
 
         for k, val in kwargs.items():
-            param_keys.append(c_str(k))
-            param_vals.append(c_str(str(val)))
+            param_keys.append(k)
+            param_vals.append(str(val))
         # create atomic symbol
-        param_keys = c_array(ctypes.c_char_p, param_keys)
-        param_vals = c_array(ctypes.c_char_p, param_vals)
+        param_keys = c_str_array(param_keys)
+        param_vals = c_str_array(param_vals)
         iter_handle = DataIterHandle()
         check_call(_LIB.MXDataIterCreateIter(
             handle,
diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py
index adfef9a949..8625303ee4 100644
--- a/python/mxnet/kvstore.py
+++ b/python/mxnet/kvstore.py
@@ -19,12 +19,13 @@
 """ Key value store interface of MXNet for parameter synchronization."""
 from __future__ import absolute_import
 
+from array import array
 import ctypes
 import pickle
 from .ndarray import NDArray
 from .ndarray import _ndarray_cls
-from .base import _LIB
-from .base import check_call, c_array, c_str, string_types, mx_uint, py_str
+from .base import _LIB, c_str_array, c_handle_array, c_array, c_array_buf, c_str
+from .base import check_call, string_types, mx_uint, py_str
 from .base import NDArrayHandle, KVStoreHandle
 from . import optimizer as opt
 
@@ -46,22 +47,22 @@ def _ctype_key_value(keys, vals):
             assert(use_str_keys == str_keys_i), "inconsistent types of keys detected."
         c_keys_arr = c_array(ctypes.c_char_p, c_keys) if use_str_keys \
                      else c_array(ctypes.c_int, c_keys)
-        c_vals_arr = c_array(NDArrayHandle, c_vals)
+        c_vals_arr = c_array(ctypes.c_void_p, c_vals)
         return (c_keys_arr, c_vals_arr, use_str_keys)
 
     assert(isinstance(keys, (int,) + string_types)), \
            "unexpected type for keys: " + str(type(keys))
     use_str_keys = isinstance(keys, string_types)
     if isinstance(vals, NDArray):
-        c_keys = c_array(ctypes.c_char_p, [c_str(keys)]) if use_str_keys \
-                 else c_array(ctypes.c_int, [keys])
-        return (c_keys, c_array(NDArrayHandle, [vals.handle]), use_str_keys)
+        c_keys = c_str_array([keys]) if use_str_keys \
+                 else c_array_buf(ctypes.c_int, array('i', [keys]))
+        return (c_keys, c_handle_array([vals]), use_str_keys)
     else:
         for value in vals:
             assert(isinstance(value, NDArray))
-        c_keys = c_array(ctypes.c_char_p, [c_str(keys)] * len(vals)) if use_str_keys \
-                 else c_array(ctypes.c_int, [keys] * len(vals))
-        return (c_keys, c_array(NDArrayHandle, [value.handle for value in vals]), use_str_keys)
+        c_keys = c_str_array([keys] * len(vals)) if use_str_keys \
+                 else c_array_buf(ctypes.c_int, array('i', [keys] * len(vals)))
+        return (c_keys, c_handle_array(vals), use_str_keys)
 
 def _updater_wrapper(updater):
     """A wrapper for the user-defined handle."""
diff --git a/python/mxnet/module/bucketing_module.py b/python/mxnet/module/bucketing_module.py
index dd6cafb277..fa92c5d1a1 100644
--- a/python/mxnet/module/bucketing_module.py
+++ b/python/mxnet/module/bucketing_module.py
@@ -85,6 +85,7 @@ def __init__(self, sym_gen, default_bucket_key=None, logger=logging,
         self._curr_module = None
         self._curr_bucket_key = None
         self._params_dirty = False
+        self._monitor = None
 
     def _reset_bind(self):
         """Internal utility function to reset binding."""
@@ -356,6 +357,8 @@ def switch_bucket(self, bucket_key, data_shapes, label_shapes=None):
             module.bind(data_shapes, label_shapes, self._curr_module.for_training,
                         self._curr_module.inputs_need_grad,
                         force_rebind=False, shared_module=self._buckets[self._default_bucket_key])
+            if self._monitor is not None:
+                module.install_monitor(self._monitor)
             self._buckets[bucket_key] = module
 
         self._curr_module = self._buckets[bucket_key]
@@ -499,5 +502,6 @@ def symbol(self):
     def install_monitor(self, mon):
         """Installs monitor on all executors """
         assert self.binded
+        self._monitor = mon
         for mod in self._buckets.values():
             mod.install_monitor(mon)
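
With `_monitor` cached, one install now also reaches buckets created by later `switch_bucket` calls; a sketch with a hypothetical `sym_gen` and `data_shapes`:

    import mxnet as mx

    mod = mx.mod.BucketingModule(sym_gen, default_bucket_key=10)   # sym_gen assumed defined elsewhere
    mod.bind(data_shapes=data_shapes)                              # install_monitor requires a bound module
    mod.install_monitor(mx.mon.Monitor(interval=1))                # propagated to buckets created later
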
diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py
index 6cbf3284e5..b655ad88c3 100644
--- a/python/mxnet/ndarray/ndarray.py
+++ b/python/mxnet/ndarray/ndarray.py
@@ -27,13 +27,14 @@
 except ImportError:
     from builtins import slice as py_slice
 
+from array import array as native_array
 import ctypes
 import warnings
 import operator
 from functools import reduce # pylint: disable=redefined-builtin
 import numpy as np
 from ..base import _LIB, numeric_types, integer_types
-from ..base import c_array, mx_real_t
+from ..base import c_array, c_array_buf, c_handle_array, mx_real_t
 from ..base import mx_uint, NDArrayHandle, check_call
 from ..base import ctypes2buffer
 from ..context import Context
@@ -129,7 +130,7 @@ def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t):
     """
     hdl = NDArrayHandle()
     check_call(_LIB.MXNDArrayCreateEx(
-        c_array(mx_uint, shape),
+        c_array_buf(mx_uint, native_array('I', shape)),
         mx_uint(len(shape)),
         ctypes.c_int(ctx.device_typeid),
         ctypes.c_int(ctx.device_id),
@@ -139,6 +140,18 @@ def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t):
     return hdl
 
 
+def _new_from_shared_mem(shared_pid, shared_id, shape, dtype):
+    hdl = NDArrayHandle()
+    check_call(_LIB.MXNDArrayCreateFromSharedMem(
+        ctypes.c_int(shared_pid),
+        ctypes.c_int(shared_id),
+        c_array(mx_uint, shape),
+        mx_uint(len(shape)),
+        ctypes.c_int(int(_DTYPE_NP_TO_MX[np.dtype(dtype).type])),
+        ctypes.byref(hdl)))
+    return hdl
+
+
 def waitall():
     """Wait for all async operations to finish in MXNet.
 
@@ -173,6 +186,13 @@ def __repr__(self):
     def __reduce__(self):
         return NDArray, (None,), self.__getstate__()
 
+    def _to_shared_mem(self):
+        shared_pid = ctypes.c_int()
+        shared_id = ctypes.c_int()
+        check_call(_LIB.MXNDArrayGetSharedMemHandle(
+            self.handle, ctypes.byref(shared_pid), ctypes.byref(shared_id)))
+        return shared_pid.value, shared_id.value, self.shape, self.dtype
+
     def __add__(self, other):
         """x.__add__(y) <=> x+y <=> mx.nd.add(x, y) """
         return add(self, other)
@@ -412,13 +432,14 @@ def __setitem__(self, key, value):
             raise ValueError('Indexing NDArray with index=%s and type=%s is not supported'
                              % (str(key), str(type(key))))
 
+    # pylint: disable=line-too-long
     def __getitem__(self, key):
         """x.__getitem__(i) <=> x[i]
         Returns a sliced view of this array if the elements fetched are contiguous in memory;
         otherwise, returns a newly created NDArray.
         This functions supports advanced indexing defined in the following reference with
         some limitations.
-        https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.indexing.html#combining-advanced-and-basic-indexing  # pylint: disable=line-too-long
+        https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.indexing.html#combining-advanced-and-basic-indexing
         The following features/functionality are not supported for now:
         1. If key is a list type, only a list of integers is supported,
            i.e. key=[1, 2] is okay, while not for key=[[1]].
@@ -469,6 +490,7 @@ def __getitem__(self, key):
         else:
             raise ValueError('Indexing NDArray with index=%s and type=%s is not supported'
                              % (str(key), str(type(key))))
+    # pylint: enable=line-too-long
 
     def _get_index_nd(self, key):
         """Returns an index array for use in scatter_nd and gather_nd."""
@@ -939,7 +961,7 @@ def reshape(self, shape):
         # Actual reshape
         check_call(_LIB.MXNDArrayReshape(self.handle,
                                          len(shape),
-                                         c_array(ctypes.c_int, shape),
+                                         c_array_buf(ctypes.c_int, native_array('i', shape)),
                                          ctypes.byref(handle)))
         return NDArray(handle=handle, writable=self.writable)
 
@@ -1957,7 +1979,7 @@ def backward(self, out_grad=None, retain_graph=False, train_mode=True):
             ograd_handles = [out_grad.handle]
 
         check_call(_LIB.MXAutogradBackwardEx(
-            1, c_array(NDArrayHandle, [self.handle]),
+            1, c_handle_array([self]),
             c_array(NDArrayHandle, ograd_handles),
             0,
             ctypes.c_void_p(0),
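
The two new private hooks round-trip an array through shared memory; a sketch of the internal flow that the DataLoader's ForkingPickler registration above relies on:

    import mxnet as mx
    from mxnet import nd

    a = nd.ones((2, 2), ctx=mx.Context('cpu_shared', 0))
    pid, shm_id, shape, dtype = a._to_shared_mem()        # export handle as plain values
    b = nd.NDArray(nd.ndarray._new_from_shared_mem(pid, shm_id, shape, dtype))  # attach, no copy
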
diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py
index 45a269a10d..229044e289 100644
--- a/python/mxnet/ndarray/sparse.py
+++ b/python/mxnet/ndarray/sparse.py
@@ -30,6 +30,7 @@
 
 import ctypes
 import warnings
+from array import array as native_array
 
 __all__ = ["_ndarray_cls", "csr_matrix", "row_sparse_array",
            "BaseSparseNDArray", "CSRNDArray", "RowSparseNDArray"]
@@ -37,7 +38,7 @@
 import numpy as np
 from ..base import NotSupportedForSparseNDArray
 from ..base import _LIB, numeric_types
-from ..base import c_array, mx_real_t, integer_types
+from ..base import c_array_buf, mx_real_t, integer_types
 from ..base import mx_uint, NDArrayHandle, check_call
 from ..context import Context
 from . import _internal
@@ -86,16 +87,16 @@ def _new_alloc_handle(stype, shape, ctx, delay_alloc, dtype, aux_types, aux_shap
     num_aux = mx_uint(len(aux_types))
     check_call(_LIB.MXNDArrayCreateSparseEx(
         ctypes.c_int(int(_STORAGE_TYPE_STR_TO_ID[stype])),
-        c_array(mx_uint, shape),
+        c_array_buf(mx_uint, native_array('I', shape)),
         mx_uint(len(shape)),
         ctypes.c_int(ctx.device_typeid),
         ctypes.c_int(ctx.device_id),
         ctypes.c_int(int(delay_alloc)),
         ctypes.c_int(int(_DTYPE_NP_TO_MX[np.dtype(dtype).type])),
         num_aux,
-        c_array(ctypes.c_int, aux_type_ids),
-        c_array(mx_uint, aux_shape_lens),
-        c_array(mx_uint, aux_shapes),
+        c_array_buf(ctypes.c_int, native_array('i', aux_type_ids)),
+        c_array_buf(mx_uint, native_array('I', aux_shape_lens)),
+        c_array_buf(mx_uint, native_array('I', aux_shapes)),
         ctypes.byref(hdl)))
     return hdl
 
@@ -221,6 +222,17 @@ def copyto(self, other):
         else:
             raise TypeError('copyto does not support type ' + str(type(other)))
 
+    def check_format(self, full_check=True):
+        """Check whether the NDArray format is valid.
+
+        Parameters
+        ----------
+        full_check : bool, optional
+            If `True`, perform a rigorous O(N) check; otherwise
+            perform a basic O(1) check (default: True).
+        """
+        check_call(_LIB.MXNDArraySyncCheckFormat(self.handle, ctypes.c_bool(full_check)))
+
     def _data(self):
         """A deep copy NDArray of the data array associated with the BaseSparseNDArray.
 
@@ -735,6 +747,13 @@ def copyto(self, other):
         else:
             raise TypeError('copyto does not support type ' + str(type(other)))
 
+    def retain(self, *args, **kwargs):
+        """Convenience fluent method for :py:func:`retain`.
+
+        The arguments are the same as for :py:func:`retain`, with
+        this array as data.
+        """
+        return retain(self, *args, **kwargs)
 
 def _prepare_src_array(source_array, dtype):
     """Prepare `source_array` so that it can be used to construct NDArray.
diff --git a/python/mxnet/ndarray/utils.py b/python/mxnet/ndarray/utils.py
index 6f3b0ff9c5..4f597c749f 100644
--- a/python/mxnet/ndarray/utils.py
+++ b/python/mxnet/ndarray/utils.py
@@ -19,7 +19,8 @@
 """Utility functions for NDArray and BaseSparseNDArray."""
 import ctypes
 
-from ..base import _LIB, check_call, py_str, c_str, string_types, mx_uint, NDArrayHandle, c_array
+from ..base import _LIB, check_call, py_str, c_str, string_types, mx_uint, NDArrayHandle
+from ..base import c_array, c_handle_array, c_str_array
 from .ndarray import NDArray
 from .ndarray import array as _array
 from .ndarray import empty as _empty_ndarray
@@ -212,27 +213,24 @@ def save(fname, data):
     """
     if isinstance(data, NDArray):
         data = [data]
-    handles = []
+        handles = c_array(NDArrayHandle, [])
     if isinstance(data, dict):
-        keys = []
-        for key, val in data.items():
-            if not isinstance(key, string_types):
-                raise TypeError('save only accept dict str->NDArray or list of NDArray')
-            if not isinstance(val, NDArray):
-                raise TypeError('save only accept dict str->NDArray or list of NDArray')
-            keys.append(c_str(key))
-            handles.append(val.handle)
-        keys = c_array(ctypes.c_char_p, keys)
+        str_keys = data.keys()
+        nd_vals = data.values()
+        if any(not isinstance(k, string_types) for k in str_keys) or \
+           any(not isinstance(v, NDArray) for v in nd_vals):
+            raise TypeError('save only accept dict str->NDArray or list of NDArray')
+        keys = c_str_array(str_keys)
+        handles = c_handle_array(nd_vals)
     elif isinstance(data, list):
-        for val in data:
-            if not isinstance(val, NDArray):
-                raise TypeError('save only accept dict str->NDArray or list of NDArray')
-            handles.append(val.handle)
+        if any(not isinstance(v, NDArray) for v in data):
+            raise TypeError('save only accept dict str->NDArray or list of NDArray')
         keys = None
+        handles = c_handle_array(data)
     else:
         raise ValueError("data needs to either be a NDArray, dict of str, NDArray pairs "
                          "or a list of NDarrays.")
     check_call(_LIB.MXNDArraySave(c_str(fname),
                                   mx_uint(len(handles)),
-                                  c_array(NDArrayHandle, handles),
+                                  handles,
                                   keys))
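
The tightened type checks keep the accepted inputs the same; for reference:

    import mxnet as mx

    mx.nd.save('params.nd', {'w': mx.nd.ones((2,)), 'b': mx.nd.zeros((2,))})  # dict of str -> NDArray
    mx.nd.save('arrays.nd', [mx.nd.ones((2,))])                               # or a plain list
    loaded = mx.nd.load('params.nd')                                          # round-trips the dict
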
diff --git a/python/mxnet/notebook/callback.py b/python/mxnet/notebook/callback.py
index 56321b715b..776900fe59 100644
--- a/python/mxnet/notebook/callback.py
+++ b/python/mxnet/notebook/callback.py
@@ -367,7 +367,7 @@ def _process_batch(self, param, df_name):
             metrics = {}
         metrics['elapsed'] = datetime.datetime.now() - self.start_time
         for key, value in metrics.items():
-            if not self._data[df_name].has_key(key):
+            if key not in self._data[df_name]:
                 self._data[df_name][key] = []
             self._data[df_name][key].append(value)
 
diff --git a/python/mxnet/operator.py b/python/mxnet/operator.py
index 1337bbccc3..f515bf83b8 100644
--- a/python/mxnet/operator.py
+++ b/python/mxnet/operator.py
@@ -16,20 +16,24 @@
 # under the License.
 
 # coding: utf-8
-# pylint: disable=invalid-name, protected-access, too-many-arguments, no-self-use, too-many-locals, broad-except
+# pylint: disable=invalid-name, protected-access, too-many-arguments, no-self-use, too-many-locals, broad-except, too-many-lines
 """numpy interface for operators."""
 from __future__ import absolute_import
 
 import traceback
 
+from array import array
 from threading import Lock
 from ctypes import CFUNCTYPE, POINTER, Structure, pointer
 from ctypes import c_void_p, c_int, c_char, c_char_p, cast, c_bool
 
-from .base import _LIB, check_call, MXCallbackList
-from .base import c_array, c_str, mx_uint, mx_float, ctypes2numpy_shared, NDArrayHandle, py_str
+from .base import _LIB, check_call, MXCallbackList, c_array, c_array_buf
+from .base import c_str, mx_uint, mx_float, ctypes2numpy_shared, NDArrayHandle, py_str
 from . import symbol, context
 from .ndarray import NDArray, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP
+from .ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID, _STORAGE_TYPE_ID_TO_STR
+from .ndarray import _ndarray_cls
+
 
 c_int_p = POINTER(c_int)
 
@@ -206,7 +210,9 @@ def infer_shape_entry(num_tensor, tensor_dims,
             assert len(ishape) == n_in
             rshape = list(ishape) + list(oshape)
             for i in range(n_in+n_out):
-                tensor_shapes[i] = cast(c_array(mx_uint, rshape[i]), POINTER(mx_uint))
+                tensor_shapes[i] = cast(c_array_buf(mx_uint,
+                                                    array('I', rshape[i])),
+                                        POINTER(mx_uint))
                 tensor_dims[i] = len(rshape[i])
 
         def list_outputs_entry(out, _):
@@ -324,7 +330,9 @@ def infer_shape_entry(num_tensor, tensor_dims,
                 assert len(ishape) == n_in
                 rshape = list(ishape) + list(oshape)
                 for i in range(n_in+n_out):
-                    tensor_shapes[i] = cast(c_array(mx_uint, rshape[i]), POINTER(mx_uint))
+                    tensor_shapes[i] = cast(c_array_buf(mx_uint,
+                                                        array('I', rshape[i])),
+                                            POINTER(mx_uint))
                     tensor_dims[i] = len(rshape[i])
             except Exception:
                 print('Error in NDArrayOp.infer_shape: %s' % traceback.format_exc())
@@ -363,7 +371,7 @@ def declare_backward_dependency(out_grad, in_data, out_data, num_dep, deps, _):
                 out_data = [out_data[i] for i in range(len(self.list_outputs()))]
                 rdeps = self.declare_backward_dependency(out_grad, in_data, out_data)
                 num_dep[0] = len(rdeps)
-                rdeps = cast(c_array(c_int, rdeps), c_int_p)
+                rdeps = cast(c_array_buf(c_int, array('i', rdeps)), c_int_p)
                 deps[0] = rdeps
             except Exception:
                 print('Error in NDArrayOp.declare_backward_dependency: %s' % traceback.format_exc())
@@ -513,6 +521,51 @@ def infer_type(self, in_type):
         return in_type, [in_type[0]]*len(self.list_outputs()), \
             [in_type[0]]*len(self.list_auxiliary_states())
 
+    def infer_storage_type(self, in_stype):
+        """infer_storage_type interface. Used to infer storage type of
+        inputs and outputs in the forward pass.
+
+        Parameters
+        ----------
+        in_stype : list of stypes. Valid stypes are default, row_sparse and
+            csr
+
+        Returns
+        -------
+        in_stype : list
+            list of argument stypes.
+        out_stype : list
+            list of output types calculated from in_stype,
+            in the same order as declared in list_outputs.
+        aux_stype : Optional, list
+            list of aux stypes calculated from in_stype,
+            in the same order as declared in list_auxiliary_states.
+        """
+        return in_stype, [in_stype[0]]*len(self.list_outputs()), \
+            [in_stype[0]]*len(self.list_auxiliary_states())
+
+    def infer_storage_type_backward(self, in_stype):
+        """infer_storage_type_backward interface. Used to infer storage
+        type of inputs and outputs in the backward pass.
+
+        Parameters
+        ----------
+        in_stype : list of stypes. Provide the in_stypes in the
+            following order: output_grads, in_data, out_data, aux_data(optional)
+
+        Returns
+        -------
+        in_stype : list
+            list of input stypes.
+        out_stype : list
+            list of output stypes calculated from in_stype.
+        aux_stype : list
+            list of aux stypes calculated from in_stype,
+            in the same order as declared in list_auxiliary_states.
+        """
+        return in_stype, [in_stype[0]]*len(self.list_outputs()), \
+            [in_stype[0]]*len(self.list_auxiliary_states())
+
     def list_outputs(self):
         """list_outputs interface. Can override when creating new operators.
 
@@ -601,6 +654,8 @@ def do_register(prop_cls):
         infershape_functype = CFUNCTYPE(c_int, c_int, POINTER(c_int),
                                         POINTER(POINTER(mx_uint)), c_void_p)
         infertype_functype = CFUNCTYPE(c_int, c_int, POINTER(c_int), c_void_p)
+        inferstorage_functype = CFUNCTYPE(c_int, c_int, POINTER(c_int), c_void_p)
+        inferstorage_backward_functype = CFUNCTYPE(c_int, c_int, POINTER(c_int), c_void_p)
         list_functype = CFUNCTYPE(c_int, POINTER(POINTER(POINTER(c_char))), c_void_p)
         deps_functype = CFUNCTYPE(c_int, c_int_p, c_int_p, c_int_p,
                                   c_int_p, POINTER(c_int_p), c_void_p)
@@ -645,7 +700,9 @@ def infer_shape_entry(num_tensor, tensor_dims,
                         "shapes, got %d."%(n_aux, len(ashape))
                     rshape = list(ishape) + list(oshape) + list(ashape)
                     for i in range(n_in+n_out+n_aux):
-                        tensor_shapes[i] = cast(c_array(mx_uint, rshape[i]), POINTER(mx_uint))
+                        tensor_shapes[i] = cast(c_array_buf(mx_uint,
+                                                            array('I', rshape[i])),
+                                                POINTER(mx_uint))
                         tensor_dims[i] = len(rshape[i])
 
                     infer_shape_entry._ref_holder = [tensor_shapes]
@@ -654,6 +711,81 @@ def infer_shape_entry(num_tensor, tensor_dims,
                     return False
                 return True
 
+            def infer_storage_type_backward_entry(num_tensor, tensor_stypes, _):
+                """C Callback for CustomOpProp::InferStorageTypeBackward"""
+                try:
+                    n_in = len(op_prop.list_arguments())
+                    n_out = len(op_prop.list_outputs())
+                    n_aux = len(op_prop.list_auxiliary_states())
+                    total_inputs = n_in + 2 * n_out
+                    total_aux = n_aux
+                    total_outputs = n_in
+                    assert num_tensor == (2 * n_in + 2 * n_out + n_aux)
+
+                    stypes = [_STORAGE_TYPE_ID_TO_STR[tensor_stypes[i]] \
+                             for i in range(total_inputs + total_aux)]
+                    ret = op_prop.infer_storage_type_backward(stypes)
+                    if len(ret) == 2:
+                        istype, ostype = ret
+                        astype = []
+                    elif len(ret) == 3:
+                        istype, ostype, astype = ret
+                    else:
+                        raise AssertionError("infer_storage_type_backward must return 2 or 3 lists")
+                    assert len(ostype) == total_outputs, \
+                        "InferStorageTypeBackward Error: expecting %d entries in returned output " \
+                        "stypes, got %d."%(total_outputs, len(ostype))
+                    assert len(istype) == (total_inputs), \
+                        "InferStorageTypeBackward Error: expecting %d entries in returned input " \
+                        "stypes, got %d."%(total_inputs, len(istype))
+                    rtype = list(istype) + list(ostype) + list(astype)
+                    for i, dtype in enumerate(rtype):
+                        tensor_stypes[i] = _STORAGE_TYPE_STR_TO_ID[dtype]
+                    infer_storage_type_backward_entry._ref_holder = [tensor_stypes]
+                except Exception:
+                    print('Error in %s.infer_storage_type_backward: %s' % (reg_name, traceback.format_exc()))
+                    return False
+                return True
+
+
+            def infer_storage_type_entry(num_tensor, tensor_stypes, _):
+                """C Callback for CustomOpProp::InferStorageType"""
+                try:
+                    n_in = len(op_prop.list_arguments())
+                    n_out = len(op_prop.list_outputs())
+                    n_aux = len(op_prop.list_auxiliary_states())
+                    assert num_tensor == n_in + n_out + n_aux
+
+                    stypes = [_STORAGE_TYPE_ID_TO_STR[tensor_stypes[i]] for i in range(n_in)]
+                    ret = op_prop.infer_storage_type(stypes)
+                    if len(ret) == 2:
+                        istype, ostype = ret
+                        astype = []
+                    elif len(ret) == 3:
+                        istype, ostype, astype = ret
+                    else:
+                        raise AssertionError("infer_storage_type must return 2 or 3 lists")
+
+                    assert len(ostype) == n_out, \
+                        "InferStorageType Error: expecting %d entries in returned output " \
+                        "stypes, got %d."%(n_out, len(ostype))
+                    assert len(istype) == n_in, \
+                        "InferStorageType Error: expecting %d entries in returned input " \
+                        "stypes, got %d."%(n_in, len(istype))
+                    assert len(astype) == n_aux, \
+                        "InferStorageType Error: expecting %d entries in returned aux state " \
+                        "stypes, got %d."%(n_aux, len(astype))
+                    rtype = list(istype) + list(ostype) + list(astype)
+                    for i, dtype in enumerate(rtype):
+                        tensor_stypes[i] = _STORAGE_TYPE_STR_TO_ID[dtype]
+
+                    infer_storage_type_entry._ref_holder = [tensor_stypes]
+                except Exception:
+                    print('Error in %s.infer_storage_type: %s' % (reg_name, traceback.format_exc()))
+                    return False
+                return True
+
+
             def infer_type_entry(num_tensor, tensor_types, _):
                 """C Callback for CustomOpProp::InferType"""
                 try:
@@ -673,13 +805,13 @@ def infer_type_entry(num_tensor, tensor_types, _):
                         raise AssertionError("infer_type must return 2 or 3 lists")
                     assert len(otype) == n_out, \
                         "InferType Error: expecting %d entries in returned output " \
-                        "shapes, got %d."%(n_out, len(otype))
+                        "types, got %d."%(n_out, len(otype))
                     assert len(itype) == n_in, \
                         "InferType Error: expecting %d entries in returned input " \
-                        "shapes, got %d."%(n_in, len(itype))
+                        "types, got %d."%(n_in, len(itype))
                     assert len(atype) == n_aux, \
                         "InferType Error: expecting %d entries in returned aux state " \
-                        "shapes, got %d."%(n_aux, len(atype))
+                        "types, got %d."%(n_aux, len(atype))
                     rtype = list(itype) + list(otype) + list(atype)
                     for i, dtype in enumerate(rtype):
                         tensor_types[i] = _DTYPE_NP_TO_MX[dtype]
@@ -741,7 +873,7 @@ def declare_backward_dependency_entry(out_grad, in_data, out_data, num_dep, deps
                     out_data = [out_data[i] for i in range(len(op_prop.list_outputs()))]
                     rdeps = op_prop.declare_backward_dependency(out_grad, in_data, out_data)
                     num_dep[0] = len(rdeps)
-                    rdeps = cast(c_array(c_int, rdeps), c_int_p)
+                    rdeps = cast(c_array_buf(c_int, array('i', rdeps)), c_int_p)
                     deps[0] = rdeps
 
                     declare_backward_dependency_entry._ref_holder = [deps]
@@ -768,13 +900,13 @@ def forward_entry(num_ndarray, ndarraies, tags, reqs, is_train, _):
                             tensors = [[] for i in range(5)]
                             for i in range(num_ndarray):
                                 if tags[i] == 1 or tags[i] == 4:
-                                    tensors[tags[i]].append(NDArray(cast(ndarraies[i],
-                                                                         NDArrayHandle),
-                                                                    writable=True))
+                                    tensors[tags[i]].append(_ndarray_cls(cast(ndarraies[i],
+                                                                              NDArrayHandle),
+                                                                         writable=True))
                                 else:
-                                    tensors[tags[i]].append(NDArray(cast(ndarraies[i],
-                                                                         NDArrayHandle),
-                                                                    writable=False))
+                                    tensors[tags[i]].append(_ndarray_cls(cast(ndarraies[i],
+                                                                              NDArrayHandle),
+                                                                         writable=False))
                             reqs = [req_enum[reqs[i]] for i in range(len(tensors[1]))]
                             with ctx:
                                 op.forward(is_train=is_train, req=reqs,
@@ -792,13 +924,13 @@ def backward_entry(num_ndarray, ndarraies, tags, reqs, is_train, _):
                             tensors = [[] for i in range(5)]
                             for i in range(num_ndarray):
                                 if tags[i] == 2 or tags[i] == 4:
-                                    tensors[tags[i]].append(NDArray(cast(ndarraies[i],
-                                                                         NDArrayHandle),
-                                                                    writable=True))
+                                    tensors[tags[i]].append(_ndarray_cls(cast(ndarraies[i],
+                                                                              NDArrayHandle),
+                                                                         writable=True))
                                 else:
-                                    tensors[tags[i]].append(NDArray(cast(ndarraies[i],
-                                                                         NDArrayHandle),
-                                                                    writable=False))
+                                    tensors[tags[i]].append(_ndarray_cls(cast(ndarraies[i],
+                                                                              NDArrayHandle),
+                                                                         writable=False))
                             reqs = [req_enum[reqs[i]] for i in range(len(tensors[2]))]
                             with ctx:
                                 op.backward(req=reqs,
@@ -856,7 +988,9 @@ def delete_entry(_):
                          infershape_functype(infer_shape_entry),
                          deps_functype(declare_backward_dependency_entry),
                          createop_functype(create_operator_entry),
-                         infertype_functype(infer_type_entry)]
+                         infertype_functype(infer_type_entry),
+                         inferstorage_functype(infer_storage_type_entry),
+                         inferstorage_backward_functype(infer_storage_type_backward_entry)]
             callbacks = [cast(i, CFUNCTYPE(c_int)) for i in callbacks]
             contexts = [None]*len(callbacks)
             ret[0] = MXCallbackList(c_int(len(callbacks)),
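The two new `CustomOpProp` hooks above let a pure-Python operator participate in storage-type inference instead of always being densified. A minimal sketch of an operator property that opts into sparse storage, assuming an MXNet build that includes this patch (the class is hypothetical and omits the `CustomOp` implementation and registration for brevity):

    import mxnet as mx

    class SparseIdentityProp(mx.operator.CustomOpProp):
        def list_arguments(self):
            return ['data']

        def list_outputs(self):
            return ['output']

        def infer_storage_type(self, in_stype):
            # keep a row_sparse input sparse through the forward pass:
            # returns (input stypes, output stypes, aux stypes)
            return ['row_sparse'], ['row_sparse'], []

The default implementations shown in the diff simply propagate the first input's stype to every output and aux state, so existing custom operators keep their old dense behaviour unless they override these methods.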
diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 66c261b880..5eb4f05d6d 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -101,7 +101,7 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0.,
         assert isinstance(param_idx2name, dict), \
             'param_idx2name should be a dict of param indexes to names.'
         self.idx2name = param_idx2name.copy()
-        self.sym = sym
+        self.sym_info = (sym.attr_dict(), sym.list_arguments()) if sym is not None else ()
         self.param_dict = param_dict if param_dict else {}
 
         self.set_lr_mult({})
@@ -321,9 +321,9 @@ def set_lr_mult(self, args_lr_mult):
             compatibility, and we recommend to use the name instead.
         """
         self.lr_mult = {}
-        if self.sym is not None:
-            attr = self.sym.attr_dict()
-            for name in self.sym.list_arguments():
+        if self.sym_info:
+            attr, arg_names = self.sym_info
+            for name in arg_names:
                 if name in attr and '__lr_mult__' in attr[name]:
                     self.lr_mult[name] = float(attr[name]['__lr_mult__'])
         self.lr_mult.update(args_lr_mult)
@@ -358,9 +358,9 @@ def set_wd_mult(self, args_wd_mult):
         for n in self.idx2name.values():
             if not (n.endswith('_weight') or n.endswith('_gamma')):
                 self.wd_mult[n] = 0.0
-        if self.sym is not None:
-            attr = self.sym.attr_dict()
-            for name in self.sym.list_arguments():
+        if self.sym_info:
+            attr, arg_names = self.sym_info
+            for name in arg_names:
                 if name in attr and '__wd_mult__' in attr[name]:
                     self.wd_mult[name] = float(attr[name]['__wd_mult__'])
         self.wd_mult.update(args_wd_mult)
@@ -442,13 +442,20 @@ class SGD(Optimizer):
         weight = weight - state
 
     If the storage types of weight, state and grad are all ``row_sparse``, \
-    sparse updates are applied by::
+    **sparse updates** are applied by::
 
         for row in grad.indices:
             rescaled_grad[row] = lr * rescale_grad * clip(grad[row], clip_gradient) + wd * weight[row]
             state[row] = momentum[row] * state[row] + rescaled_grad[row]
             weight[row] = weight[row] - state[row]
 
+    The sparse update only updates the momentum for the weights whose row_sparse
+    gradient indices appear in the current batch, rather than updating it for all
+    indices. Compared with the original update, it can provide large
+    improvements in model training throughput for some applications. However, it
+    provides slightly different semantics than the original update, and
+    may lead to different empirical results.
+
     For details of the update algorithm see
     :class:`~mxnet.ndarray.sgd_update` and :class:`~mxnet.ndarray.sgd_mom_update`.
 
@@ -667,7 +674,7 @@ class Adam(Optimizer):
         w = w - learning_rate * m / (sqrt(v) + epsilon)
 
     If the storage types of weight, state and grad are all ``row_sparse``, \
-    sparse updates are applied by::
+    **sparse updates** are applied by::
 
         for row in grad.indices:
             rescaled_grad[row] = clip(grad[row] * rescale_grad + wd * weight[row], clip_gradient)
@@ -675,6 +682,12 @@ class Adam(Optimizer):
             v[row] = beta2 * v[row] + (1 - beta2) * (rescaled_grad[row]**2)
             w[row] = w[row] - learning_rate * m[row] / (sqrt(v[row]) + epsilon)
 
+    The sparse update only updates the mean and var for the weights whose row_sparse
+    gradient indices appear in the current batch, rather than updating them for all indices.
+    Compared with the original update, it can provide large improvements in model training
+    throughput for some applications. However, it provides slightly different semantics than
+    the original update, and may lead to different empirical results.
+
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
 
@@ -936,7 +949,7 @@ class Ftrl(Optimizer):
         w = (sign(z) * lamda1 - z) / ((beta + sqrt(n)) / learning_rate + wd) * (abs(z) > lamda1)
 
     If the storage types of weight, state and grad are all ``row_sparse``, \
-    sparse updates are applied by::
+    **sparse updates** are applied by::
 
         for row in grad.indices:
             rescaled_grad[row] = clip(grad[row] * rescale_grad, clip_gradient)
@@ -944,6 +957,13 @@ class Ftrl(Optimizer):
             n[row] += rescaled_grad[row]**2
             w[row] = (sign(z[row]) * lamda1 - z[row]) / ((beta + sqrt(n[row])) / learning_rate + wd) * (abs(z[row]) > lamda1)
 
+    The sparse update only updates the z and n for the weights whose row_sparse
+    gradient indices appear in the current batch, rather than updating them for all
+    indices. Compared with the original update, it can provide large
+    improvements in model training throughput for some applications. However, it
+    provides slightly different semantics than the original update, and
+    may lead to different empirical results.
+
     For details of the update algorithm, see :class:`~mxnet.ndarray.ftrl_update`.
 
     This optimizer accepts the following parameters in addition to those accepted
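Two things change in optimizer.py: `Optimizer` now keeps only `(sym.attr_dict(), sym.list_arguments())` instead of the symbol itself, presumably so an optimizer can be serialized (e.g. shipped to a KVStore server) without dragging a symbol handle along, and the three sparse optimizers document their lazy row-wise semantics. A pure-NumPy sketch of the row_sparse SGD momentum update quoted above, with gradient clipping omitted and all names illustrative:

    import numpy as np

    def sparse_sgd_mom_update(weight, state, grad_rows, grad, lr, momentum, wd, rescale_grad):
        # only rows whose indices appear in the row_sparse gradient are touched
        for row in grad_rows:
            rescaled = lr * rescale_grad * grad[row] + wd * weight[row]
            state[row] = momentum * state[row] + rescaled
            weight[row] -= state[row]

Rows absent from the batch keep a stale momentum, which is exactly the semantic difference the docstrings warn about.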
diff --git a/python/mxnet/rtc.py b/python/mxnet/rtc.py
index aff4588be2..4dea0e656b 100644
--- a/python/mxnet/rtc.py
+++ b/python/mxnet/rtc.py
@@ -18,11 +18,12 @@
 """Interface to runtime cuda kernel compile module."""
 from __future__ import absolute_import
 
+from array import array
 import re
 import ctypes
 import numpy as np
 
-from .base import _LIB, mx_uint, c_array, check_call
+from .base import _LIB, mx_uint, c_array, c_array_buf, c_str_array, check_call
 from .base import c_str, CudaModuleHandle, CudaKernelHandle, numeric_types, string_types
 from .ndarray import _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP, NDArray
 
@@ -100,9 +101,9 @@ def __init__(self, source, options=(), exports=()):
         check_call(_LIB.MXRtcCudaModuleCreate(
             c_str(source),
             len(options),
-            c_array(ctypes.c_char_p, [c_str(opt) for opt in options]),
+            c_str_array(options),
             len(exports),
-            c_array(ctypes.c_char_p, [c_str(name) for name in exports]),
+            c_str_array(exports),
             ctypes.byref(self.handle)))
 
     def __del__(self):
@@ -162,9 +163,9 @@ def get_kernel(self, name, signature):
             self.handle,
             c_str(name),
             len(dtypes),
-            c_array(ctypes.c_int, [ctypes.c_int(i) for i in is_ndarray]),
-            c_array(ctypes.c_int, [ctypes.c_int(i) for i in is_const]),
-            c_array(ctypes.c_int, [ctypes.c_int(i) for i in dtypes]),
+            c_array_buf(ctypes.c_int, array('i', is_ndarray)),
+            c_array_buf(ctypes.c_int, array('i', is_const)),
+            c_array_buf(ctypes.c_int, array('i', dtypes)),
             ctypes.byref(hdl)))
 
         return CudaKernel(hdl, name, is_ndarray, dtypes)
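This file, and much of symbol.py below, swaps per-element conversions like `c_array(ctypes.c_int, [ctypes.c_int(i) for i in xs])` for `c_array_buf(ctype, array(...))`, which builds the ctypes array from a buffer in one step. The real helper lives in `python/mxnet/base.py`; a plausible minimal implementation, stated as an assumption rather than a quote:

    from array import array
    import ctypes

    def c_array_buf(ctype, buf):
        # one bulk conversion instead of len(buf) Python-level c_int() calls
        return (ctype * len(buf)).from_buffer(buf)

    ints = c_array_buf(ctypes.c_int, array('i', [1, 2, 3]))
    print(list(ints))   # [1, 2, 3]
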
diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py
index 4713c1ee1c..e2cf0ecb68 100644
--- a/python/mxnet/symbol/symbol.py
+++ b/python/mxnet/symbol/symbol.py
@@ -25,6 +25,7 @@
 except ImportError:
     from builtins import slice as py_slice
 
+from array import array
 import ctypes
 import warnings
 from numbers import Number
@@ -32,8 +33,8 @@
 import numpy as _numpy
 
 from ..attribute import AttrScope
-from ..base import _LIB, numeric_types
-from ..base import c_array, c_str, mx_uint, py_str, string_types
+from ..base import _LIB, numeric_types, c_array, c_array_buf, c_str, c_str_array, c_handle_array
+from ..base import mx_uint, py_str, string_types
 from ..base import NDArrayHandle, ExecutorHandle, SymbolHandle
 from ..base import check_call, MXNetError, NotImplementedForSymbol
 from ..context import Context
@@ -463,11 +464,11 @@ def _compose(self, *args, **kwargs):
 
         num_args = len(args) + len(kwargs)
         if len(kwargs) != 0:
-            keys = c_array(ctypes.c_char_p, [c_str(key) for key in kwargs])
-            args = c_array(SymbolHandle, [s.handle for s in kwargs.values()])
+            keys = c_str_array(kwargs.keys())
+            args = c_handle_array(kwargs.values())
         else:
             keys = None
-            args = c_array(SymbolHandle, [s.handle for s in args])
+            args = c_handle_array(args)
         check_call(_LIB.MXSymbolCompose(
             self.handle, name, num_args, keys, args))
 
@@ -856,7 +857,7 @@ def infer_type(self, *args, **kwargs):
                     types either by positional or kwargs way.')
         sdata = []
         if len(args) != 0:
-            keys = None
+            keys = c_array(ctypes.c_char_p, [])
             for s in args:
                 if s is not None:
                     s = _numpy.dtype(s).type
@@ -866,12 +867,13 @@ def infer_type(self, *args, **kwargs):
                 else:
                     sdata.append(-1)
         else:
-            keys = []
+            str_keys = []
             for k, v in kwargs.items():
                 v = _numpy.dtype(v).type
                 if v in _DTYPE_NP_TO_MX:
-                    keys.append(c_str(k))
+                    str_keys.append(k)
                     sdata.append(_DTYPE_NP_TO_MX[v])
+            keys = c_str_array(str_keys)
         arg_type_size = mx_uint()
         arg_type_data = ctypes.POINTER(ctypes.c_int)()
         out_type_size = mx_uint()
@@ -882,8 +884,8 @@ def infer_type(self, *args, **kwargs):
         check_call(_LIB.MXSymbolInferType(
             self.handle,
             mx_uint(len(sdata)),
-            c_array(ctypes.c_char_p, keys),
-            c_array(ctypes.c_int, sdata),
+            keys,
+            c_array_buf(ctypes.c_int, array('i', sdata)),
             ctypes.byref(arg_type_size),
             ctypes.byref(arg_type_data),
             ctypes.byref(out_type_size),
@@ -1043,7 +1045,7 @@ def _infer_shape_impl(self, partial, *args, **kwargs):
         sdata = []
         indptr = [0]
         if len(args) != 0:
-            keys = None
+            keys = c_array(ctypes.c_char_p, [])
             for i, s in enumerate(args):
                 if s is not None:
                     if not isinstance(s, tuple):
@@ -1052,14 +1054,15 @@ def _infer_shape_impl(self, partial, *args, **kwargs):
                     sdata.extend(s)
                 indptr.append(len(sdata))
         else:
-            keys = []
+            str_keys = []
             for k, v in kwargs.items():
                 if not isinstance(v, tuple):
                     raise TypeError("Arguments need to be shapes (tuple), "
                                     "but '%s' is %s." % (k, type(v)))
-                keys.append(c_str(k))
+                str_keys.append(k)
                 sdata.extend(v)
                 indptr.append(len(sdata))
+            keys = c_str_array(str_keys)
         arg_shape_size = mx_uint()
         arg_shape_ndim = ctypes.POINTER(mx_uint)()
         arg_shape_data = ctypes.POINTER(ctypes.POINTER(mx_uint))()
@@ -1077,9 +1080,9 @@ def _infer_shape_impl(self, partial, *args, **kwargs):
         check_call(infer_func(
             self.handle,
             mx_uint(len(indptr) - 1),
-            c_array(ctypes.c_char_p, keys),
-            c_array(mx_uint, indptr),
-            c_array(mx_uint, sdata),
+            keys,
+            c_array_buf(mx_uint, array('I', indptr)),
+            c_array_buf(mx_uint, array('I', sdata)),
             ctypes.byref(arg_shape_size),
             ctypes.byref(arg_shape_ndim),
             ctypes.byref(arg_shape_data),
@@ -1330,11 +1333,11 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None,
             for k, v in type_dict.items():
                 v = _numpy.dtype(v).type
                 if v in _DTYPE_NP_TO_MX:
-                    provided_arg_type_names.append(c_str(k))
-                    provided_arg_type_data.append(ctypes.c_int(_DTYPE_NP_TO_MX[v]))
+                    provided_arg_type_names.append(k)
+                    provided_arg_type_data.append(_DTYPE_NP_TO_MX[v])
             num_provided_arg_types = mx_uint(len(provided_arg_type_names))
-            provided_arg_type_names = c_array(ctypes.c_char_p, provided_arg_type_names)
-            provided_arg_type_data = c_array(ctypes.c_int, provided_arg_type_data)
+            provided_arg_type_names = c_str_array(provided_arg_type_names)
+            provided_arg_type_data = c_array_buf(ctypes.c_int, array('i', provided_arg_type_data))
 
         # storage types
         num_provided_arg_stypes = 0
@@ -1346,11 +1349,11 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None,
             provided_arg_stype_data = []
             for k, v in stype_dict.items():
                 if v in _STORAGE_TYPE_STR_TO_ID:
-                    provided_arg_stype_names.append(c_str(k))
-                    provided_arg_stype_data.append(ctypes.c_int(_STORAGE_TYPE_STR_TO_ID[v]))
+                    provided_arg_stype_names.append(k)
+                    provided_arg_stype_data.append(_STORAGE_TYPE_STR_TO_ID[v])
             num_provided_arg_stypes = mx_uint(len(provided_arg_stype_names))
-            provided_arg_stype_names = c_array(ctypes.c_char_p, provided_arg_stype_names)
-            provided_arg_stype_data = c_array(ctypes.c_int, provided_arg_stype_data)
+            provided_arg_stype_names = c_str_array(provided_arg_stype_names)
+            provided_arg_stype_data = c_array_buf(ctypes.c_int, array('i', provided_arg_stype_data))
 
         provided_arg_shape_data = []  # shape data
         # argument shape index in sdata,
@@ -1361,7 +1364,7 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None,
             # if k not in listed_arguments and k not in listed_aux_states:
             #   raise ValueError('arg name %s is not valid', k)
             if isinstance(v, tuple):
-                provided_arg_shape_names.append(c_str(k))
+                provided_arg_shape_names.append(k)
                 provided_arg_shape_data.extend(v)
                 provided_arg_shape_idx.append(len(provided_arg_shape_data))
 
@@ -1372,11 +1375,11 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None,
             if isinstance(grad_req, string_types):
                 # use provided_req_type_list_len = 0 to indicate this situation
                 provided_req_type_list_len = 0
-                provided_grad_req_types = [c_str(grad_req)]
+                provided_grad_req_types = [grad_req]
             elif isinstance(grad_req, list):
                 if len(grad_req) == 0:
                     raise RuntimeError('grad_req in simple_bind cannot be an empty list')
-                provided_grad_req_types = [c_str(item) for item in grad_req]
+                provided_grad_req_types = grad_req
                 provided_req_type_list_len = len(provided_grad_req_types)
             elif isinstance(grad_req, dict):
                 if len(grad_req) == 0:
@@ -1384,11 +1387,11 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None,
                 provided_grad_req_names = []
                 provided_grad_req_types = []
                 for k, v in grad_req.items():
-                    provided_grad_req_names.append(c_str(k))
-                    provided_grad_req_types.append(c_str(v))
-                provided_grad_req_names = c_array(ctypes.c_char_p, provided_grad_req_names)
+                    provided_grad_req_names.append(k)
+                    provided_grad_req_types.append(v)
+                provided_grad_req_names = c_str_array(provided_grad_req_names)
                 provided_req_type_list_len = len(provided_grad_req_types)
-            provided_grad_req_types = c_array(ctypes.c_char_p, provided_grad_req_types)
+            provided_grad_req_types = c_str_array(provided_grad_req_types)
 
         num_ctx_map_keys = mx_uint(0)
         ctx_map_keys = ctypes.POINTER(ctypes.c_char_p)()
@@ -1399,20 +1402,20 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None,
             ctx_map_dev_types = []
             ctx_map_dev_ids = []
             for key, val in group2ctx.items():
-                ctx_map_keys.append(c_str(key))
-                ctx_map_dev_types.append(ctypes.c_int(val.device_typeid))
-                ctx_map_dev_ids.append(ctypes.c_int(val.device_id))
+                ctx_map_keys.append(key)
+                ctx_map_dev_types.append(val.device_typeid)
+                ctx_map_dev_ids.append(val.device_id)
             num_ctx_map_keys = mx_uint(len(ctx_map_keys))
-            ctx_map_keys = c_array(ctypes.c_char_p, ctx_map_keys)
-            ctx_map_dev_types = c_array(ctypes.c_int, ctx_map_dev_types)
-            ctx_map_dev_ids = c_array(ctypes.c_int, ctx_map_dev_ids)
+            ctx_map_keys = c_str_array(ctx_map_keys)
+            ctx_map_dev_types = c_array(ctypes.c_int, array('i', ctx_map_dev_types))
+            ctx_map_dev_ids = c_array(ctypes.c_int, array('i', ctx_map_dev_ids))
 
         # prepare param names
         shared_arg_name_list = []
         if shared_arg_names is not None:
             if not isinstance(shared_arg_names, list):
                 raise ValueError('shared_arg_names in simple_bind must be a list or None')
-            shared_arg_name_list = [c_str(name) for name in shared_arg_names]
+            shared_arg_name_list = shared_arg_names
 
         # prepare shared_buffer
         if shared_buffer is None:
@@ -1422,16 +1425,14 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None,
         else:
             if not isinstance(shared_buffer, dict):
                 raise ValueError('shared_buffer in simple_bind must be dict or None')
-            shared_buffer_names = []
-            shared_buffer_handles = []
-            for k, v in shared_buffer.items():
+            buffer_names = shared_buffer.keys()
+            buffer_arrays = shared_buffer.values()
+            for v in buffer_arrays:
                 assert(v.stype == 'default'), \
                     "shared_buffer is expected to only contain NDArrays with default storage"
-                shared_buffer_names.append(c_str(k))
-                shared_buffer_handles.append(v.handle)
-            shared_buffer_names = c_array(ctypes.c_char_p, shared_buffer_names)
-            shared_buffer_len = ctypes.c_int(len(shared_buffer_handles))
-            shared_buffer_handles = c_array(NDArrayHandle, shared_buffer_handles)
+            shared_buffer_names = c_str_array(buffer_names)
+            shared_buffer_len = ctypes.c_int(len(buffer_arrays))
+            shared_buffer_handles = c_handle_array(buffer_arrays)
         updated_shared_buffer_names = ctypes.POINTER(ctypes.c_char_p)()
         updated_shared_buffer_handles = ctypes.POINTER(NDArrayHandle)()
 
@@ -1460,9 +1461,11 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None,
                                                  provided_grad_req_names,
                                                  provided_grad_req_types,
                                                  mx_uint(len(provided_arg_shape_names)),
-                                                 c_array(ctypes.c_char_p, provided_arg_shape_names),
-                                                 c_array(mx_uint, provided_arg_shape_data),
-                                                 c_array(mx_uint, provided_arg_shape_idx),
+                                                 c_str_array(provided_arg_shape_names),
+                                                 c_array_buf(mx_uint,
+                                                             array('I', provided_arg_shape_data)),
+                                                 c_array_buf(mx_uint,
+                                                             array('I', provided_arg_shape_idx)),
                                                  num_provided_arg_types,
                                                  provided_arg_type_names,
                                                  provided_arg_type_data,
@@ -1470,7 +1473,7 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None,
                                                  provided_arg_stype_names,
                                                  provided_arg_stype_data,
                                                  mx_uint(len(shared_arg_name_list)),
-                                                 c_array(ctypes.c_char_p, shared_arg_name_list),
+                                                 c_str_array(shared_arg_name_list),
                                                  ctypes.byref(shared_buffer_len),
                                                  shared_buffer_names,
                                                  shared_buffer_handles,
@@ -1623,19 +1626,19 @@ def bind(self, ctx, args, args_grad=None, grad_req='write',
         if isinstance(grad_req, string_types):
             if grad_req not in _GRAD_REQ_MAP:
                 raise ValueError('grad_req must be in %s' % str(_GRAD_REQ_MAP))
-            reqs_array = c_array(
-                mx_uint,
-                [mx_uint(_GRAD_REQ_MAP[grad_req])] * len(listed_arguments))
+            reqs_array = c_array_buf(mx_uint,
+                                     array('I', [_GRAD_REQ_MAP[grad_req]] * len(listed_arguments)))
         elif isinstance(grad_req, list):
-            reqs_array = c_array(mx_uint, [mx_uint(_GRAD_REQ_MAP[item]) for item in grad_req])
+            reqs_array = c_array_buf(mx_uint,
+                                     array('I', [_GRAD_REQ_MAP[item] for item in grad_req]))
         elif isinstance(grad_req, dict):
             req_array = []
             for name in listed_arguments:
                 if name in grad_req:
-                    req_array.append(mx_uint(_GRAD_REQ_MAP[grad_req[name]]))
+                    req_array.append(_GRAD_REQ_MAP[grad_req[name]])
                 else:
-                    req_array.append(mx_uint(0))
-            reqs_array = c_array(mx_uint, req_array)
+                    req_array.append(0)
+            reqs_array = c_array_buf(mx_uint, array('I', req_array))
 
         ctx_map_keys = []
         ctx_map_dev_types = []
@@ -1643,9 +1646,9 @@ def bind(self, ctx, args, args_grad=None, grad_req='write',
 
         if group2ctx:
             for key, val in group2ctx.items():
-                ctx_map_keys.append(c_str(key))
-                ctx_map_dev_types.append(ctypes.c_int(val.device_typeid))
-                ctx_map_dev_ids.append(ctypes.c_int(val.device_id))
+                ctx_map_keys.append(key)
+                ctx_map_dev_types.append(val.device_typeid)
+                ctx_map_dev_ids.append(val.device_id)
 
         handle = ExecutorHandle()
         shared_handle = shared_exec.handle if shared_exec is not None else ExecutorHandle()
@@ -1653,9 +1656,9 @@ def bind(self, ctx, args, args_grad=None, grad_req='write',
                                          ctypes.c_int(ctx.device_typeid),
                                          ctypes.c_int(ctx.device_id),
                                          mx_uint(len(ctx_map_keys)),
-                                         c_array(ctypes.c_char_p, ctx_map_keys),
-                                         c_array(ctypes.c_int, ctx_map_dev_types),
-                                         c_array(ctypes.c_int, ctx_map_dev_ids),
+                                         c_str_array(ctx_map_keys),
+                                         c_array_buf(ctypes.c_int, array('i', ctx_map_dev_types)),
+                                         c_array_buf(ctypes.c_int, array('i', ctx_map_dev_ids)),
                                          mx_uint(len(args)),
                                          args_handle,
                                          args_grad_handle,
@@ -1688,7 +1691,7 @@ def gradient(self, wrt):
             A gradient Symbol with returns to be the corresponding gradients.
         """
         handle = SymbolHandle()
-        c_wrt = c_array(ctypes.c_char_p, [c_str(key) for key in wrt])
+        c_wrt = c_str_array(wrt)
         check_call(_LIB.MXSymbolGrad(self.handle,
                                      mx_uint(len(wrt)),
                                      c_wrt,
@@ -2450,15 +2453,12 @@ def Group(symbols):
     sym : Symbol
         A group symbol.
      """
-    ihandles = []
-    for sym in symbols:
-        if not isinstance(sym, Symbol):
-            raise TypeError('Expected a list of symbols as input')
-        ihandles.append(sym.handle)
+    if any(not isinstance(sym, Symbol) for sym in symbols):
+        raise TypeError('Expected a list of symbols as input')
     handle = SymbolHandle()
     check_call(_LIB.MXSymbolCreateGroup(
-        mx_uint(len(ihandles)),
-        c_array(SymbolHandle, ihandles), ctypes.byref(handle)))
+        mx_uint(len(symbols)),
+        c_handle_array(symbols), ctypes.byref(handle)))
     return Symbol(handle)
 
 
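`Group` keeps its public behaviour; it just validates up front and gathers handles with a single `c_handle_array` call. A quick usage sketch:

    import mxnet as mx

    a = mx.sym.Variable('a')
    b = mx.sym.Variable('b')
    grouped = mx.sym.Group([a, b])
    arg_shapes, out_shapes, aux_shapes = grouped.infer_shape(a=(2, 3), b=(4,))
    print(out_shapes)                  # [(2, 3), (4,)]
    # mx.sym.Group([a, None])         # raises TypeError('Expected a list of symbols as input')
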
diff --git a/python/mxnet/torch.py b/python/mxnet/torch.py
index b7fce6d5c8..fc815b1469 100644
--- a/python/mxnet/torch.py
+++ b/python/mxnet/torch.py
@@ -23,8 +23,8 @@
 import ctypes
 import sys
 from .base import _LIB
-from .base import c_array, py_str, build_param_doc as _build_param_doc
-from .base import mx_uint, mx_float, NDArrayHandle, FunctionHandle
+from .base import c_array, c_str_array, c_handle_array, py_str, build_param_doc as _build_param_doc
+from .base import mx_uint, mx_float, FunctionHandle
 from .base import check_call
 from .ndarray import NDArray, _new_empty_handle
 
@@ -144,12 +144,12 @@ def generic_torch_function(*args, **kwargs):
 
         check_call(_LIB.MXFuncInvokeEx( \
                    handle, \
-                   c_array(NDArrayHandle, [x.handle for x in ndargs[n_mutate_vars:]]), \
+                   c_handle_array(ndargs[n_mutate_vars:]), \
                    c_array(mx_float, []), \
-                   c_array(NDArrayHandle, [x.handle for x in ndargs[:n_mutate_vars]]),
+                   c_handle_array(ndargs[:n_mutate_vars]),
                    ctypes.c_int(len(kwargs)),
-                   c_array(ctypes.c_char_p, kwargs.keys()),
-                   c_array(ctypes.c_char_p, kwargs.values()),))
+                   c_str_array(kwargs.keys()),
+                   c_str_array(kwargs.values())))
         if n_mutate_vars == 1:
             return ndargs[0]
         else:
diff --git a/scala-package/init-native/src/main/native/ml_dmlc_mxnet_init_native_c_api.cc b/scala-package/init-native/src/main/native/ml_dmlc_mxnet_init_native_c_api.cc
index 114510c66a..19051891df 100644
--- a/scala-package/init-native/src/main/native/ml_dmlc_mxnet_init_native_c_api.cc
+++ b/scala-package/init-native/src/main/native/ml_dmlc_mxnet_init_native_c_api.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file ml_dmlc_mxnet_native_c_api.cc
  * \brief JNI function implementations
  */
diff --git a/scala-package/native/src/main/native/jni_helper_func.h b/scala-package/native/src/main/native/jni_helper_func.h
index 009bbec64e..2ef31b2b39 100644
--- a/scala-package/native/src/main/native/jni_helper_func.h
+++ b/scala-package/native/src/main/native/jni_helper_func.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file jni_helper_func.h
  * \brief Helper functions for operating JVM objects
  */
diff --git a/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc b/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc
index db0f11e27f..f4fe93be06 100644
--- a/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc
+++ b/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file ml_dmlc_mxnet_native_c_api.cc
  * \brief JNI function implementations
  */
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 1d348a5b40..0dde00443a 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file c_api.cc
  * \brief C API of mxnet
  */
@@ -34,6 +35,7 @@
 #include <mxnet/c_api.h>
 #include <mxnet/kvstore.h>
 #include <mxnet/rtc.h>
+#include <mxnet/storage.h>
 #include <vector>
 #include <sstream>
 #include <string>
@@ -136,6 +138,12 @@ int MXSetNumOMPThreads(int thread_num) {
   API_END();
 }
 
+int MXEngineSetBulkSize(int bulk_size, int* prev_bulk_size) {
+  API_BEGIN();
+  *prev_bulk_size = Engine::Get()->set_bulk_size(bulk_size);
+  API_END();
+}
+
 int MXGetVersion(int *out) {
   API_BEGIN();
   *out = static_cast<int>(MXNET_VERSION);
@@ -271,6 +279,13 @@ int MXNDArraySyncCopyFromNDArray(NDArrayHandle handle_dst,
   API_END();
 }
 
+int MXNDArraySyncCheckFormat(NDArrayHandle handle, const bool full_check) {
+  API_BEGIN();
+  NDArray *arr = static_cast<NDArray*>(handle);
+  arr->SyncCheckFormat(full_check);
+  API_END();
+}
+
 int MXNDArrayWaitToRead(NDArrayHandle handle) {
   API_BEGIN();
   static_cast<NDArray*>(handle)->WaitToRead();
@@ -1228,3 +1243,31 @@ int MXRtcCudaKernelCall(CudaKernelHandle handle, int dev_id, void** args,
 #endif
   API_END();
 }
+
+
+int MXNDArrayGetSharedMemHandle(NDArrayHandle handle, int* shared_pid, int* shared_id) {
+  API_BEGIN();
+  NDArray* arr = reinterpret_cast<NDArray*>(handle);
+  Storage::Handle shandle;
+  if (arr->ctx().dev_type == Context::kCPUShared) {
+    arr->WaitToRead();
+    shandle = arr->storage_handle();
+    Storage::Get()->SharedIncrementRefCount(shandle);
+  } else {
+    NDArray new_arr(arr->shape(), Context::CPUShared(0), false, arr->dtype());
+    CopyFromTo(*arr, new_arr);
+    new_arr.WaitToRead();
+    shandle = new_arr.storage_handle();
+    Storage::Get()->SharedIncrementRefCount(shandle);
+  }
+  *shared_pid = shandle.shared_pid;
+  *shared_id = shandle.shared_id;
+  API_END();
+}
+
+int MXNDArrayCreateFromSharedMem(int shared_pid, int shared_id, const mx_uint *shape,
+                                 mx_uint ndim, int dtype, NDArrayHandle *out) {
+  API_BEGIN();
+  *out = new NDArray(shared_pid, shared_id, TShape(shape, shape + ndim), dtype);
+  API_END();
+}
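The two new entry points export an NDArray's CPU shared-memory segment as a `(shared_pid, shared_id)` pair and rebuild an NDArray from such a pair, which is the plumbing for zero-copy hand-off between processes. A hedged ctypes sketch against the signatures above, assuming a Linux build where the CPUShared context is available; nothing here is a documented Python API:

    import ctypes
    import mxnet as mx
    from mxnet.base import _LIB, check_call, mx_uint, NDArrayHandle

    arr = mx.nd.ones((2, 3))
    pid, sid = ctypes.c_int(), ctypes.c_int()
    check_call(_LIB.MXNDArrayGetSharedMemHandle(arr.handle,
                                                ctypes.byref(pid), ctypes.byref(sid)))

    out = NDArrayHandle()
    shape = (mx_uint * 2)(2, 3)
    check_call(_LIB.MXNDArrayCreateFromSharedMem(pid, sid, shape, mx_uint(2),
                                                 ctypes.c_int(0),   # 0 == float32 dtype id
                                                 ctypes.byref(out)))

Note that `MXNDArrayGetSharedMemHandle` first copies the array into shared memory when it does not already live there.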
diff --git a/src/c_api/c_api_common.h b/src/c_api/c_api_common.h
index cc47436465..7f866045a7 100644
--- a/src/c_api/c_api_common.h
+++ b/src/c_api/c_api_common.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file c_api_error.h
  * \brief Error handling for C API.
  */
diff --git a/src/c_api/c_api_error.cc b/src/c_api/c_api_error.cc
index 4d93b908fb..6dd4719281 100644
--- a/src/c_api/c_api_error.cc
+++ b/src/c_api/c_api_error.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file c_api_error.cc
  * \brief C error handling
  */
diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc
index 8be3965ef0..40df49144f 100644
--- a/src/c_api/c_api_executor.cc
+++ b/src/c_api/c_api_executor.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file c_api_executor.cc
  * \brief C API of mxnet
  */
diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc
index 0474e26e6c..dad71b0816 100644
--- a/src/c_api/c_api_symbolic.cc
+++ b/src/c_api/c_api_symbolic.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file c_api_symbolic.cc
  * \brief C API of mxnet
  */
diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc
index 9ed84982ba..3a693dbfcb 100644
--- a/src/c_api/c_predict_api.cc
+++ b/src/c_api/c_predict_api.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file c_predict_api.cc
  * \brief C predict API of mxnet
  */
diff --git a/src/common/cuda_utils.h b/src/common/cuda_utils.h
index c135ff8a1d..a1c37a9478 100644
--- a/src/common/cuda_utils.h
+++ b/src/common/cuda_utils.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file cuda_utils.h
  * \brief CUDA debugging utilities.
  */
diff --git a/src/common/lazy_alloc_array.h b/src/common/lazy_alloc_array.h
index aa2cd4a139..0fd5acd63d 100644
--- a/src/common/lazy_alloc_array.h
+++ b/src/common/lazy_alloc_array.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file lazy_alloc_array.h
  * \brief An array that lazily allocate elements as
  *   First time the cell get visited.
@@ -56,8 +57,6 @@ class LazyAllocArray {
   /*! \brief clear all the allocated elements in array */
   inline void Clear();
 
-  void SignalForKill();
-
  private:
   template<typename SyncObject>
   class unique_unlock {
@@ -86,12 +85,12 @@ class LazyAllocArray {
   /*! \brief overflow array of more elements */
   std::vector<std::shared_ptr<TElem> > more_;
   /*! \brief Signal shutdown of array */
-  std::atomic<bool> exit_now_;
+  std::atomic<bool> is_clearing_;
 };
 
 template<typename TElem>
 inline LazyAllocArray<TElem>::LazyAllocArray()
-  : exit_now_(false) {
+  : is_clearing_(false) {
 }
 
 // implementations
@@ -106,7 +105,7 @@ inline std::shared_ptr<TElem> LazyAllocArray<TElem>::Get(int index, FCreate crea
       return ptr;
     } else {
       std::lock_guard<std::mutex> lock(create_mutex_);
-      if (!exit_now_.load()) {
+      if (!is_clearing_.load()) {
         std::shared_ptr<TElem> ptr = head_[idx];
         if (ptr) {
           return ptr;
@@ -117,7 +116,7 @@ inline std::shared_ptr<TElem> LazyAllocArray<TElem>::Get(int index, FCreate crea
     }
   } else {
     std::lock_guard<std::mutex> lock(create_mutex_);
-    if (!exit_now_.load()) {
+    if (!is_clearing_.load()) {
       idx -= kInitSize;
       if (more_.size() <= idx) {
         more_.reserve(idx + 1);
@@ -139,7 +138,7 @@ inline std::shared_ptr<TElem> LazyAllocArray<TElem>::Get(int index, FCreate crea
 template<typename TElem>
 inline void LazyAllocArray<TElem>::Clear() {
   std::unique_lock<std::mutex> lock(create_mutex_);
-  exit_now_.store(true);
+  is_clearing_.store(true);
   // Currently, head_ and more_ never get smaller, so it's safe to
   // iterate them outside of the lock.  The loops should catch
   // any growth which might happen when create_mutex_ is unlocked
@@ -155,6 +154,8 @@ inline void LazyAllocArray<TElem>::Clear() {
     unique_unlock<std::mutex> unlocker(&lock);
     p = std::shared_ptr<TElem>(nullptr);
   }
+  more_.clear();
+  is_clearing_.store(false);
 }
 
 template<typename TElem>
@@ -173,12 +174,6 @@ inline void LazyAllocArray<TElem>::ForEach(FVisit fvisit) {
   }
 }
 
-template<typename TElem>
-inline void LazyAllocArray<TElem>::SignalForKill() {
-  std::lock_guard<std::mutex> lock(create_mutex_);
-  exit_now_.store(true);
-}
-
 }  // namespace common
 }  // namespace mxnet
 #endif  // MXNET_COMMON_LAZY_ALLOC_ARRAY_H_
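The rename from `exit_now_` to `is_clearing_` tracks a real behaviour change: `Clear()` now empties `more_` and resets the flag when it finishes, so the array can be repopulated afterwards instead of staying dead for the life of the process (the one-way `SignalForKill()` is gone). A rough Python analogue of the pattern, not a drop-in translation of the C++:

    import threading

    class LazyAllocArray(object):
        def __init__(self):
            self._lock = threading.Lock()
            self._cells = {}
            self._is_clearing = False

        def get(self, index, create):
            with self._lock:
                if self._is_clearing:
                    return None            # refuse allocation while a clear is in flight
                if index not in self._cells:
                    self._cells[index] = create()
                return self._cells[index]

        def clear(self):
            with self._lock:
                self._is_clearing = True
                self._cells.clear()
                self._is_clearing = False  # usable again once the clear completes
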
diff --git a/src/common/object_pool.h b/src/common/object_pool.h
index 6e11ce5ca7..576ff9aea1 100644
--- a/src/common/object_pool.h
+++ b/src/common/object_pool.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  */
 #ifndef MXNET_COMMON_OBJECT_POOL_H_
 #define MXNET_COMMON_OBJECT_POOL_H_
diff --git a/src/common/rtc.cc b/src/common/rtc.cc
index cd26f0e05a..cc51aaa108 100644
--- a/src/common/rtc.cc
+++ b/src/common/rtc.cc
@@ -124,7 +124,7 @@ void CudaModule::Kernel::Launch(
     uint32_t grid_dim_x, uint32_t grid_dim_y, uint32_t grid_dim_z,
     uint32_t block_dim_x, uint32_t block_dim_y, uint32_t block_dim_z,
     uint32_t shared_mem) {
-  CHECK_EQ(ctx.dev_mask(), gpu::kDevMask)
+  CHECK_EQ(ctx.dev_mask(), Context::kGPU)
       << "CUDA Runtime compilation only supports Nvidia GPU.";
 
   auto mod = mod_;
diff --git a/src/common/utils.cc b/src/common/utils.cc
index 125e4e5dc7..784fcf8651 100644
--- a/src/common/utils.cc
+++ b/src/common/utils.cc
@@ -28,6 +28,12 @@
 namespace mxnet {
 namespace common {
 
+template<>
+void CheckFormatWrapper<cpu>(const RunContext &rctx, const NDArray &input,
+                             const TBlob &err_cpu, const bool full_check) {
+  CheckFormatImpl<cpu>(rctx, input, err_cpu, full_check);
+}
+
 template<>
 void CastStorageDispatch<cpu>(const OpContext& ctx,
                               const NDArray& input,
diff --git a/src/common/utils.cu b/src/common/utils.cu
index 093480a989..c6e2bf8138 100644
--- a/src/common/utils.cu
+++ b/src/common/utils.cu
@@ -28,6 +28,12 @@
 namespace mxnet {
 namespace common {
 
+template<>
+void CheckFormatWrapper<gpu>(const RunContext &rctx, const NDArray &input,
+                             const TBlob &err_cpu, const bool full_check) {
+  CheckFormatImpl<gpu>(rctx, input, err_cpu, full_check);
+}
+
 template<>
 void CastStorageDispatch<gpu>(const OpContext& ctx,
                               const NDArray& input,
diff --git a/src/common/utils.h b/src/common/utils.h
index e0604de88a..038ab2a047 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file utils.h
  * \brief Basic utilility functions.
  */
@@ -43,9 +44,177 @@
 #include <algorithm>
 #include <functional>
 
+#include "../operator/mxnet_op.h"
+
 namespace mxnet {
 namespace common {
 
+
+/*!
+ * \brief IndPtr should be non-negative, in non-decreasing order, start with 0,
+ *           and end with a value equal to the size of indices.
+ */
+struct csr_indptr_check {
+  template<typename DType, typename IType>
+  MSHADOW_XINLINE static void Map(int i, DType* out, const IType* indptr,
+                                  const nnvm::dim_t end, const nnvm::dim_t idx_size) {
+    if (indptr[i+1] < 0 || indptr[i+1] < indptr[i] ||
+        (i == 0 && indptr[i] != 0) ||
+        (i == end - 1 && indptr[end] != idx_size))
+      *out = kCSRIndPtrErr;
+  }
+};
+
+/*!
+ *  \brief Indices should be non-negative, less than the number of columns
+ *           and in ascending order per row.
+ */
+struct csr_idx_check {
+  template<typename DType, typename IType, typename RType>
+  MSHADOW_XINLINE static void Map(int i, DType* out, const IType* idx,
+                                  const RType* indptr, const nnvm::dim_t ncols) {
+    for (RType j = indptr[i]; j < indptr[i+1]; j++) {
+      if (idx[j] >= ncols || idx[j] < 0 ||
+          (j < indptr[i+1] - 1 && idx[j] >= idx[j+1])) {
+        *out = kCSRIdxErr;
+        break;
+      }
+    }
+  }
+};
+
+/*!
+ *  \brief Indices of RSPNDArray should be non-negative,
+ *           less than the size of the first dimension, and in ascending order
+ */
+struct rsp_idx_check {
+  template<typename DType, typename IType>
+  MSHADOW_XINLINE static void Map(int i, DType* out, const IType* idx,
+                                  const nnvm::dim_t end, const nnvm::dim_t nrows) {
+    if ((i < end && idx[i+1] <= idx[i])
+        || idx[i] < 0 || idx[i] >= nrows)
+      *out = kRSPIdxErr;
+  }
+};
+
+template<typename xpu>
+void CheckFormatWrapper(const RunContext &rctx, const NDArray &input,
+                        const TBlob &err_cpu, const bool full_check);
+
+/*!
+ * \brief Check the validity of CSRNDArray.
+ * \param rctx Execution context.
+ * \param input Input NDArray of CSRStorage.
+ * \param err_cpu Error number on cpu.
+ * \param full_check If true, rigorous check, O(N) operations,
+ *          otherwise basic check, O(1) operations.
+ */
+template<typename xpu>
+void CheckFormatCSRImpl(const RunContext &rctx, const NDArray &input,
+                        const TBlob &err_cpu, const bool full_check) {
+  using namespace op::mxnet_op;
+  CHECK_EQ(input.storage_type(), kCSRStorage)
+          << "CheckFormatCSRImpl is for CSRNDArray";
+  const TShape shape = input.shape();
+  const TShape idx_shape = input.aux_shape(csr::kIdx);
+  const TShape indptr_shape = input.aux_shape(csr::kIndPtr);
+  const TShape storage_shape = input.storage_shape();
+  if ((shape.ndim() != 2) ||
+      (idx_shape.ndim() != 1 || indptr_shape.ndim() != 1 || storage_shape.ndim() != 1) ||
+      (indptr_shape[0] != shape[0] + 1) ||
+      (idx_shape[0] != storage_shape[0])) {
+     MSHADOW_TYPE_SWITCH(err_cpu.type_flag_, DType, {
+       DType* err = err_cpu.dptr<DType>();
+       *err = kCSRShapeErr;
+     });
+     return;
+  }
+  if (full_check) {
+    MSHADOW_TYPE_SWITCH(err_cpu.type_flag_, DType, {
+      MSHADOW_IDX_TYPE_SWITCH(input.aux_type(csr::kIndPtr), RType, {
+        MSHADOW_IDX_TYPE_SWITCH(input.aux_type(csr::kIdx), IType, {
+          mshadow::Stream<xpu> *s = rctx.get_stream<xpu>();
+          NDArray ret_xpu = NDArray(mshadow::Shape1(1),
+                                    rctx.get_ctx(), false, err_cpu.type_flag_);
+          TBlob val_xpu = ret_xpu.data();
+          Kernel<set_to_int<kNormalErr>, xpu>::Launch(s, val_xpu.Size(), val_xpu.dptr<DType>());
+          Kernel<csr_indptr_check, xpu>::Launch(s, indptr_shape[0] - 1, val_xpu.dptr<DType>(),
+            input.aux_data(csr::kIndPtr).dptr<RType>(),
+            indptr_shape[0] - 1, idx_shape[0]);
+          // no need to check indices if indices are empty
+          if (idx_shape[0] != 0) {
+            Kernel<csr_idx_check, xpu>::Launch(s, indptr_shape[0] - 1, val_xpu.dptr<DType>(),
+              input.aux_data(csr::kIdx).dptr<IType>(),
+              input.aux_data(csr::kIndPtr).dptr<RType>(), shape[1]);
+          }
+          mshadow::Copy(err_cpu.get<cpu, 1, DType>(),
+                        val_xpu.get<xpu, 1, DType>(s), s);
+        });
+      });
+    });
+  }
+}
+
+/*!
+ * \brief Check the validity of RowSparseNDArray.
+ * \param rctx Execution context.
+ * \param input Input NDArray of RowSparseStorage.
+ * \param err_cpu Error number on cpu.
+ * \param full_check If true, rigorous check, O(N) operations,
+ *          otherwise basic check, O(1) operations.
+ */
+template<typename xpu>
+void CheckFormatRSPImpl(const RunContext &rctx, const NDArray &input,
+                        const TBlob &err_cpu, const bool full_check) {
+  using namespace op::mxnet_op;
+  CHECK_EQ(input.storage_type(), kRowSparseStorage)
+          << "CheckFormatRSPImpl is for RSPNDArray";
+  const TShape idx_shape = input.aux_shape(rowsparse::kIdx);
+  if (idx_shape[0] != input.storage_shape()[0]) {
+    MSHADOW_TYPE_SWITCH(err_cpu.type_flag_, DType, {
+      DType* err = err_cpu.dptr<DType>();
+      *err = kRSPShapeErr;
+    });
+    return;
+  }
+  if (idx_shape[0] == 0) {
+    return;
+  }
+  if (full_check) {
+    MSHADOW_TYPE_SWITCH(err_cpu.type_flag_, DType, {
+      MSHADOW_IDX_TYPE_SWITCH(input.aux_type(rowsparse::kIdx), IType, {
+        mshadow::Stream<xpu> *s = rctx.get_stream<xpu>();
+        NDArray ret_xpu = NDArray(mshadow::Shape1(1),
+                                  rctx.get_ctx(), false, err_cpu.type_flag_);
+        TBlob val_xpu = ret_xpu.data();
+        Kernel<set_to_int<kNormalErr>, xpu>::Launch(s, val_xpu.Size(), val_xpu.dptr<DType>());
+
+        Kernel<rsp_idx_check, xpu>::Launch(s, idx_shape[0],
+          val_xpu.dptr<DType>(), input.aux_data(rowsparse::kIdx).dptr<IType>(),
+          idx_shape[0] - 1, input.shape()[0]);
+        mshadow::Copy(err_cpu.get<cpu, 1, DType>(),
+                      val_xpu.get<xpu, 1, DType>(s), s);
+      });
+    });
+  }
+}
+
+template<typename xpu>
+void CheckFormatImpl(const RunContext &rctx, const NDArray &input,
+                     const TBlob &err_cpu, const bool full_check) {
+  int stype = input.storage_type();
+  if (stype == kCSRStorage) {
+    CheckFormatCSRImpl<xpu>(rctx, input, err_cpu, full_check);
+  } else if (stype == kRowSparseStorage) {
+    CheckFormatRSPImpl<xpu>(rctx, input, err_cpu, full_check);
+  } else if (stype == kDefaultStorage) {
+    // no-op for default storage
+  } else {
+    LOG(FATAL) << "Unknown storage type " << stype;
+  }
+}
+
+
 template<typename xpu>
 void CastStorageDispatch(const OpContext& ctx, const NDArray& input, const NDArray& output);
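
A minimal standalone sketch of the CSR invariants the kernels above enforce on
device -- the O(1) shape test and the O(N) indptr/indices walk. CsrView and the
helper names are illustrative, not the NDArray API:

    #include <cstdint>
    #include <vector>

    // Hypothetical host-side view of a CSR matrix.
    struct CsrView {
      int64_t rows, cols;
      std::vector<int64_t> indptr;   // size rows + 1, non-decreasing, starts at 0
      std::vector<int64_t> indices;  // column ids, ascending within each row
      std::vector<float>   data;     // same length as indices
    };

    // O(1) structural check, mirroring the early-exit kCSRShapeErr branch.
    bool ShapeOk(const CsrView& m) {
      return static_cast<int64_t>(m.indptr.size()) == m.rows + 1 &&
             m.indices.size() == m.data.size();
    }

    // O(N) check, mirroring csr_indptr_check / csr_idx_check.
    bool FullCheck(const CsrView& m) {
      if (!ShapeOk(m) || m.indptr.front() != 0 ||
          m.indptr.back() != static_cast<int64_t>(m.indices.size())) return false;
      for (int64_t r = 0; r < m.rows; ++r) {
        if (m.indptr[r] > m.indptr[r + 1]) return false;  // non-decreasing
        for (int64_t j = m.indptr[r]; j < m.indptr[r + 1]; ++j) {
          if (m.indices[j] < 0 || m.indices[j] >= m.cols) return false;
          if (j > m.indptr[r] && m.indices[j] <= m.indices[j - 1]) return false;
        }
      }
      return true;
    }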
 
diff --git a/src/engine/engine.cc b/src/engine/engine.cc
index d6196085be..1c72f33d24 100644
--- a/src/engine/engine.cc
+++ b/src/engine/engine.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file engine.cc
  * \brief Implementation of engine.
  */
diff --git a/src/engine/engine_impl.h b/src/engine/engine_impl.h
index cf727366f6..b3ec34dc85 100644
--- a/src/engine/engine_impl.h
+++ b/src/engine/engine_impl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file engine_impl.h
  * \brief Internal implementation header of engine components.
  */
diff --git a/src/engine/naive_engine.cc b/src/engine/naive_engine.cc
index 4d63749f82..86f3877397 100644
--- a/src/engine/naive_engine.cc
+++ b/src/engine/naive_engine.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file naive_engine.cc
  * \brief Implementation of NaiveEngine
  */
diff --git a/src/engine/openmp.cc b/src/engine/openmp.cc
index ad0c5740ec..bd7f7fdea7 100644
--- a/src/engine/openmp.cc
+++ b/src/engine/openmp.cc
@@ -30,7 +30,7 @@ namespace engine {
 #endif
 
 static inline bool is_env_set(const char *var) {
-  return dmlc::GetEnv(var, INT_MIN) == INT_MIN;
+  return dmlc::GetEnv(var, INT_MIN) != INT_MIN;
 }
 
 OpenMP *OpenMP::Get() {
@@ -55,8 +55,6 @@ OpenMP::OpenMP()
       omp_thread_max_ = omp_get_max_threads();
     }
   }
-  omp_set_nested(dmlc::GetEnv("OMP_NESTED", false));
-  omp_set_dynamic(dmlc::GetEnv("OMP_DYNAMIC", false));
 #else
   enabled_ = false;
   omp_thread_max_ = 1;
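
The one-character fix above is easy to miss: with "==", is_env_set returned
true exactly when the variable was unset, because dmlc::GetEnv falls back to
the INT_MIN sentinel. A small standalone sketch of the corrected sentinel
idiom, using getenv as a stand-in for dmlc::GetEnv:

    #include <climits>
    #include <cstdlib>
    #include <string>

    // getenv-based stand-in for dmlc::GetEnv(var, default).
    static int GetEnvInt(const char* var, int dflt) {
      const char* v = std::getenv(var);
      return v ? std::stoi(v) : dflt;
    }

    static bool is_env_set(const char* var) {
      // Set iff the lookup did NOT fall back to the sentinel default.
      return GetEnvInt(var, INT_MIN) != INT_MIN;
    }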
diff --git a/src/engine/profiler.cc b/src/engine/profiler.cc
index 99504f61ce..21c476f64a 100644
--- a/src/engine/profiler.cc
+++ b/src/engine/profiler.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file profiler.cc
  * \brief implements profiler
  */
diff --git a/src/engine/profiler.h b/src/engine/profiler.h
index b7f8e0e1f0..dbbc773351 100644
--- a/src/engine/profiler.h
+++ b/src/engine/profiler.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file profiler.h
  * \brief implements profiler
  */
diff --git a/src/engine/stream_manager.h b/src/engine/stream_manager.h
index cd6db53f14..432bccf27d 100644
--- a/src/engine/stream_manager.h
+++ b/src/engine/stream_manager.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  */
 #ifndef MXNET_ENGINE_STREAM_MANAGER_H_
 #define MXNET_ENGINE_STREAM_MANAGER_H_
@@ -89,6 +90,8 @@ RunContext StreamManager<kNumGpus, kStreams>::GetRunContext(
 #else
       LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
 #endif  // MXNET_USE_CUDA
+    default:
+      LOG(FATAL) << "Not Reached";
     }
   }
   return ret;
@@ -116,6 +119,8 @@ RunContext StreamManager<kNumGpus, kStreams>::GetIORunContext(
 #else
       LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
 #endif  // MXNET_USE_CUDA
+    default:
+      LOG(FATAL) << "Not Reached";
     }
   }
   return ret;
diff --git a/src/engine/thread_pool.h b/src/engine/thread_pool.h
index a4c1e3321a..b4dae6bfd4 100644
--- a/src/engine/thread_pool.h
+++ b/src/engine/thread_pool.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  */
 #ifndef MXNET_ENGINE_THREAD_POOL_H_
 #define MXNET_ENGINE_THREAD_POOL_H_
diff --git a/src/engine/threaded_engine.cc b/src/engine/threaded_engine.cc
index bc5b81c568..b17d928637 100644
--- a/src/engine/threaded_engine.cc
+++ b/src/engine/threaded_engine.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file threaded_engine.cc
  * \brief implements base threaded engine.
  * \author Yutian Li
@@ -267,8 +268,9 @@ void ThreadedEngine::DeleteOperator(OprHandle op) {
   deps.insert(deps.end(),
               threaded_opr->mutable_vars.begin(),
               threaded_opr->mutable_vars.end());
-  this->PushSync([threaded_opr](RunContext) {
+  this->PushAsync([threaded_opr](RunContext, CallbackOnComplete on_complete) {
       ThreadedOpr::Delete(threaded_opr);
+      on_complete();
     }, Context::CPU(), {}, deps, FnProperty::kAsync, 0,
     PROFILER_MESSAGE("DeleteOperator"));
 }
@@ -304,6 +306,7 @@ void ThreadedEngine::PushAsync(AsyncFn fn, Context exec_ctx,
                                FnProperty prop,
                                int priority,
                                const char* opr_name) {
+  BulkFlush();
   ThreadedOpr *opr = NewOperator(std::move(fn), const_vars, mutable_vars, prop, opr_name);
   opr->temporary = true;
 #if MXNET_USE_PROFILER
@@ -316,20 +319,42 @@ void ThreadedEngine::PushAsync(AsyncFn fn, Context exec_ctx,
   Push(opr, exec_ctx, priority, profiling);
 }
 
+void ThreadedEngine::PushSync(SyncFn exec_fn, Context exec_ctx,
+                              std::vector<VarHandle> const& const_vars,
+                              std::vector<VarHandle> const& mutable_vars,
+                              FnProperty prop,
+                              int priority,
+                              const char* opr_name) {
+  BulkStatus& bulk_status = *BulkStatusStore::Get();
+  if (!bulk_status.bulk_size || prop != FnProperty::kNormal || priority) {
+    this->PushAsync([exec_fn](RunContext ctx, CallbackOnComplete on_complete) {
+        exec_fn(ctx);
+        on_complete();
+      }, exec_ctx, const_vars, mutable_vars, prop, priority, opr_name);
+    return;
+  }
+
+  if (bulk_status.count && exec_ctx != bulk_status.ctx) BulkFlush();
+  BulkAppend(exec_fn, exec_ctx, const_vars, mutable_vars);
+  return;
+}
+
 void ThreadedEngine::DeleteVariable(SyncFn delete_fn,
                                     Context exec_ctx,
                                     VarHandle var) {
   ThreadedVar* threaded_var = ThreadedVar::CastFromBase(var);
-  this->PushSync([delete_fn, threaded_var](RunContext ctx) {
+  this->PushAsync([delete_fn, threaded_var](RunContext ctx, CallbackOnComplete on_complete) {
       // Mark variable as orphan,
       // so during `ThreadedEngine::OnComplete` it could be recycled.
       threaded_var->SetToDelete();
       delete_fn(ctx);
+      on_complete();
     }, exec_ctx, {}, {var}, FnProperty::kDeleteVar, 0,
     PROFILER_MESSAGE("DeleteVariable"));
 }
 
 void ThreadedEngine::WaitForVar(VarHandle var) {
+  BulkFlush();
   ThreadedVar* threaded_var = ThreadedVar::CastFromBase(var);
   if (threaded_var->ready_to_read()) return;
   if (engine_info_) {
@@ -337,7 +362,7 @@ void ThreadedEngine::WaitForVar(VarHandle var) {
     debug_wait_var_ = threaded_var;
   }
   std::atomic<bool> done{false};
-  this->PushSync([this, &done](RunContext) {
+  this->PushAsync([this, &done](RunContext, CallbackOnComplete on_complete) {
       if (engine_info_) {
         LOG(INFO) << "Sync is executed";
       }
@@ -349,6 +374,7 @@ void ThreadedEngine::WaitForVar(VarHandle var) {
       if (engine_info_) {
         LOG(INFO) << "Sync is notified";
       }
+      on_complete();
     }, Context::CPU(), {var}, {}, FnProperty::kNormal, 0,
     PROFILER_MESSAGE("WaitForVar"));
   {
@@ -360,6 +386,7 @@ void ThreadedEngine::WaitForVar(VarHandle var) {
 }
 
 void ThreadedEngine::WaitForAll() {
+  BulkFlush();
   std::unique_lock<std::mutex> lock{finished_m_};
   finished_cv_.wait(lock, [this]() {
       return pending_.load() == 0 || kill_.load();
diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h
index e000a22c22..d85321c52c 100644
--- a/src/engine/threaded_engine.h
+++ b/src/engine/threaded_engine.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file threaded_engine.h
  * \brief Implements base class of threaded engine
  *    that tracks the dependency and pushes actions to execute.
@@ -33,6 +34,7 @@
 #include <functional>
 #include <condition_variable>
 #include <atomic>
+#include <utility>
 #include <mutex>
 #include <string>
 #include <thread>
@@ -272,6 +274,12 @@ class ThreadedEngine : public Engine {
                  FnProperty prop = FnProperty::kNormal,
                  int priority = 0,
                  const char* opr_name = nullptr) override;
+  void PushSync(SyncFn exec_fn, Context exec_ctx,
+                std::vector<VarHandle> const& const_vars,
+                std::vector<VarHandle> const& mutable_vars,
+                FnProperty prop = FnProperty::kNormal,
+                int priority = 0,
+                const char* opr_name = nullptr) override;
   void DeleteVariable(SyncFn delete_fn, Context exec_ctx, VarHandle var) override;
   void WaitForVar(VarHandle var) override;
   void WaitForAll() override;
@@ -364,7 +372,35 @@ class ThreadedEngine : public Engine {
     }
   }
 
+  int bulk_size() const override {
+    return BulkStatusStore::Get()->bulk_size;
+  }
+
+  int set_bulk_size(int bulk_size) override {
+    BulkStatus& bulk_status = *BulkStatusStore::Get();
+    std::swap(bulk_status.bulk_size, bulk_size);
+    if (bulk_status.count >= bulk_status.bulk_size) BulkFlush();
+    return bulk_size;
+  }
+
  private:
+  /*! \brief structure for holding bulk execution status */
+  struct BulkStatus {
+    /*! \brief maximum number of ops per bulk */
+    int bulk_size = 0;
+    /*! \brief current number of ops in bulk */
+    int count = 0;
+    /*! \brief context of current ops */
+    Context ctx;
+    /*! \brief current op functions */
+    SyncFn fn;
+    /*! \brief constant variables */
+    std::vector<VarHandle> const_vars;
+    /*! \brief mutable variables */
+    std::vector<VarHandle> mutable_vars;
+  };
+  /*! thread local store for bulk */
+  typedef dmlc::ThreadLocalStore<BulkStatus> BulkStatusStore;
   /*!
    * \brief check if there is duplication in const_vars and mutable_vars.
    * \param const_vars the variables to read from.
@@ -380,6 +416,46 @@ class ThreadedEngine : public Engine {
   inline void OnComplete(ThreadedOpr* threaded_opr);
   // callback to the threaded engine
   static void OnCompleteStatic(Engine *engine, void *threaded_opr);
+  /*! \brief append an operator to bulk */
+  inline void BulkAppend(SyncFn exec_fn, Context exec_ctx,
+                         std::vector<VarHandle> const& const_vars,
+                         std::vector<VarHandle> const& mutable_vars) {
+    BulkStatus& bulk_status = *BulkStatusStore::Get();
+    if (!bulk_status.count) {
+      bulk_status.ctx = exec_ctx;
+      bulk_status.fn = std::move(exec_fn);
+    } else {
+      auto prev_fn = std::move(bulk_status.fn);
+      bulk_status.fn = [exec_fn, prev_fn](RunContext rctx) {
+          prev_fn(rctx);
+          exec_fn(rctx);
+        };
+    }
+
+    ++bulk_status.count;
+    bulk_status.const_vars.insert(
+        bulk_status.const_vars.end(), const_vars.begin(), const_vars.end());
+    bulk_status.mutable_vars.insert(
+        bulk_status.mutable_vars.end(), mutable_vars.begin(), mutable_vars.end());
+
+    if (bulk_status.count >= bulk_status.bulk_size) BulkFlush();
+  }
+  /*! \brief flush current bulk to execution */
+  inline void BulkFlush() {
+    BulkStatus& bulk_status = *BulkStatusStore::Get();
+    if (!bulk_status.count) return;
+    bulk_status.count = 0;
+    DeduplicateVarHandle(&bulk_status.const_vars, &bulk_status.mutable_vars);
+    auto fn = std::move(bulk_status.fn);
+    this->PushAsync([fn](RunContext ctx, CallbackOnComplete on_complete) {
+        fn(ctx);
+        on_complete();
+      }, bulk_status.ctx, bulk_status.const_vars, bulk_status.mutable_vars,
+      FnProperty::kNormal, 0, "ImperativeBulk");
+
+    bulk_status.const_vars.clear();
+    bulk_status.mutable_vars.clear();
+  }
   /*!
    * \brief Number of pending operations.
    */
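
BulkAppend above batches ops by closure composition: each appended function
captures the previous chain and runs it first, and BulkFlush pushes the whole
chain as one engine op with deduplicated variables. A toy version of that
accumulation pattern in isolation (names illustrative):

    #include <functional>
    #include <iostream>

    using Fn = std::function<void()>;

    struct Bulk {
      Fn chain;
      int count = 0;
      void Append(Fn fn) {
        // Compose: new chain = old chain, then fn (append order preserved).
        chain = count++ ? Fn([prev = chain, fn] { prev(); fn(); }) : fn;
      }
      void Flush() {
        if (!count) return;
        count = 0;
        Fn fn = std::move(chain);
        chain = nullptr;
        fn();  // executes all appended ops as a single unit
      }
    };

    int main() {
      Bulk b;
      b.Append([] { std::cout << "op1\n"; });
      b.Append([] { std::cout << "op2\n"; });
      b.Flush();  // prints op1 then op2
    }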
diff --git a/src/engine/threaded_engine_perdevice.cc b/src/engine/threaded_engine_perdevice.cc
index e01dd4ed45..c01de75384 100644
--- a/src/engine/threaded_engine_perdevice.cc
+++ b/src/engine/threaded_engine_perdevice.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file threaded_engine_perdevice.cc
  * \brief ThreadedEngine that uses a fixed number of threads for each device.
  */
@@ -50,6 +51,38 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
   static auto constexpr kWorkerQueue = kFIFO;
 
   ThreadedEnginePerDevice() noexcept(false) {
+    this->Start();
+#ifndef _WIN32
+    pthread_atfork(
+      []() {
+        Engine::Get()->WaitForAll();
+        Engine::Get()->Stop();
+      },
+      []() {
+        Engine::Get()->Start();
+      },
+      []() {
+        // Make children single-threaded since they are typically workers
+        dmlc::SetEnv("MXNET_CPU_WORKER_NTHREADS", 1);
+        dmlc::SetEnv("OMP_NUM_THREADS", 1);
+        OpenMP::Get()->set_enabled(false);
+        Engine::Get()->Start();
+      });
+#endif
+  }
+  ~ThreadedEnginePerDevice() noexcept(false) {
+    this->Stop();
+  }
+
+  void Stop() override {
+    SignalQueuesForKill();
+    gpu_normal_workers_.Clear();
+    gpu_copy_workers_.Clear();
+    cpu_normal_workers_.Clear();
+    cpu_priority_worker_.reset(nullptr);
+  }
+
+  void Start() override {
     gpu_worker_nthreads_ = common::GetNumThreadPerGPU();
     cpu_worker_nthreads_ = dmlc::GetEnv("MXNET_CPU_WORKER_NTHREADS", 1);
     // create CPU task
@@ -61,27 +94,20 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
         }));
     // GPU tasks will be created lazily
   }
-  ~ThreadedEnginePerDevice() noexcept(false) {
-    SignalQueuesForKill();
-    gpu_normal_workers_.Clear();
-    gpu_copy_workers_.Clear();
-    cpu_normal_workers_.Clear();
-    cpu_priority_worker_.reset(nullptr);
-  }
 
  protected:
   void PushToExecute(OprBlock *opr_block, bool pusher_thread) override {
     const Context& ctx = opr_block->ctx;
     if ((opr_block->opr->prop == FnProperty::kAsync ||
          opr_block->opr->prop == FnProperty::kDeleteVar) && pusher_thread) {
-      if (ctx.dev_mask() == gpu::kDevMask) {
+      if (ctx.dev_mask() == Context::kGPU) {
         #if MXNET_USE_CUDA
         MSHADOW_CATCH_ERROR(mshadow::SetDevice<gpu>(ctx.dev_id));
         #endif
       }
       this->ExecuteOprBlock(RunContext{ctx, nullptr}, opr_block);
     } else {
-      if (ctx.dev_mask() == cpu::kDevMask) {
+      if (ctx.dev_mask() == Context::kCPU) {
         if (opr_block->opr->prop == FnProperty::kCPUPrioritized) {
           cpu_priority_worker_->task_queue.Push(opr_block, opr_block->priority);
         } else {
@@ -104,7 +130,7 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
           }
         }
       } else {
-        CHECK_EQ(ctx.dev_mask(), gpu::kDevMask);
+        CHECK_EQ(ctx.dev_mask(), Context::kGPU);
         // GPU execution.
         FnProperty prop = opr_block->opr->prop;
         bool is_copy = (prop == FnProperty::kCopyFromGPU ||
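
The constructor change above wires the engine into pthread_atfork: quiesce and
stop worker threads before a fork, restart them in the parent, and restart
single-threaded in the child (worker threads do not survive fork). A reduced
sketch of that prepare/parent/child protocol, assuming a hypothetical Pool with
Start/Stop:

    #include <pthread.h>
    #include <unistd.h>
    #include <cstdio>

    struct Pool {  // hypothetical worker pool
      void Start() { std::printf("pool started (pid %d)\n", (int)getpid()); }
      void Stop()  { std::printf("pool stopped\n"); }
    };
    static Pool pool;

    int main() {
      pool.Start();
      pthread_atfork(
          [] { pool.Stop(); },    // prepare: drain and stop before fork
          [] { pool.Start(); },   // parent: resume as before
          [] { pool.Start(); });  // child: fresh threads, old ones are gone
      if (fork() == 0) _exit(0);  // child restarted its own pool above
      return 0;
    }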
diff --git a/src/engine/threaded_engine_pooled.cc b/src/engine/threaded_engine_pooled.cc
index 6db7c4bb7a..074ea4e847 100644
--- a/src/engine/threaded_engine_pooled.cc
+++ b/src/engine/threaded_engine_pooled.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file threaded_engine_pooled.cc
  * \brief Pooled threaded engine
  * \author Yutian Li
diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc
index f595b44684..1bcc40a894 100644
--- a/src/executor/attach_op_execs_pass.cc
+++ b/src/executor/attach_op_execs_pass.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file attach_op_execs_pass.cc
  * \brief Operator executor to execute each operator.
  */
diff --git a/src/executor/attach_op_resource_pass.cc b/src/executor/attach_op_resource_pass.cc
index 544e050721..18feec7957 100644
--- a/src/executor/attach_op_resource_pass.cc
+++ b/src/executor/attach_op_resource_pass.cc
@@ -19,6 +19,7 @@
 
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file attach_op_resource_pass.cc
  * \brief Pass to attach resource to OpExecVector of the graph.
  */
diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h
index e6de9f324c..bf4b14771d 100644
--- a/src/executor/exec_pass.h
+++ b/src/executor/exec_pass.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file exec_pass.h
  * \brief All the execution related pass and data structures.
  */
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index dd4867559d..01484dac29 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file graph_executor.cc
  * \brief graph executor
  */
@@ -320,6 +321,7 @@ Graph AssignContext(Graph g,
                     const std::vector<Context>& in_arg_ctxes,
                     const std::vector<Context>& arg_grad_ctxes,
                     const std::vector<Context>& aux_state_ctxes,
+                    const std::vector<OpReqType>& grad_req_types,
                     size_t num_forward_inputs,
                     size_t num_forward_outputs) {
   const auto& idx = g.indexed_graph();
@@ -384,9 +386,15 @@ Graph AssignContext(Graph g,
 
   // loop through backward input nodes and populate maps and lists
   // the backward input nodes is the gradient of the loss wrt the output
-  for (size_t i = num_forward_outputs; i < g.outputs.size(); ++i) {
+  size_t arg_grad_offset = 0;
+  // keep an offset into the arg_grad_ctxes vector,
+  // since g.outputs excludes arg_grads whose req is kNullOp
+  CHECK_GE(grad_req_types.size(), g.outputs.size() - num_forward_outputs)
+           << "insufficient number of grad_reqs";
+  for (size_t i = num_forward_outputs; i < g.outputs.size(); ++i, ++arg_grad_offset) {
+    while (grad_req_types[arg_grad_offset] == kNullOp) ++arg_grad_offset;
     const uint32_t nid = idx.outputs()[i].node_id;
-    Context ctx = arg_grad_ctxes[i - num_forward_outputs];
+    Context ctx = arg_grad_ctxes[arg_grad_offset];
     if (ctx2id.count(ctx) == 0) {
       ctx2id[ctx] = static_cast<int>(ctx_list.size());
       ctx_list.push_back(ctx);
@@ -416,9 +424,11 @@ Graph AssignContext(Graph g,
   // if the assigned device of gradient node
   // corresponds to storage of grads
   auto &new_idx = g.indexed_graph();
-  for (size_t i = num_forward_outputs; i < g.outputs.size(); ++i) {
+  arg_grad_offset = 0;
+  for (size_t i = num_forward_outputs; i < g.outputs.size(); ++i, ++arg_grad_offset) {
+    while (grad_req_types[arg_grad_offset] == kNullOp) ++arg_grad_offset;
     const uint32_t nid = new_idx.outputs()[i].node_id;
-    Context ctx = arg_grad_ctxes[i - num_forward_outputs];
+    Context ctx = arg_grad_ctxes[arg_grad_offset];
     CHECK(ctx == vcontext[nid])
       << "Trying to save gradient to " << ctx
       << " while its source node \"" << new_idx[nid].source->attrs.name
@@ -1054,6 +1064,7 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol,
                     in_arg_ctxes,
                     arg_grad_ctxes,
                     aux_state_ctxes,
+                    grad_req_types,
                     num_forward_inputs_,
                     num_forward_outputs_);
 
@@ -1297,8 +1308,10 @@ void GraphExecutor::InitCachedOps() {
     std::copy(mutate_vars.begin(), mutate_vars.end(),
               std::inserter(all_vars, all_vars.end()));
     // setup exec vars
-    Engine::Get()->PushSync([exec](RunContext rctx) {
+    Engine::Get()->PushAsync(
+      [exec](RunContext rctx, Engine::CallbackOnComplete on_complete) {
         exec->Setup();
+        on_complete();
       }, Context::CPU(), {}, all_vars, FnProperty::kNormal, 0,
       PROFILER_MESSAGE("SetupExec"));
     auto exec_fun = [exec, is_async, is_gpu] (
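
The offset bookkeeping above exists because g.outputs omits gradients whose req
is kNullOp, so backward output index i and arg_grad index stop lining up. The
skip-null alignment in isolation (illustrative types):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    enum OpReqType { kNullOp, kWriteTo };

    // Map each backward graph output to the arg_grad slot it writes,
    // skipping slots whose gradient was not requested.
    std::vector<size_t> AlignGradOutputs(const std::vector<OpReqType>& reqs,
                                         size_t num_grad_outputs) {
      std::vector<size_t> slot;
      size_t offset = 0;
      for (size_t i = 0; i < num_grad_outputs; ++i, ++offset) {
        while (reqs[offset] == kNullOp) ++offset;  // null req => no output
        slot.push_back(offset);
      }
      return slot;
    }

    int main() {
      // Four args, the second not requested => three backward outputs.
      std::vector<OpReqType> reqs = {kWriteTo, kNullOp, kWriteTo, kWriteTo};
      auto m = AlignGradOutputs(reqs, 3);
      assert(m[0] == 0 && m[1] == 2 && m[2] == 3);
    }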
diff --git a/src/executor/graph_executor.h b/src/executor/graph_executor.h
index 8b2540442d..0e5ef32989 100644
--- a/src/executor/graph_executor.h
+++ b/src/executor/graph_executor.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file graph_executor.h
  * \brief Executor to execute the computation graph.
  */
diff --git a/src/executor/inplace_addto_detect_pass.cc b/src/executor/inplace_addto_detect_pass.cc
index 9359d88635..4af2dcd663 100644
--- a/src/executor/inplace_addto_detect_pass.cc
+++ b/src/executor/inplace_addto_detect_pass.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file inplace_addto_detect_pass.cc
  * \brief Detect whether inplace addto operation is possible for certain op.
  */
diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc
index 60d66db485..ec0b9c2530 100644
--- a/src/imperative/cached_op.cc
+++ b/src/imperative/cached_op.cc
@@ -136,7 +136,7 @@ std::vector<nnvm::NodeEntry> Imperative::CachedOp::Gradient(
     const std::vector<nnvm::NodeEntry>& ograds) {
   using namespace nnvm;
   static const auto _backward_CachedOp = Op::Get("_backward_CachedOp");
-  static const auto _CachedOp_NoGrad = Op::Get("_CachedOp_NoGrad");
+  static const auto _NoGrad = Op::Get("_NoGradient");
 
   auto p = Node::Create();
   p->attrs.op = _backward_CachedOp;
@@ -152,13 +152,12 @@ std::vector<nnvm::NodeEntry> Imperative::CachedOp::Gradient(
   const auto& auxs = mutable_input_nodes();
   if (auxs.size()) {
     auto nop = Node::Create();
-    nop->attrs.op = _CachedOp_NoGrad;
-    nop->attrs.parsed = static_cast<uint32_t>(auxs.size());
-    nop->control_deps.push_back(node);
+    nop->attrs.op = _NoGrad;
+    nop->attrs.name = "NoGradient";
     uint32_t j = 0, k = 0;
     for (const auto& i : fwd_graph_.indexed_graph().input_nodes()) {
       if (auxs.count(i)) {
-        ret.emplace_back(NodeEntry{nop, j++, 0});
+        ret.emplace_back(NodeEntry{nop, 0, 0});
       } else {
         ret.emplace_back(NodeEntry{p, k++, 0});
       }
@@ -381,6 +380,7 @@ OpStatePtr Imperative::CachedOp::Forward(const std::vector<NDArray*>& inputs,
                  mem_plan, arrays, &array_reqs);
 
   const auto& dispatch_modes = g.GetAttr<DispatchModeVector>("dispatch_mode");
+
   Imperative::Get()->RunGraph(
       false, idx, arrays, 0, idx.num_nodes(), std::move(array_reqs),
       std::move(ref_count), &states, dispatch_modes);
@@ -451,6 +451,7 @@ void Imperative::CachedOp::Backward(
                  mem_plan, arrays, &array_reqs);
 
   const auto& dispatch_modes = g.GetAttr<DispatchModeVector>("dispatch_mode");
+
   Imperative::Get()->RunGraph(
       retain_graph, idx, arrays, num_forward_nodes, idx.num_nodes(),
       std::move(array_reqs), std::move(ref_count), &states, dispatch_modes);
@@ -491,11 +492,4 @@ NNVM_REGISTER_OP(_backward_CachedOp)
 .set_attr<bool>("TIsLayerOpBackward", true)
 .set_attr<bool>("TIsBackward", true);
 
-NNVM_REGISTER_OP(_CachedOp_NoGrad)
-.set_num_inputs(0)
-.set_num_outputs([](const NodeAttrs& attrs) {
-    const uint32_t& nout = nnvm::get<uint32_t>(attrs.parsed);
-    return nout;
-  });
-
 }  // namespace mxnet
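
The rewrite above retires the bespoke _CachedOp_NoGrad op (which needed one
output per aux input) in favor of the shared _NoGradient op: every aux input
now points at the same single-output sentinel entry. A toy sketch of that
mapping with plain structs rather than nnvm:

    #include <cstddef>
    #include <cstdint>
    #include <memory>
    #include <set>
    #include <vector>

    struct Node {};  // stand-in for nnvm::Node
    struct NodeEntry { std::shared_ptr<Node> node; uint32_t index; };

    std::vector<NodeEntry> MapGrads(const std::shared_ptr<Node>& backward,
                                    size_t num_inputs,
                                    const std::set<size_t>& aux_inputs) {
      auto nograd = std::make_shared<Node>();  // shared sentinel, one output
      std::vector<NodeEntry> ret;
      uint32_t k = 0;
      for (size_t i = 0; i < num_inputs; ++i) {
        if (aux_inputs.count(i))
          ret.push_back({nograd, 0});      // all aux grads alias entry 0
        else
          ret.push_back({backward, k++});  // real gradients, in order
      }
      return ret;
    }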
diff --git a/src/imperative/imperative.cc b/src/imperative/imperative.cc
index fc35c492f7..361b971a2d 100644
--- a/src/imperative/imperative.cc
+++ b/src/imperative/imperative.cc
@@ -288,6 +288,8 @@ void Imperative::RunGraph(
   DTypeVector arg_dtypes;
   std::vector<OpReqType> req;
 
+  int prev_bulk_size = Engine::Get()->set_bulk_size(10);
+
   for (size_t i = node_start; i < node_end; ++i) {
     const nnvm::IndexedGraph::Node& node = idx[i];
     if (node.source->op() == nullptr) continue;
@@ -351,6 +353,8 @@ void Imperative::RunGraph(
       if (ref_count[eid] == 0) arrays[eid]->ptr_.reset();
     }
   }
+
+  Engine::Get()->set_bulk_size(prev_bulk_size);
 }
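
RunGraph above raises the engine bulk size to 10 for the duration of the graph
and restores the previous value on exit; set_bulk_size returns the old value
precisely to support this save/restore idiom. A hedged sketch of the same idiom
wrapped in an RAII guard (illustrative, not MXNet code):

    // Guard over any set_bulk_size-style setter that returns the prior value.
    template <typename Engine>
    class BulkSizeGuard {
     public:
      BulkSizeGuard(Engine* eng, int size)
          : eng_(eng), prev_(eng->set_bulk_size(size)) {}
      ~BulkSizeGuard() { eng_->set_bulk_size(prev_); }
     private:
      Engine* eng_;
      int prev_;
    };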
 
 
diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h
index dbae9c4f4d..ecc40314e9 100644
--- a/src/imperative/imperative_utils.h
+++ b/src/imperative/imperative_utils.h
@@ -66,9 +66,9 @@ inline Context GetContext(const nnvm::NodeAttrs& attrs,
   } else {
     ctx = default_ctx;
   }
-  // Pinned context doesn't propagate
-  if (ctx.dev_type == Context::kCPUPinned) {
-    ctx = Context::CPU();
+  // Non-default contexts (pinned, shared) do not propagate
+  if (ctx.dev_mask() != ctx.dev_type) {
+    ctx = Context::Create(ctx.dev_mask(), ctx.dev_id);
   }
 #if !MXNET_USE_CUDA
   if (ctx.dev_mask() == gpu::kDevMask) {
@@ -340,10 +340,9 @@ inline void PushFCompute(const FCompute& fn,
   bool is_train = Imperative::Get()->is_training();
   std::vector<NDArray> inputs, outputs;
   DerefInputOutput(p_inputs, p_outputs, &inputs, &outputs);
-  Engine::Get()->PushAsync(
+  Engine::Get()->PushSync(
     [ctx, attrs, fn, inputs, outputs, requested, is_train, mutate_idx, req](
-        RunContext rctx,
-        engine::CallbackOnComplete on_complete) {
+        RunContext rctx) {
       std::vector<TBlob> input_blobs, output_blobs;
       // pre-fcompute and post-fcompute storage fallback src NDArrays and dst NDArrays
       std::vector<NDArray> pre_temp_src, pre_temp_dst, post_temp_dst, post_temp_src;
@@ -364,7 +363,6 @@ inline void PushFCompute(const FCompute& fn,
       if (is_gpu) {
         rctx.get_stream<gpu>()->Wait();
       }
-      on_complete();
     }, ctx, read_vars, write_vars, FnProperty::kNormal,
     0, PROFILER_MESSAGE(op->name.c_str()));
 }
@@ -389,21 +387,19 @@ inline void PushFComputeEx(const FComputeEx& fn,
   std::vector<NDArray> inputs, outputs;
   DerefInputOutput(p_inputs, p_outputs, &inputs, &outputs);
   const auto& run = [ctx, exec_type, is_train, attrs, fn, inputs, outputs, requested, req](
-        RunContext rctx,
-        engine::CallbackOnComplete on_complete) {
-      OpContext opctx{is_train, rctx, on_complete, requested};
+        RunContext rctx) {
+      OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested};
       fn(attrs, opctx, inputs, req, outputs);
       if (exec_type == ExecType::kSync) {
         if (rctx.get_ctx().dev_mask() == gpu::kDevMask) {
           rctx.get_stream<gpu>()->Wait();
         }
-        on_complete();
       }
     };
   if (exec_type == ExecType::kLocal) {
-    run(RunContext{ctx, nullptr}, engine::CallbackOnComplete());
+    run(RunContext{ctx, nullptr});
   } else {
-    Engine::Get()->PushAsync(run, ctx, read_vars, write_vars, FnProperty::kNormal,
+    Engine::Get()->PushSync(run, ctx, read_vars, write_vars, FnProperty::kNormal,
       0, PROFILER_MESSAGE(op->name.c_str()));
   }
 }
@@ -436,21 +432,19 @@ inline void PushOperator(const OpStatePtr& state,
   if (fcompute_ex != nullptr && dispatch_mode == DispatchMode::kFComputeEx) {
     const auto& run = [state, fcompute_ex, inputs, outputs, requested, is_train,
                        exec_type, req](
-          RunContext rctx,
-          engine::CallbackOnComplete on_complete) {
-        OpContext opctx{is_train, rctx, on_complete, requested};
+          RunContext rctx) {
+        OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested};
         fcompute_ex(state, opctx, inputs, req, outputs);
         if (exec_type == ExecType::kSync) {
           if (rctx.get_ctx().dev_mask() == gpu::kDevMask) {
             rctx.get_stream<gpu>()->Wait();
           }
-          on_complete();
         }
       };
     if (exec_type == ExecType::kLocal) {
-      run(RunContext{ctx, nullptr}, engine::CallbackOnComplete());
+      run(RunContext{ctx, nullptr});
     } else {
-      Engine::Get()->PushAsync(run, ctx, read_vars, write_vars, FnProperty::kNormal,
+      Engine::Get()->PushSync(run, ctx, read_vars, write_vars, FnProperty::kNormal,
                                0, PROFILER_MESSAGE(op->name.c_str()));
     }
   } else {
@@ -458,11 +452,10 @@ inline void PushOperator(const OpStatePtr& state,
         << "One of FStatefulCompute and FStatefulComputeEx must be registered "
         << "for stateful operator " << op->name;
     CHECK(exec_type == ExecType::kSync || exec_type == ExecType::kAsync);
-    Engine::Get()->PushAsync(
+    Engine::Get()->PushSync(
       [state, fcompute, inputs, outputs, requested, is_train, exec_type, mutate_idx, req](
-          RunContext rctx,
-          engine::CallbackOnComplete on_complete) {
-        OpContext opctx{is_train, rctx, on_complete, requested};
+          RunContext rctx) {
+        OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested};
 
         std::vector<TBlob> input_blobs, output_blobs;
         // pre-fcompute and post-fcompute storage fallback src NDArrays and dst NDArrays
@@ -484,7 +477,6 @@ inline void PushOperator(const OpStatePtr& state,
           if (is_gpu) {
             rctx.get_stream<gpu>()->Wait();
           }
-          on_complete();
         }
       }, ctx, read_vars, write_vars, FnProperty::kNormal,
       0, PROFILER_MESSAGE(op->name.c_str()));
@@ -667,9 +659,12 @@ inline std::vector<Context> PlaceDevice(const nnvm::IndexedGraph& idx) {
       vctx[j.node_id] = vctx[i];
     }
   }
+  // check that all contexts are initialized
   for (size_t i = 0; i < idx.num_nodes(); ++i) {
     CHECK_NE(vctx[i].dev_type, -1)
         << "Cannot decide context for node " << idx[i].source->attrs.name;
+    // Non-default contexts do not propagate.
+    vctx[i].dev_type = vctx[i].dev_mask();
   }
 
   return vctx;
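
Both hunks above normalize a context's device type to its device mask, so that
specialized variants (pinned, shared memory) collapse to the plain device
before propagating to neighboring nodes. The mask-vs-type distinction in
miniature (enum values illustrative):

    #include <cassert>

    // Specialized CPU types share the plain CPU mask.
    enum DevType { kCPU = 1, kGPU = 2, kCPUPinned = 3, kCPUShared = 5 };

    int dev_mask(int dev_type) {
      return (dev_type == kCPUPinned || dev_type == kCPUShared) ? kCPU
                                                                : dev_type;
    }

    int main() {
      int ctx_type = kCPUPinned;
      ctx_type = dev_mask(ctx_type);  // non-default contexts do not propagate
      assert(ctx_type == kCPU);
    }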
diff --git a/src/initialize.cc b/src/initialize.cc
index a3cc1164fa..56d6fe1fff 100644
--- a/src/initialize.cc
+++ b/src/initialize.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file initialize.cc
  * \brief initialize mxnet library
  */
diff --git a/src/io/image_aug_default.cc b/src/io/image_aug_default.cc
index 6db14bd583..22af7d9275 100644
--- a/src/io/image_aug_default.cc
+++ b/src/io/image_aug_default.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file image_aug_default.cc
  * \brief Default augmenter.
  */
diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h
index 5b6c4e99e5..e8a56ba2e5 100644
--- a/src/io/image_augmenter.h
+++ b/src/io/image_augmenter.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file image_augmenter.h
  * \brief Interface of opencv based image augmenter
  */
diff --git a/src/io/image_det_aug_default.cc b/src/io/image_det_aug_default.cc
index 1a11c05838..79e1931836 100644
--- a/src/io/image_det_aug_default.cc
+++ b/src/io/image_det_aug_default.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file image_det_aug_default.cc
  * \brief Default augmenter.
  */
@@ -315,9 +316,9 @@ class ImageDetLabel {
     }
     // check if crop_box is valid
     bool valid = false;
-    if (min_crop_overlap > 0.f && max_crop_overlap < 1.f &&
-        min_crop_sample_coverage > 0.f && max_crop_sample_coverage < 1.f &&
-        min_crop_object_coverage > 0.f && max_crop_object_coverage < 1.f) {
+    if (min_crop_overlap > 0.f || max_crop_overlap < 1.f ||
+        min_crop_sample_coverage > 0.f || max_crop_sample_coverage < 1.f ||
+        min_crop_object_coverage > 0.f || max_crop_object_coverage < 1.f) {
       for (auto& obj : objects_) {
         Rect gt_box = obj.ToRect();
         if (min_crop_overlap > 0.f || max_crop_overlap < 1.f) {
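
The && to || change above corrects the activation test: the per-object overlap
scan should run when any crop constraint narrows its default [0, 1] range, not
only when every constraint is strict at once. A standalone illustration:

    #include <cassert>

    // A constraint is active when it narrows the default [0, 1] range.
    bool AnyConstraintActive(float min_ov, float max_ov) {
      return min_ov > 0.f || max_ov < 1.f;  // fixed: || rather than &&
    }

    int main() {
      assert(AnyConstraintActive(0.5f, 1.f));   // && would have skipped this
      assert(!AnyConstraintActive(0.f, 1.f));   // nothing active => skip scan
    }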
diff --git a/src/io/image_io.cc b/src/io/image_io.cc
index 9081a3734b..d95e750e79 100644
--- a/src/io/image_io.cc
+++ b/src/io/image_io.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file image_io.cc
  * \brief Image IO operators
  * \author Junyuan Xie
@@ -178,7 +179,7 @@ void Imdecode(const nnvm::NodeAttrs& attrs,
 #if MXNET_USE_OPENCV
   const auto& param = nnvm::get<ImdecodeParam>(attrs.parsed);
 
-  CHECK_EQ(inputs[0].ctx().dev_mask(), cpu::kDevMask) << "Only supports cpu input";
+  CHECK_EQ(inputs[0].ctx().dev_mask(), Context::kCPU) << "Only supports cpu input";
   CHECK_EQ(inputs[0].dtype(), mshadow::kUint8) << "Input needs to be uint8 buffer";
   inputs[0].WaitToRead();
 
diff --git a/src/io/image_iter_common.h b/src/io/image_iter_common.h
index f72903ccce..c61e3d12a8 100644
--- a/src/io/image_iter_common.h
+++ b/src/io/image_iter_common.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2017 by Contributors
  * \file image_iter_common.h
  * \brief common types used by image data iterators
  */
diff --git a/src/io/image_recordio.h b/src/io/image_recordio.h
index a931539aa2..24951803f3 100644
--- a/src/io/image_recordio.h
+++ b/src/io/image_recordio.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file image_recordio.h
  * \brief image recordio struct
  */
diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h
index 01d4b764dd..2682b94b4f 100644
--- a/src/io/inst_vector.h
+++ b/src/io/inst_vector.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file inst_vector.h
  * \brief holder of a sequence of DataInst in CPU
  *        that are not necessarily of the same shape
diff --git a/src/io/io.cc b/src/io/io.cc
index e7c92843b4..b92f02e160 100644
--- a/src/io/io.cc
+++ b/src/io/io.cc
@@ -17,6 +17,7 @@
  * under the License.
  */
 
+// Copyright (c) 2015 by Contributors
 
 #include <mxnet/io.h>
 #include <dmlc/registry.h>
diff --git a/src/io/iter_batchloader.h b/src/io/iter_batchloader.h
index ade7c1a53b..be911f695c 100644
--- a/src/io/iter_batchloader.h
+++ b/src/io/iter_batchloader.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file iter_batchloader.h
  * \brief define a batch adapter to create tblob batch
  */
diff --git a/src/io/iter_csv.cc b/src/io/iter_csv.cc
index de261f101a..a9e650b638 100644
--- a/src/io/iter_csv.cc
+++ b/src/io/iter_csv.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file iter_csv.cc
  * \brief define a CSV Reader to read in arrays
  */
diff --git a/src/io/iter_image_det_recordio.cc b/src/io/iter_image_det_recordio.cc
index 4e80d5d531..b933700266 100644
--- a/src/io/iter_image_det_recordio.cc
+++ b/src/io/iter_image_det_recordio.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file iter_image_recordio-inl.hpp
  * \brief recordio data iterator
  */
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
index 64f31a65fa..3af9e46b50 100644
--- a/src/io/iter_image_recordio.cc
+++ b/src/io/iter_image_recordio.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file iter_image_recordio-inl.hpp
  * \brief recordio data iterator
  */
diff --git a/src/io/iter_image_recordio_2.cc b/src/io/iter_image_recordio_2.cc
index ad53b80d02..fd8b6d7be8 100644
--- a/src/io/iter_image_recordio_2.cc
+++ b/src/io/iter_image_recordio_2.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2017 by Contributors
  * \file iter_image_recordio_2.cc
  * \brief new version of recordio data iterator
  */
diff --git a/src/io/iter_mnist.cc b/src/io/iter_mnist.cc
index 9dbedbbba4..1882a560d5 100644
--- a/src/io/iter_mnist.cc
+++ b/src/io/iter_mnist.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file iter_mnist.cc
  * \brief register mnist iterator
 */
diff --git a/src/io/iter_normalize.h b/src/io/iter_normalize.h
index bc94ad071c..4bc7d53d2b 100644
--- a/src/io/iter_normalize.h
+++ b/src/io/iter_normalize.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file iter_normalize.h
  * \brief Iterator that subtracts mean and do a few augmentations.
  */
diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h
index e633a11cf0..fdd1d2b919 100644
--- a/src/io/iter_prefetcher.h
+++ b/src/io/iter_prefetcher.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file iter_prefetcher.h
  * \brief define a prefetcher using threaditer to keep k batches fetched
  */
diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h
index deed1a15c9..fcf1e6b17f 100644
--- a/src/kvstore/comm.h
+++ b/src/kvstore/comm.h
@@ -18,6 +18,7 @@
  */
 
 /**
+ * Copyright (c) 2015 by Contributors
  */
 #ifndef MXNET_KVSTORE_COMM_H_
 #define MXNET_KVSTORE_COMM_H_
@@ -140,8 +141,10 @@ class CommCPU : public Comm {
         const_vars[i-1] = reduce[i].var();
       }
 
-      Engine::Get()->PushSync([reduce, this](RunContext rctx) {
+      Engine::Get()->PushAsync(
+        [reduce, this](RunContext rctx, Engine::CallbackOnComplete on_complete) {
           ReduceSumCPU(reduce);
+          on_complete();
         }, Context::CPU(), const_vars, {reduce[0].var()},
         FnProperty::kCPUPrioritized, priority, PROFILER_MESSAGE("KVStoreReduce"));
 
@@ -163,13 +166,15 @@ class CommCPU : public Comm {
         const_vars[i] = reduce[i].var();
       }
       auto result = buf.merged;
-      Engine::Get()->PushSync([reduce, result, this](RunContext rctx) {
+      Engine::Get()->PushAsync(
+        [reduce, result, this](RunContext rctx, Engine::CallbackOnComplete on_complete) {
           NDArray out = result;
           Resource rsc = ResourceManager::Get()->Request(rctx.ctx,
               ResourceRequest(ResourceRequest::kTempSpace));
           is_serial_push_?
             ReduceSumCPUExSerial(reduce, &out)
             : mxnet::ndarray::ElementwiseSum(rctx.get_stream<cpu>(), rsc, reduce, &out);
+          on_complete();
         }, Context::CPU(), const_vars, {result.var()},
         FnProperty::kCPUPrioritized, priority, PROFILER_MESSAGE("KVStoreReduce"));
     }
@@ -217,21 +222,25 @@ class CommCPU : public Comm {
           const bool is_to_gpu = out->ctx().dev_mask() == Context::kGPU;
           NDArray out_cpu = is_to_gpu? NDArray(kRowSparseStorage, src.shape(),
               src.ctx(), true, src.dtype(), src.aux_types()) : *out;
-          Engine::Get()->PushSync([=](RunContext rctx) {
+          Engine::Get()->PushAsync(
+            [=](RunContext rctx, Engine::CallbackOnComplete on_complete) {
               const TBlob& indices = row_id.data();
               NDArray temp = out_cpu;  // get rid of const qualifier
               op::SparseRetainOpForwardRspImpl<cpu>(rctx.get_stream<cpu>(),
                                                     src, indices, kWriteTo,
                                                     &temp);
+              on_complete();
             }, Context::CPU(), {src.var(), row_id.var()}, {out_cpu.var()},
             FnProperty::kNormal, priority, PROFILER_MESSAGE("KVStoreSparseRetain"));
           if (is_to_gpu) {
             CopyFromTo(out_cpu, out, priority);
           }
         } else {  // direct copy rows
-          Engine::Get()->PushSync([=](RunContext rctx) {
+          Engine::Get()->PushAsync(
+            [=](RunContext rctx, Engine::CallbackOnComplete on_complete) {
               CopyRetainedRowsToGPU(rctx.get_stream<cpu>(), rctx.get_stream<gpu>(),
                                     src, row_id, out);
+              on_complete();
             }, out->ctx(), {src.var(), row_id.var()}, {out->var()},
             FnProperty::kCopyToGPU, priority, PROFILER_MESSAGE("KVStoreCopyRetainedRowsToGPU"));
         }
diff --git a/src/kvstore/kvstore.cc b/src/kvstore/kvstore.cc
index a288676102..ac37d5d32c 100644
--- a/src/kvstore/kvstore.cc
+++ b/src/kvstore/kvstore.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file kvstore.cc
  * \brief implement kv_store
  */
diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h
index 5e62be8c4c..571767db7a 100644
--- a/src/kvstore/kvstore_dist.h
+++ b/src/kvstore/kvstore_dist.h
@@ -18,6 +18,7 @@
  */
 
 /**
+ * Copyright (c) 2015 by Contributors
  * @file   kvstore_dist.h
  * @brief  distributed implementation based on ps-lite
  */
diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h
index bedb5398a0..f2123e765f 100644
--- a/src/kvstore/kvstore_dist_server.h
+++ b/src/kvstore/kvstore_dist_server.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file mxnet_node.h
  * \brief implement mxnet nodes
  */
@@ -230,13 +231,15 @@ class KVStoreDistServer {
         TBlob recv_blob(data, dshape, cpu::kDevMask);  // NOLINT(*)
         NDArray recved = NDArray(recv_blob, 0);
         stored = NDArray(kRowSparseStorage, dshape, Context());
-        Engine::Get()->PushSync([recved, stored](RunContext ctx) {
+        Engine::Get()->PushAsync(
+          [recved, stored](RunContext ctx, Engine::CallbackOnComplete on_complete) {
             NDArray rsp = stored;
             stored.CheckAndAlloc({mshadow::Shape1(recved.shape()[0])});
             mshadow::Stream<cpu> *s = ctx.get_stream<cpu>();
             op::PopulateFullIdxRspImpl(s, &rsp);
             mshadow::Copy(rsp.data().FlatTo1D<cpu, float>(),
                           recved.data().FlatTo1D<cpu, float>(), s);
+            on_complete();
           }, recved.ctx(), {recved.var()}, {stored.var()},
           FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
         stored.WaitToRead();
@@ -285,15 +288,13 @@ class KVStoreDistServer {
           // TODO(haibin) override + operator for row_sparse NDArray
           // instead of calling BinaryComputeRspRsp directly
           using namespace mshadow;
-          Engine::Get()->PushSync([recved, merged, out](RunContext ctx) {
-                                    std::vector<NDArray> inputs, outputs;
-                                    inputs.push_back(recved);
-                                    inputs.push_back(merged.array);
-                                    outputs.push_back(out);
-                                    op::ElemwiseBinaryOp::ComputeEx<cpu, mshadow::op::plus>(
-                                      {}, {}, inputs, {kWriteTo}, outputs);
-                                  }, recved.ctx(), const_vars, {out.var()},
-                                  FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+          Engine::Get()->PushAsync(
+            [recved, merged, out](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+              op::ElemwiseBinaryOp::ComputeEx<cpu, mshadow::op::plus>(
+                {}, {}, {recved, merged.array}, {kWriteTo}, {out});
+              on_complete();
+            }, recved.ctx(), const_vars, {out.var()},
+            FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
           CopyFromTo(out, &merged.array, 0);
         }
         merged.request.push_back(req_meta);
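
The simplification above drops the hand-built input/output vectors and passes
braced initializer lists straight to ComputeEx; any parameter of type
const std::vector<T>& accepts them. A minimal illustration:

    #include <vector>

    struct Arr { int id; };  // toy stand-in for NDArray

    int SumIds(const std::vector<Arr>& xs) {
      int s = 0;
      for (const auto& x : xs) s += x.id;
      return s;
    }

    int main() {
      Arr a{1}, b{2};
      return SumIds({a, b}) == 3 ? 0 : 1;  // braced list builds the temporary
    }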
diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h
index 15a4c6055b..1a4ced8a4f 100644
--- a/src/kvstore/kvstore_local.h
+++ b/src/kvstore/kvstore_local.h
@@ -18,6 +18,7 @@
  */
 
 /**
+ * Copyright (c) 2015 by Contributors
  * @file   kvstore_local.h
  * @brief  local implementation
  */
@@ -347,7 +348,8 @@ class KVStoreLocal : public KVStore {
   void Unique(NDArray *out, int priority = 0) {
     CHECK_EQ(out->ctx().dev_mask(), pinned_ctx_.dev_mask())
              << "Unique expects input with `pinned_ctx_`";
-    Engine::Get()->PushSync([out](RunContext rctx) {
+    Engine::Get()->PushAsync(
+      [out](RunContext rctx, Engine::CallbackOnComplete on_complete) {
         NDArray *output = out;
         CHECK_EQ(out->shape().ndim(), 1) << "Unique expects 1D inputs";
         const auto size = out->shape()[0];
@@ -358,6 +360,7 @@ class KVStoreLocal : public KVStore {
           auto num_unique_idx = std::unique(dptr, dptr + size) - dptr;
           *output = output->Reshape(mshadow::Shape1(num_unique_idx));
         });
+        on_complete();
       }, pinned_ctx_, {}, {out->var()},
       FnProperty::kCPUPrioritized, priority, PROFILER_MESSAGE("KVStoreUnique"));
     out->WaitToRead();
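
The Unique kernel above is the classic sort-then-compact idiom: std::unique
removes only adjacent duplicates, hence the preceding sort, and the returned
iterator fixes the new logical length (here applied via Reshape). Standalone:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int> row_ids = {3, 1, 3, 2, 1};
      std::sort(row_ids.begin(), row_ids.end());
      auto n = std::unique(row_ids.begin(), row_ids.end()) - row_ids.begin();
      row_ids.resize(n);  // analogous to Reshape(Shape1(num_unique_idx))
      assert((row_ids == std::vector<int>{1, 2, 3}));
    }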
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 275cf40380..c5b939df41 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file ndarray.cc
  * \brief ndarray module of mxnet
  */
@@ -527,25 +528,33 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority) {
   if (from.var() != to.var()) const_vars.push_back(from.var());
 
   if (a == cpu::kDevMask && b == cpu::kDevMask) {
-    Engine::Get()->PushSync([from, to](RunContext ctx) {
+    Engine::Get()->PushAsync(
+      [from, to](RunContext ctx, Engine::CallbackOnComplete on_complete) {
         CopyFromToImpl<cpu, cpu>(from, to, ctx);
+        on_complete();
       }, from.ctx(), const_vars, {to.var()},
       FnProperty::kNormal, priority, PROFILER_MESSAGE("CopyCPU2CPU"));
   } else {
 #if MXNET_USE_CUDA
     if (a == cpu::kDevMask && b == gpu::kDevMask) {
-      Engine::Get()->PushSync([from, to](RunContext ctx) {
+      Engine::Get()->PushAsync(
+        [from, to](RunContext ctx, Engine::CallbackOnComplete on_complete) {
           CopyFromToImpl<cpu, gpu>(from, to, ctx);
+          on_complete();
         }, to.ctx(), const_vars, {to.var()},
         FnProperty::kCopyToGPU, priority, PROFILER_MESSAGE("CopyCPU2GPU"));
     } else if (a == gpu::kDevMask && b == cpu::kDevMask) {
-      Engine::Get()->PushSync([from, to](RunContext ctx) {
+      Engine::Get()->PushAsync(
+        [from, to](RunContext ctx, Engine::CallbackOnComplete on_complete) {
           CopyFromToImpl<gpu, cpu>(from, to, ctx);
+          on_complete();
         }, from.ctx(), const_vars, {to.var()},
         FnProperty::kCopyFromGPU, priority, PROFILER_MESSAGE("CopyGPU2CPU"));
     } else if (a == gpu::kDevMask && b == gpu::kDevMask) {
-      Engine::Get()->PushSync([from, to](RunContext ctx) {
+      Engine::Get()->PushAsync(
+        [from, to](RunContext ctx, Engine::CallbackOnComplete on_complete) {
           CopyFromToImpl<gpu, gpu>(from, to, ctx);
+          on_complete();
         }, from.ctx(), const_vars, {to.var()},
         from.dtype() != to.dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU,
         priority, PROFILER_MESSAGE("CopyGPU2GPU"));
@@ -572,8 +581,8 @@ void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priori
     }
     CHECK_EQ(source[i].shape() , out->shape())
         << "operands shape mismatch";
-    if (out->ctx().dev_mask() == cpu::kDevMask) {
-      CHECK_EQ(source[i].ctx().dev_mask(),  cpu::kDevMask)
+    if (out->ctx().dev_mask() == Context::kCPU) {
+      CHECK_EQ(source[i].ctx().dev_mask(), Context::kCPU)
           << "operands context mismatch";
     } else {
       CHECK(source[i].ctx() == out->ctx())
@@ -1077,12 +1086,14 @@ void NDArray::SyncCopyFromCPU(const void *data, size_t size) const {
     ndarray::Copy<cpu, cpu>(src, &dst, Context::CPU(), Context::CPU(), rctx);
   } else {
 #if MXNET_USE_CUDA
-    Engine::Get()->PushSync([&](RunContext rctx) {
+    Engine::Get()->PushAsync(
+      [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
         TBlob dst = this->data();
         ndarray::Copy<cpu, gpu>(src, &dst,
                                 Context::CPU(), this->ctx(), rctx);
         // Wait GPU kernel to complete
         rctx.get_stream<gpu>()->Wait();
+        on_complete();
       }, this->ctx(), {}, {this->var()},
       FnProperty::kCopyToGPU, 0, PROFILER_MESSAGE("SyncCopyCPU2GPU"));
     this->WaitToRead();
@@ -1145,27 +1156,33 @@ void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) {
   } else {
 #if MXNET_USE_CUDA
     if (src_dev_mask == cpu::kDevMask && dst_dev_mask == gpu::kDevMask) {
-      Engine::Get()->PushSync([&](RunContext rctx) {
+      Engine::Get()->PushAsync(
+        [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
           const TBlob src_data = (i >= 0? src.aux_data(i) : src.data());
           TBlob dst_data = get_dst_data(src_data.shape_);
           ndarray::Copy<cpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
           rctx.get_stream<gpu>()->Wait();
+          on_complete();
         }, this->ctx(), const_vars, {this->var()},
         FnProperty::kCopyToGPU, 0, PROFILER_MESSAGE("SyncCopyFromNDArrayCPU2GPU"));
     } else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == cpu::kDevMask) {
-      Engine::Get()->PushSync([&](RunContext rctx) {
+      Engine::Get()->PushAsync(
+        [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
           const TBlob src_data = (i >= 0? src.aux_data(i) : src.data());
           TBlob dst_data = get_dst_data(src_data.shape_);
           ndarray::Copy<gpu, cpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
           rctx.get_stream<gpu>()->Wait();
+          on_complete();
         }, this->ctx(), const_vars, {this->var()},
         FnProperty::kCopyFromGPU, 0, PROFILER_MESSAGE("SyncCopyFromNDArrayGPU2CPU"));
     } else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == gpu::kDevMask) {
-      Engine::Get()->PushSync([&](RunContext rctx) {
+      Engine::Get()->PushAsync(
+        [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
           const TBlob src_data = (i >= 0? src.aux_data(i) : src.data());
           TBlob dst_data = get_dst_data(src_data.shape_);
           ndarray::Copy<gpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
           rctx.get_stream<gpu>()->Wait();
+          on_complete();
         }, this->ctx(), const_vars, {this->var()},
         src.dtype() != this->dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU,
         0, PROFILER_MESSAGE("SyncCopyFromNDArrayGPU2GPU"));
@@ -1200,11 +1217,13 @@ void NDArray::SyncCopyToCPU(void *data, size_t size) const {
                             Context::CPU(), Context::CPU(), rctx);
   } else {
 #if MXNET_USE_CUDA
-    Engine::Get()->PushSync([&](RunContext rctx) {
+    Engine::Get()->PushAsync(
+      [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
         ndarray::Copy<gpu, cpu>(this->data(), &dst,
                                 this->ctx(), Context::CPU(), rctx);
         // Wait GPU kernel to complete
         rctx.get_stream<gpu>()->Wait();
+        on_complete();
       }, this->ctx(), {this->var()}, {},
       FnProperty::kCopyFromGPU, 0, PROFILER_MESSAGE("SyncCopyGPU2CPU"));
     this->WaitToWrite();
@@ -1214,6 +1233,40 @@ void NDArray::SyncCopyToCPU(void *data, size_t size) const {
   }
 }
 
+void NDArray::SyncCheckFormat(const bool full_check) const {
+  int32_t err = kNormalErr;
+  TBlob err_cpu(&err, mshadow::Shape1(1), cpu::kDevMask, 0);
+  if (this->ctx().dev_mask() == cpu::kDevMask) {
+    Engine::Get()->PushSync([&](RunContext rctx) {
+        common::CheckFormatWrapper<cpu>(rctx, *this, err_cpu, full_check);
+      }, this->ctx(), {this->var()}, {},
+      FnProperty::kNormal, 0, PROFILER_MESSAGE("CheckFormat"));
+  } else {
+#if MXNET_USE_CUDA
+    Engine::Get()->PushSync([&](RunContext rctx) {
+        common::CheckFormatWrapper<gpu>(rctx, *this, err_cpu, full_check);
+        rctx.get_stream<gpu>()->Wait();
+      }, this->ctx(), {this->var()}, {},
+      FnProperty::kNormal, 0, PROFILER_MESSAGE("CheckFormat"));
+#else
+    LOG(FATAL) << "GPU is not enabled";
+#endif
+  }
+  this->WaitToWrite();
+  CHECK_NE(err, kCSRShapeErr) << "Shape mismatch of this csr NDArray";
+  CHECK_NE(err, kCSRIndPtrErr)
+           << "IndPtr of csr NDArray should be non-negative, in non-decreasing order, "
+           << "start with 0, and end with value equal with size of indices.";
+  CHECK_NE(err, kCSRIdxErr)
+           << "Indices of csr NDArray should be non-negative, in ascending order per row "
+           << " and less than the number of columns.";
+  CHECK_NE(err, kRSPShapeErr) << "Shape mismatch of this row_sparse NDArray";
+  CHECK_NE(err, kRSPIdxErr)
+          << "Indices of row_sparse NDArray should be non-negative, "
+          << "less than the size of first dimension and in ascending order";
+  CHECK_EQ(err, kNormalErr) << "Check the validity of this sparse NDArray";
+}
+
 #if MXNET_PREDICT_ONLY == 0
 // register API function
 // those with underscore will be registered at NDArray
@@ -1309,7 +1362,7 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index,
     CHECK_EQ(ret->shape().ndim(), 4U);
     buff = ret->Slice(index, index+1);
   }
-  CHECK_EQ(buff.ctx().dev_mask(), cpu::kDevMask);
+  CHECK_EQ(buff.ctx().dev_mask(), Context::kCPU);
   CHECK_EQ(n_channels, buff.shape()[1]);
   CHECK_EQ(y1-y0, buff.shape()[2]);
   CHECK_EQ(x1-x0, buff.shape()[3]);
@@ -1329,7 +1382,7 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index,
     })
   } else {
     CHECK_EQ(mean.dtype(), buff.dtype());
-    CHECK_EQ(mean.ctx().dev_mask(), cpu::kDevMask);
+    CHECK_EQ(mean.ctx().dev_mask(), Context::kCPU);
     CHECK_EQ(mean.shape()[0], buff.shape()[1]);
     CHECK_EQ(mean.shape()[1], buff.shape()[2]);
     CHECK_EQ(mean.shape()[2], buff.shape()[3]);
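
SyncCheckFormat above funnels device-side validation through a single int
error slot: a kernel writes one of the k*Err codes, the value is copied back to
the host, and a chain of CHECK calls turns the code into a message. The
code-to-message dispatch, reduced to a sketch (codes illustrative):

    #include <stdexcept>
    #include <string>

    enum CheckErr { kNormalErr = 0, kCSRShapeErr, kCSRIndPtrErr, kCSRIdxErr };

    // Host-side translation of the code a check kernel produced.
    void RaiseIfInvalid(int err) {
      if (err == kNormalErr) return;
      std::string why =
          err == kCSRShapeErr  ? "CSR shape mismatch" :
          err == kCSRIndPtrErr ? "CSR indptr malformed" :
          err == kCSRIdxErr    ? "CSR indices out of range or order" :
                                 "unknown sparse format error";
      throw std::runtime_error("sparse format check failed: " + why);
    }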
diff --git a/src/ndarray/ndarray_function-inl.h b/src/ndarray/ndarray_function-inl.h
index b284e03786..821ef2c129 100644
--- a/src/ndarray/ndarray_function-inl.h
+++ b/src/ndarray/ndarray_function-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file ndarray_function-inl.h
  * \brief The real implementation of NDArray functions.
  */
diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h
index f7da246014..98ad3e9257 100644
--- a/src/ndarray/ndarray_function.h
+++ b/src/ndarray/ndarray_function.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file ndarray_op.h
  * \brief the real execution functions of ndarray operations
  */
diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h
index bb5a37fc87..a39fe9ab11 100644
--- a/src/operator/activation-inl.h
+++ b/src/operator/activation-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file activation-inl.h
  * \brief Activation operator
  * \author Bing Xu
diff --git a/src/operator/activation.cc b/src/operator/activation.cc
index a33c11ce54..bff1d5b962 100644
--- a/src/operator/activation.cc
+++ b/src/operator/activation.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file activation.cc
  * \brief activation op
  * \author Bing Xu
diff --git a/src/operator/activation.cu b/src/operator/activation.cu
index 0ac51ad031..71efa70b82 100644
--- a/src/operator/activation.cu
+++ b/src/operator/activation.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file activation.cu
  * \brief
  * \author Bing Xu
diff --git a/src/operator/batch_norm-inl.h b/src/operator/batch_norm-inl.h
index 461f702728..45c8caef50 100644
--- a/src/operator/batch_norm-inl.h
+++ b/src/operator/batch_norm-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file batch_norm-inl.h
  * \brief
  * \author Bing Xu, Chris Olivier
diff --git a/src/operator/batch_norm.cc b/src/operator/batch_norm.cc
index 866b7fe619..3fb1aa2709 100644
--- a/src/operator/batch_norm.cc
+++ b/src/operator/batch_norm.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file batch_norm.cc
  * \brief
  * \author Bing Xu, Chris Olivier
diff --git a/src/operator/batch_norm.cu b/src/operator/batch_norm.cu
index 9a8b576a16..f9f5e1ef70 100644
--- a/src/operator/batch_norm.cu
+++ b/src/operator/batch_norm.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file batch_norm.cu
  * \brief CUDA Batch Normalization code
  * \author Chris Olivier, Bing Xu
diff --git a/src/operator/batch_norm_v1-inl.h b/src/operator/batch_norm_v1-inl.h
index ebfc469ecd..329d66d06d 100644
--- a/src/operator/batch_norm_v1-inl.h
+++ b/src/operator/batch_norm_v1-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file batch_norm-inl_v1.h
  * \brief
  * \author Bing Xu
diff --git a/src/operator/batch_norm_v1.cc b/src/operator/batch_norm_v1.cc
index 1abced8763..96111374b0 100644
--- a/src/operator/batch_norm_v1.cc
+++ b/src/operator/batch_norm_v1.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file batch_norm_v1.cc
  * \brief
  * \author Bing Xu
diff --git a/src/operator/batch_norm_v1.cu b/src/operator/batch_norm_v1.cu
index 8ed22a4dc6..2adbdef3c7 100644
--- a/src/operator/batch_norm_v1.cu
+++ b/src/operator/batch_norm_v1.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file batch_norm_v1.cu
  * \brief
  * \author Bing Xu
diff --git a/src/operator/bilinear_sampler-inl.h b/src/operator/bilinear_sampler-inl.h
index 2d68d7855b..657aebafdb 100644
--- a/src/operator/bilinear_sampler-inl.h
+++ b/src/operator/bilinear_sampler-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file bilinear_sampler-inl.h
  * \brief
  * \author Xu Dong
diff --git a/src/operator/bilinear_sampler.cc b/src/operator/bilinear_sampler.cc
index d03f6798fd..3365d98bb4 100644
--- a/src/operator/bilinear_sampler.cc
+++ b/src/operator/bilinear_sampler.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file bilinear_sampler.cc
  * \brief
  * \author Xu Dong
diff --git a/src/operator/bilinear_sampler.cu b/src/operator/bilinear_sampler.cu
index 14b5cd20a3..0ab628da70 100644
--- a/src/operator/bilinear_sampler.cu
+++ b/src/operator/bilinear_sampler.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file bilinear_sampler.cu
  * \brief
  * \author Xu Dong
diff --git a/src/operator/channel_op_common.h b/src/operator/channel_op_common.h
index 113da9b358..00cd8ae084 100644
--- a/src/operator/channel_op_common.h
+++ b/src/operator/channel_op_common.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file channel_op_common.h
  * \brief common function used for concat and split channel
  * \author Bing Xu
diff --git a/src/operator/concat-inl.h b/src/operator/concat-inl.h
index 8c342c8a78..fdbe33072a 100644
--- a/src/operator/concat-inl.h
+++ b/src/operator/concat-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file concat-inl.h
  * \brief
  * \author Bing Xu
diff --git a/src/operator/concat.cc b/src/operator/concat.cc
index 03a8b8049f..4d3c2fa166 100644
--- a/src/operator/concat.cc
+++ b/src/operator/concat.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file concat.cc
  * \brief
  * \author Bing Xu
diff --git a/src/operator/concat.cu b/src/operator/concat.cu
index 9aea42e917..394fa736ee 100644
--- a/src/operator/concat.cu
+++ b/src/operator/concat.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file concat.cu
  * \brief
  * \author Bing Xu
diff --git a/src/operator/contrib/count_sketch-inl.h b/src/operator/contrib/count_sketch-inl.h
index b736117b76..76d1a7efb8 100644
--- a/src/operator/contrib/count_sketch-inl.h
+++ b/src/operator/contrib/count_sketch-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file count_sketch-inl.h
  * \brief count_sketch operator and symbol
  * \author Chen Zhu
diff --git a/src/operator/contrib/count_sketch.cc b/src/operator/contrib/count_sketch.cc
index 000425fc44..12814116bb 100644
--- a/src/operator/contrib/count_sketch.cc
+++ b/src/operator/contrib/count_sketch.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file count_sketch.cc
  * \brief count_sketch op
  * \author Chen Zhu
diff --git a/src/operator/contrib/count_sketch.cu b/src/operator/contrib/count_sketch.cu
index 0f3d295ae4..b849f4cf3e 100644
--- a/src/operator/contrib/count_sketch.cu
+++ b/src/operator/contrib/count_sketch.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file count_sketch.cu
  * \brief count_sketch op
  * \author Chen Zhu, Yang Shi
diff --git a/src/operator/contrib/ctc_loss-inl.h b/src/operator/contrib/ctc_loss-inl.h
index cb8d27aa6b..ad1d1ec91f 100644
--- a/src/operator/contrib/ctc_loss-inl.h
+++ b/src/operator/contrib/ctc_loss-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file ctc_loss-inl.h
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/contrib/ctc_loss.cc b/src/operator/contrib/ctc_loss.cc
index 910d83a5d7..32e8e629f0 100644
--- a/src/operator/contrib/ctc_loss.cc
+++ b/src/operator/contrib/ctc_loss.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file ctc_loss.cc
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/contrib/ctc_loss.cu b/src/operator/contrib/ctc_loss.cu
index 4a07a44fa7..3f5f12ca43 100644
--- a/src/operator/contrib/ctc_loss.cu
+++ b/src/operator/contrib/ctc_loss.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file ctc_loss.cu
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/contrib/dequantize-inl.h b/src/operator/contrib/dequantize-inl.h
index 05a732dc13..8f24a8fd7b 100644
--- a/src/operator/contrib/dequantize-inl.h
+++ b/src/operator/contrib/dequantize-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2017 by Contributors
  * \file dequantize-inl.h
  * \brief Implementation of dequantize operation
  */
diff --git a/src/operator/contrib/dequantize.cc b/src/operator/contrib/dequantize.cc
index 422a9557dc..7814a15771 100644
--- a/src/operator/contrib/dequantize.cc
+++ b/src/operator/contrib/dequantize.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2017 by Contributors
  * \file dequantize.cc
  * \brief
  */
diff --git a/src/operator/contrib/dequantize.cu b/src/operator/contrib/dequantize.cu
index 7081c27c97..ca5f91c5de 100644
--- a/src/operator/contrib/dequantize.cu
+++ b/src/operator/contrib/dequantize.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2017 by Contributors
  * \file dequantize.cu
  * \brief
  */
diff --git a/src/operator/contrib/fft-inl.h b/src/operator/contrib/fft-inl.h
index 129aaa3796..be7b64aeb0 100644
--- a/src/operator/contrib/fft-inl.h
+++ b/src/operator/contrib/fft-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file fft-inl.h
  * \brief
  * \author Chen Zhu
diff --git a/src/operator/contrib/fft.cc b/src/operator/contrib/fft.cc
index 0b31a926c6..8332451bf9 100644
--- a/src/operator/contrib/fft.cc
+++ b/src/operator/contrib/fft.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file fft.cc
  * \brief
  * \author Chen Zhu
diff --git a/src/operator/contrib/fft.cu b/src/operator/contrib/fft.cu
index dfe3fbba61..ce25faebf8 100644
--- a/src/operator/contrib/fft.cu
+++ b/src/operator/contrib/fft.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file fft.cu
  * \brief
  * \author Chen Zhu
diff --git a/src/operator/contrib/ifft-inl.h b/src/operator/contrib/ifft-inl.h
index 1b0e5e524e..e48d653d92 100644
--- a/src/operator/contrib/ifft-inl.h
+++ b/src/operator/contrib/ifft-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file ifft-inl.h
  * \brief
  * \author Chen Zhu
diff --git a/src/operator/contrib/ifft.cc b/src/operator/contrib/ifft.cc
index f57df69862..26e7041ce0 100644
--- a/src/operator/contrib/ifft.cc
+++ b/src/operator/contrib/ifft.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file ifft.cc
  * \brief
  * \author Chen Zhu
diff --git a/src/operator/contrib/ifft.cu b/src/operator/contrib/ifft.cu
index 35cdb4836b..738ad639c6 100644
--- a/src/operator/contrib/ifft.cu
+++ b/src/operator/contrib/ifft.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file ifft.cu
  * \brief
  * \author Chen Zhu
diff --git a/src/operator/contrib/krprod.h b/src/operator/contrib/krprod.h
index a54ece79e9..90a6179e07 100644
--- a/src/operator/contrib/krprod.h
+++ b/src/operator/contrib/krprod.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2017 by Contributors
  *  \file krprod.h
  *  \brief Core function for Khatri-Rao product
  *  \author Jencir Lee
diff --git a/src/operator/contrib/multi_proposal-inl.h b/src/operator/contrib/multi_proposal-inl.h
index ddfe0628f3..e2ba7c48df 100644
--- a/src/operator/contrib/multi_proposal-inl.h
+++ b/src/operator/contrib/multi_proposal-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * Copyright (c) 2017 Microsoft
  * Licensed under The Apache-2.0 License [see LICENSE for details]
  * \file multi_proposal-inl.h
diff --git a/src/operator/contrib/multi_proposal.cu b/src/operator/contrib/multi_proposal.cu
index 082de6a397..a2a17d7490 100644
--- a/src/operator/contrib/multi_proposal.cu
+++ b/src/operator/contrib/multi_proposal.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * Copyright (c) 2017 Microsoft
  * Licensed under The Apache-2.0 License [see LICENSE for details]
  * \file multi_proposal.cu
diff --git a/src/operator/contrib/multibox_detection-inl.h b/src/operator/contrib/multibox_detection-inl.h
index 34099a3d69..fcf22727ff 100644
--- a/src/operator/contrib/multibox_detection-inl.h
+++ b/src/operator/contrib/multibox_detection-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file multibox_detection-inl.h
  * \brief post-process multibox detection predictions
  * \author Joshua Zhang
diff --git a/src/operator/contrib/multibox_detection.cc b/src/operator/contrib/multibox_detection.cc
index 0f6982890f..a2e681a8e6 100644
--- a/src/operator/contrib/multibox_detection.cc
+++ b/src/operator/contrib/multibox_detection.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file multibox_detection.cc
  * \brief MultiBoxDetection op
  * \author Joshua Zhang
diff --git a/src/operator/contrib/multibox_detection.cu b/src/operator/contrib/multibox_detection.cu
index 56a1e88dfe..6db8c55597 100644
--- a/src/operator/contrib/multibox_detection.cu
+++ b/src/operator/contrib/multibox_detection.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file multibox_detection.cu
  * \brief MultiBoxDetection op
  * \author Joshua Zhang
diff --git a/src/operator/contrib/multibox_prior-inl.h b/src/operator/contrib/multibox_prior-inl.h
index 88ca3dc8de..6602b43ca0 100644
--- a/src/operator/contrib/multibox_prior-inl.h
+++ b/src/operator/contrib/multibox_prior-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file multibox_prior-inl.h
  * \brief generate multibox prior boxes
  * \author Joshua Zhang
diff --git a/src/operator/contrib/multibox_prior.cc b/src/operator/contrib/multibox_prior.cc
index 6d7166542b..22a9c10cd9 100644
--- a/src/operator/contrib/multibox_prior.cc
+++ b/src/operator/contrib/multibox_prior.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file multibox_prior.cc
  * \brief generate multibox prior boxes cpu implementation
  * \author Joshua Zhang
diff --git a/src/operator/contrib/multibox_prior.cu b/src/operator/contrib/multibox_prior.cu
index 5928718576..57901585b4 100644
--- a/src/operator/contrib/multibox_prior.cu
+++ b/src/operator/contrib/multibox_prior.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file multibox_prior.cu
  * \brief generate multibox prior boxes cuda kernels
  * \author Joshua Zhang
diff --git a/src/operator/contrib/multibox_target-inl.h b/src/operator/contrib/multibox_target-inl.h
index 872ddde1de..f5a3b1189d 100644
--- a/src/operator/contrib/multibox_target-inl.h
+++ b/src/operator/contrib/multibox_target-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file multibox_target-inl.h
  * \brief
  * \author Joshua Zhang
diff --git a/src/operator/contrib/multibox_target.cc b/src/operator/contrib/multibox_target.cc
index 095613d4a9..2fa041dd34 100644
--- a/src/operator/contrib/multibox_target.cc
+++ b/src/operator/contrib/multibox_target.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file multibox_target.cc
  * \brief MultiBoxTarget op
  * \author Joshua Zhang
diff --git a/src/operator/contrib/multibox_target.cu b/src/operator/contrib/multibox_target.cu
index 3d0da6ce6f..c70dce3c1d 100644
--- a/src/operator/contrib/multibox_target.cu
+++ b/src/operator/contrib/multibox_target.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file multibox_target.cu
  * \brief MultiBoxTarget op
  * \author Joshua Zhang
diff --git a/src/operator/contrib/proposal-inl.h b/src/operator/contrib/proposal-inl.h
index f989cdec37..a1f9e49e6c 100644
--- a/src/operator/contrib/proposal-inl.h
+++ b/src/operator/contrib/proposal-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file proposal-inl.h
  * \brief Proposal Operator
  * \author Piotr Teterwak, Bing Xu, Jian Guo
diff --git a/src/operator/contrib/proposal.cc b/src/operator/contrib/proposal.cc
index ccb541a403..dd6ed5a93d 100644
--- a/src/operator/contrib/proposal.cc
+++ b/src/operator/contrib/proposal.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file proposal.cc
  * \brief
  * \author Piotr Teterwak, Bing Xu, Jian Guo
diff --git a/src/operator/contrib/proposal.cu b/src/operator/contrib/proposal.cu
index 9f56685a7a..2d676aca93 100644
--- a/src/operator/contrib/proposal.cu
+++ b/src/operator/contrib/proposal.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file proposal.cu
  * \brief Proposal Operator
  * \author Shaoqing Ren, Jian Guo
diff --git a/src/operator/contrib/psroi_pooling-inl.h b/src/operator/contrib/psroi_pooling-inl.h
index b492972527..ff05304532 100644
--- a/src/operator/contrib/psroi_pooling-inl.h
+++ b/src/operator/contrib/psroi_pooling-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * Copyright (c) 2017 Microsoft
  * Licensed under The Apache-2.0 License [see LICENSE for details]
  * \file psroi_pooling-inl.h
diff --git a/src/operator/contrib/psroi_pooling.cc b/src/operator/contrib/psroi_pooling.cc
index dd3a9e0889..75b533446b 100644
--- a/src/operator/contrib/psroi_pooling.cc
+++ b/src/operator/contrib/psroi_pooling.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * Copyright (c) 2017 Microsoft
  * Licensed under The Apache-2.0 License [see LICENSE for details]
  * \file psroi_pooling.cc
diff --git a/src/operator/contrib/psroi_pooling.cu b/src/operator/contrib/psroi_pooling.cu
index 6df64a1948..4721316374 100644
--- a/src/operator/contrib/psroi_pooling.cu
+++ b/src/operator/contrib/psroi_pooling.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * Copyright (c) 2017 Microsoft
  * Licensed under The Apache-2.0 License [see LICENSE for details]
  * \file psroi_pooling.cu
diff --git a/src/operator/contrib/quantize-inl.h b/src/operator/contrib/quantize-inl.h
index 0418d0e79b..4d55b1b5c6 100644
--- a/src/operator/contrib/quantize-inl.h
+++ b/src/operator/contrib/quantize-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2017 by Contributors
  * \file quantize-inl.h
  * \brief implementation of quantize operation
  */
diff --git a/src/operator/contrib/quantize.cc b/src/operator/contrib/quantize.cc
index dbb8985c72..43d60d1dd8 100644
--- a/src/operator/contrib/quantize.cc
+++ b/src/operator/contrib/quantize.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2017 by Contributors
  * \file quantize.cc
  * \brief
  */
diff --git a/src/operator/contrib/quantize.cu b/src/operator/contrib/quantize.cu
index 6c9db9aeec..d50eb25fe6 100644
--- a/src/operator/contrib/quantize.cu
+++ b/src/operator/contrib/quantize.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2017 by Contributors
  * \file quantize.cu
  * \brief
  */
diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h
index 5843293a36..c0cc246bee 100644
--- a/src/operator/convolution-inl.h
+++ b/src/operator/convolution-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file convolution-inl.h
  * \brief
  * \ref: https://github.com/Yangqing/caffe/wiki/Convolution-in-Caffe:-a-memo
diff --git a/src/operator/convolution.cc b/src/operator/convolution.cc
index 55cfe4e085..bc65cc2974 100644
--- a/src/operator/convolution.cc
+++ b/src/operator/convolution.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file convolution.cc
  * \brief
  * \author Bing Xu, Jun Wu
diff --git a/src/operator/convolution.cu b/src/operator/convolution.cu
index b327f3cff4..2962559119 100644
--- a/src/operator/convolution.cu
+++ b/src/operator/convolution.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file convolution.cu
  * \brief
  * \author Bing Xu, Jun Wu
diff --git a/src/operator/convolution_v1-inl.h b/src/operator/convolution_v1-inl.h
index 6b31b15c32..bc6326aad7 100644
--- a/src/operator/convolution_v1-inl.h
+++ b/src/operator/convolution_v1-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file convolution_v1-inl.h
  * \brief
  * \author Bing Xu
diff --git a/src/operator/convolution_v1.cc b/src/operator/convolution_v1.cc
index cb47ed11b5..7de6a34425 100644
--- a/src/operator/convolution_v1.cc
+++ b/src/operator/convolution_v1.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file convolution_v1.cc
  * \brief
  * \author Bing Xu
diff --git a/src/operator/convolution_v1.cu b/src/operator/convolution_v1.cu
index b20b4b2492..bcba77214e 100644
--- a/src/operator/convolution_v1.cu
+++ b/src/operator/convolution_v1.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file convolution_v1.cu
  * \brief
  * \author Bing Xu
diff --git a/src/operator/correlation-inl.h b/src/operator/correlation-inl.h
index 02507cb1d9..1379fe9a55 100644
--- a/src/operator/correlation-inl.h
+++ b/src/operator/correlation-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file correlation-inl.h
  * \brief correlation operator and symbol
  * \author Xu Dong
diff --git a/src/operator/correlation.cc b/src/operator/correlation.cc
index 2522cd45c4..d3323dc80b 100644
--- a/src/operator/correlation.cc
+++ b/src/operator/correlation.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file correlation.cc
  * \brief correlation op
  * \author Xu Dong
diff --git a/src/operator/crop-inl.h b/src/operator/crop-inl.h
index 5a8709633f..b6e49975bd 100644
--- a/src/operator/crop-inl.h
+++ b/src/operator/crop-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file crop-inl.h
  * \brief
  * \author Wei Wu
diff --git a/src/operator/crop.cc b/src/operator/crop.cc
index 8465819903..03af624ac2 100644
--- a/src/operator/crop.cc
+++ b/src/operator/crop.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file crop.cc
  * \brief
  * \author Wei Wu
diff --git a/src/operator/crop.cu b/src/operator/crop.cu
index 0b51b14495..ba0334c353 100644
--- a/src/operator/crop.cu
+++ b/src/operator/crop.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file crop.cu
  * \brief
  * \author Wei Wu
diff --git a/src/operator/cross_device_copy.cc b/src/operator/cross_device_copy.cc
index b32a68d303..08a7d52a9a 100644
--- a/src/operator/cross_device_copy.cc
+++ b/src/operator/cross_device_copy.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file cross_device_copy.cc
  * \brief Special operator that copies NDArray
 */
diff --git a/src/operator/cudnn_activation-inl.h b/src/operator/cudnn_activation-inl.h
index 317ef47c12..75506b3d72 100644
--- a/src/operator/cudnn_activation-inl.h
+++ b/src/operator/cudnn_activation-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file cudnn_activation-inl.h
  * \brief
  * \author Bing Xu
diff --git a/src/operator/cudnn_algoreg-inl.h b/src/operator/cudnn_algoreg-inl.h
index c10593fb0a..fd7b8fa992 100644
--- a/src/operator/cudnn_algoreg-inl.h
+++ b/src/operator/cudnn_algoreg-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file cudnn_algoreg-inl.h
  * \brief
  * \author Bing Xu
diff --git a/src/operator/cudnn_algoreg.cc b/src/operator/cudnn_algoreg.cc
index 5b0e73f0b1..26b3484eb3 100644
--- a/src/operator/cudnn_algoreg.cc
+++ b/src/operator/cudnn_algoreg.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file cudnn_algoreg.cc
  * \brief
  * \author Junyuan Xie
diff --git a/src/operator/cudnn_batch_norm-inl.h b/src/operator/cudnn_batch_norm-inl.h
index bd3c2d6a1c..c231ca3fcd 100644
--- a/src/operator/cudnn_batch_norm-inl.h
+++ b/src/operator/cudnn_batch_norm-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file cudnn_batch_norm-inl.h
  * \brief
  * \author Junyuan Xie
diff --git a/src/operator/cudnn_batch_norm.cc b/src/operator/cudnn_batch_norm.cc
index 28c592b78c..e1e0c999b1 100644
--- a/src/operator/cudnn_batch_norm.cc
+++ b/src/operator/cudnn_batch_norm.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file cudnn_batch_norm.cc
  * \brief
  * \author Junyuan Xie
diff --git a/src/operator/cudnn_batch_norm.cu b/src/operator/cudnn_batch_norm.cu
index c16fc0cac2..e96db2e5e7 100644
--- a/src/operator/cudnn_batch_norm.cu
+++ b/src/operator/cudnn_batch_norm.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file cudnn_batch_norm.cu
  * \brief
  * \author Junyuan Xie
diff --git a/src/operator/cudnn_bilinear_sampler-inl.h b/src/operator/cudnn_bilinear_sampler-inl.h
index 57592dabd8..c2171e6651 100644
--- a/src/operator/cudnn_bilinear_sampler-inl.h
+++ b/src/operator/cudnn_bilinear_sampler-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file cudnn_bilinear_sampler-inl.h
  * \brief
  * \author Xu Dong
diff --git a/src/operator/cudnn_convolution-inl.h b/src/operator/cudnn_convolution-inl.h
index b2b59944e8..148a942bc8 100644
--- a/src/operator/cudnn_convolution-inl.h
+++ b/src/operator/cudnn_convolution-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file cudnn_convolution-inl.h
  * \brief
  * \author Bing Xu
@@ -586,7 +587,7 @@ class CuDNNConvolutionOp : public Operator {
                                        &back_algo_w_)) {
       // Not in algo registry, must determine via *Get*() or *Find*()
       Engine::VarHandle var = Engine::Get()->NewVariable();
-      Engine::Get()->PushSync([=](RunContext rctx) {
+      Engine::Get()->PushAsync([=](RunContext rctx, Engine::CallbackOnComplete on_complete) {
         mshadow::Stream<gpu> *s = rctx.get_stream<gpu>();
         CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
         size_t workspace_byte = static_cast<size_t>(param_.workspace * sizeof(DType));
@@ -776,6 +777,7 @@ class CuDNNConvolutionOp : public Operator {
                                           cudnn_backward_compute_type,
                                           SMArch(ctx.dev_id), this->forward_algo_,
                                           this->back_algo_, this->back_algo_w_);
+        on_complete();
       }, ctx, {}, {var});
       Engine::Get()->WaitForVar(var);
       Engine::Get()->DeleteVariable([](RunContext s) {}, ctx, var);
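The change above swaps a blocking PushSync for PushAsync: the functor now receives an Engine::CallbackOnComplete and must invoke it itself once the algorithm search has finished, otherwise `var` stays busy and the WaitForVar that follows never returns. A minimal sketch of the pattern, reduced from the hunk above (the work inside the lambda is elided):

    // Minimal PushAsync pattern, as used in the hunk above.
    Engine::VarHandle var = Engine::Get()->NewVariable();
    Engine::Get()->PushAsync(
        [=](RunContext rctx, Engine::CallbackOnComplete on_complete) {
          // ... perform the cuDNN algo selection on rctx's stream ...
          on_complete();  // releases var; WaitForVar(var) unblocks after this
        },
        ctx, {}, {var});
    Engine::Get()->WaitForVar(var);
    Engine::Get()->DeleteVariable([](RunContext) {}, ctx, var);

The identical PushSync -> PushAsync conversion is applied to the deconvolution code in the diff that follows.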
diff --git a/src/operator/cudnn_deconvolution-inl.h b/src/operator/cudnn_deconvolution-inl.h
index 5e9b7c5704..6796678a52 100644
--- a/src/operator/cudnn_deconvolution-inl.h
+++ b/src/operator/cudnn_deconvolution-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file cudnn_deconvolution-inl.h
  * \brief
  * \author Wei Wu, Leonard Lausen
@@ -605,7 +606,7 @@ class CuDNNDeconvolutionOp : public Operator {
                                          &back_algo_, &back_algo_w_)) {
       // Not in algo registry, must determine via *Get*() or *Find*()
       Engine::VarHandle var = Engine::Get()->NewVariable();
-      Engine::Get()->PushSync([=](RunContext rctx) {
+      Engine::Get()->PushAsync([=](RunContext rctx, Engine::CallbackOnComplete on_complete) {
         mshadow::Stream <gpu> *s = rctx.get_stream<gpu>();
         CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
         size_t workspace_byte = static_cast<size_t>(param_.workspace * sizeof(DType));
@@ -798,6 +799,7 @@ class CuDNNDeconvolutionOp : public Operator {
                                             cudnn_backward_compute_type,
                                             SMArch(ctx.dev_id), this->forward_algo_,
                                             this->back_algo_, this->back_algo_w_);
+        on_complete();
       }, ctx, {}, {var});
       Engine::Get()->WaitForVar(var);
       Engine::Get()->DeleteVariable([](RunContext s) {}, ctx, var);
diff --git a/src/operator/cudnn_lrn-inl.h b/src/operator/cudnn_lrn-inl.h
index 241ec704a9..3a45fd51ef 100644
--- a/src/operator/cudnn_lrn-inl.h
+++ b/src/operator/cudnn_lrn-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file cudnn_lrn-inl.h
  * \brief
  * \author Bing Xu
diff --git a/src/operator/cudnn_pooling-inl.h b/src/operator/cudnn_pooling-inl.h
index 5b03fe5ee6..b345e0b74f 100644
--- a/src/operator/cudnn_pooling-inl.h
+++ b/src/operator/cudnn_pooling-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file cudnn_pooling-inl.h
  * \brief
  * \author Bing Xu
diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index a260cb4ca0..4bd170cfac 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file cudnn_rnn-inl.h
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/cudnn_softmax_activation-inl.h b/src/operator/cudnn_softmax_activation-inl.h
index c604a8f3f4..b7ab18e1ff 100644
--- a/src/operator/cudnn_softmax_activation-inl.h
+++ b/src/operator/cudnn_softmax_activation-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file cudnn_softmax_activation-inl.h
  * \brief
  * \author Bing Xu
diff --git a/src/operator/cudnn_spatial_transformer-inl.h b/src/operator/cudnn_spatial_transformer-inl.h
index fc76784144..1d7242a83c 100644
--- a/src/operator/cudnn_spatial_transformer-inl.h
+++ b/src/operator/cudnn_spatial_transformer-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file cudnn_spatial_transformer-inl.h
  * \brief
  * \author Wei Wu
diff --git a/src/operator/custom/custom-inl.h b/src/operator/custom/custom-inl.h
index 4b2d620be1..13101da61b 100644
--- a/src/operator/custom/custom-inl.h
+++ b/src/operator/custom/custom-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file custom-inl.h
  * \brief
  * \author Junyuan Xie
diff --git a/src/operator/custom/custom.cc b/src/operator/custom/custom.cc
index 456c39c17b..609f6acd2f 100644
--- a/src/operator/custom/custom.cc
+++ b/src/operator/custom/custom.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file custom.cc
  * \brief
  * \author Junyuan Xie
@@ -212,9 +213,17 @@ std::vector<nnvm::NodeEntry> Gradient(
   }
 
   std::vector<nnvm::NodeEntry> ret;
-  for (index_t i = 0; i < g->num_outputs(); ++i) {
+  for (index_t i = 0; i < params.num_args; ++i) {
     ret.emplace_back(nnvm::NodeEntry{g, i, 0});
   }
+  if (params.num_auxs) {
+    nnvm::NodePtr ng = nnvm::Node::Create();
+    ng->attrs.op = nnvm::Op::Get("_NoGradient");
+    ng->attrs.name = "NoGradient";
+    for (index_t i = 0; i < params.num_auxs; ++i) {
+      ret.emplace_back(nnvm::NodeEntry{ng, 0, 0});
+    }
+  }
 
   return ret;
 }
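The loop bound change above is the functional fix: the gradient vector handed back to nnvm must contain exactly one entry per forward input, i.e. num_args data gradients followed by one placeholder per auxiliary state, which is what the appended _NoGradient node provides. Illustratively (the counts below are made up for the example):

    // For a custom op with num_args = 3 and num_auxs = 2, the returned
    // vector is laid out as:
    //   ret[0..2] = NodeEntry{g,  i, 0}   // gradients from the backward node
    //   ret[3..4] = NodeEntry{ng, 0, 0}   // shared _NoGradient placeholder
    // Auxiliary states are never differentiated, so they get _NoGradient.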
@@ -225,8 +234,8 @@ OpStatePtr CreateState(const NodeAttrs& attrs, Context ctx,
                        const std::vector<int>& in_type) {
   const CustomParam& params = nnvm::get<CustomParam>(attrs.parsed);
 
-  std::vector<uint32_t*> shapes(params.num_args);
-  std::vector<int> ndims(params.num_args);
+  std::vector<uint32_t*> shapes(in_shape.size());
+  std::vector<int> ndims(in_shape.size());
   size_t buff_size = 0;
   for (const auto& i : in_shape) buff_size += i.ndim();
   std::vector<uint32_t> buff(buff_size);
@@ -245,7 +254,7 @@ OpStatePtr CreateState(const NodeAttrs& attrs, Context ctx,
   MXCallbackList *op_info = new MXCallbackList;
   CHECK(reinterpret_cast<CustomOpCreateFunc>(
       params.info->callbacks[kCustomOpPropCreateOperator])(
-          os.str().c_str(), params.num_args, shapes.data(), ndims.data(), in_type.data(),
+          os.str().c_str(), shapes.size(), shapes.data(), ndims.data(), in_type.data(),
           op_info, params.info->contexts[kCustomOpPropCreateOperator]));
 
   CustomParam state = params;
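The CreateState fix in this hunk follows directly: in_shape holds one entry for every input to the op, auxiliary states included, so buffers sized by params.num_args silently dropped the aux shapes. For example (illustrative counts again):

    // With num_args = 2 and num_auxs = 1, in_shape.size() == 3; the shape
    // and ndim buffers must cover all three inputs, and shapes.size(),
    // not params.num_args, is what gets reported to the create callback.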
@@ -342,20 +351,100 @@ void Backward(const OpStatePtr& state,
   Imperative::Get()->set_is_recording(prev_recording);
 }
 
+inline bool BackwardInferStorageType(const nnvm::NodeAttrs& attrs,
+                                     const int dev_mask,
+                                     DispatchMode* dispatch_mode,
+                                     std::vector<int>* iattr,
+                                     std::vector<int>* oattr) {
+  const CustomParam& params = nnvm::get<CustomParam>(attrs.parsed);
+
+  if (params.info->num_callbacks <= kCustomOpPropBackwardInferStorageType) {
+    for (size_t i = 0; i < iattr->size(); i++) {
+      STORAGE_TYPE_ASSIGN_CHECK(*iattr, i, kDefaultStorage);
+    }
+    for (size_t i = 0; i < oattr->size(); i++) {
+      STORAGE_TYPE_ASSIGN_CHECK(*oattr, i, kDefaultStorage);
+    }
+    DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx);
+    return true;
+  }
+
+  std::vector<int> stypes;
+  stypes.reserve(params.num_outs * 2 + params.num_args * 2 + params.num_auxs);
+  for (size_t i = 0; i < iattr->size(); ++i) {
+    stypes.push_back((*iattr)[i]);
+  }
+  for (size_t i = 0; i < oattr->size(); ++i) {
+    stypes.push_back((*oattr)[i]);
+  }
+
+  CHECK(reinterpret_cast<CustomOpBackwardInferStorageTypeFunc>(
+      params.info->callbacks[kCustomOpPropBackwardInferStorageType])(
+      stypes.size(), stypes.data(),
+      params.info->contexts[kCustomOpPropBackwardInferStorageType]));
+  for (size_t i = 0; i < 2 * params.num_outs + params.num_args; ++i) {
+    STORAGE_TYPE_ASSIGN_CHECK(*iattr, i, stypes[i]);
+  }
+  for (size_t i = 0; i < params.num_args; ++i) {
+    STORAGE_TYPE_ASSIGN_CHECK(
+        *oattr, i, stypes[i + 2 * params.num_outs + params.num_args]);
+  }
+  for (size_t i = 0; i < params.num_auxs; ++i) {
+    STORAGE_TYPE_ASSIGN_CHECK(
+        *iattr, i + 2 * params.num_outs + params.num_args,
+        stypes[i + 2 * params.num_outs + 2 * params.num_args]);
+  }
+
+  DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx);
+  return true;
+}
+
 // infer storage function for custom op, which assigns kDefaultStorage for
 // all undefined stypes, and dispatch on DispatchMode::kFComputeEx.
-inline bool InferStorageType(const nnvm::NodeAttrs& attrs,
-                             const int dev_mask,
+inline bool InferStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask,
                              DispatchMode* dispatch_mode,
-                             std::vector<int> *iattr,
-                             std::vector<int> *oattr) {
-  for (int& v : *oattr) {
-    if (v == -1) v = kDefaultStorage;
+                             std::vector<int>* iattr, std::vector<int>* oattr) {
+  const CustomParam& params = nnvm::get<CustomParam>(attrs.parsed);
+
+  if (params.info->num_callbacks <= kCustomOpPropInferStorageType) {
+    for (size_t i = 0; i < iattr->size(); i++) {
+      STORAGE_TYPE_ASSIGN_CHECK(*iattr, i, kDefaultStorage);
+    }
+    for (size_t i = 0; i < oattr->size(); i++) {
+      STORAGE_TYPE_ASSIGN_CHECK(*oattr, i, kDefaultStorage);
+    }
+    DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx);
+    return true;
+  }
+
+  std::vector<int> stypes;
+  stypes.reserve(params.num_args + params.num_outs + params.num_auxs);
+  for (size_t i = 0; i < params.num_args; ++i) {
+    stypes.push_back((*iattr)[i]);
+  }
+  for (const auto& i : *oattr) {
+    stypes.push_back(i);
   }
-  for (int& v : *iattr) {
-    if (v == -1) v = kDefaultStorage;
+  for (size_t i = 0; i < params.num_auxs; ++i) {
+    stypes.push_back((*iattr)[params.num_args + i]);
   }
-  dispatch_mode_assign(dispatch_mode, DispatchMode::kFComputeEx);
+
+  CHECK(reinterpret_cast<CustomOpInferStorageTypeFunc>(
+      params.info->callbacks[kCustomOpPropInferStorageType])(
+      stypes.size(), stypes.data(),
+      params.info->contexts[kCustomOpPropInferStorageType]));
+  for (size_t i = 0; i < params.num_args; ++i) {
+    STORAGE_TYPE_ASSIGN_CHECK(*iattr, i, stypes[i]);
+  }
+  for (size_t i = 0; i < params.num_outs; ++i) {
+    STORAGE_TYPE_ASSIGN_CHECK(*oattr, i, stypes[params.num_args + i]);
+  }
+  for (size_t i = 0; i < params.num_auxs; ++i) {
+    STORAGE_TYPE_ASSIGN_CHECK(*iattr, params.num_args + i,
+                              stypes[params.num_args + params.num_outs + i]);
+  }
+
+  DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx);
   return true;
 }
 
@@ -421,7 +510,7 @@ NNVM_REGISTER_OP(_backward_Custom)
   })
 .set_attr<FStatefulComputeEx>("FStatefulComputeEx<cpu>", Backward)
 .set_attr<FStatefulComputeEx>("FStatefulComputeEx<gpu>", Backward)
-.set_attr<FInferStorageType>("FInferStorageType", InferStorageType);
+.set_attr<FInferStorageType>("FInferStorageType", BackwardInferStorageType);
 
 }  // namespace custom
 }  // namespace op
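Both new inference hooks flatten the storage types into a single vector before invoking the user callback, then scatter the (possibly updated) values back. Reading the indices off the copy-back loops above, the assumed forward layout is:

    // Forward InferStorageType callback vector:
    //   stypes[0 .. num_args)                    data input stypes
    //   stypes[num_args .. num_args + num_outs)  output stypes
    //   stypes[num_args + num_outs .. end)       auxiliary state stypes
    //
    // The backward variant packs the 2*num_outs + num_args backward inputs
    // (output gradients, data, outputs) first; the copy-back loops then take
    // the num_args input gradients and num_auxs aux stypes from the tail.
    // When the corresponding callback is absent, every undefined stype is
    // assigned kDefaultStorage and dispatch falls through to kFComputeEx.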
diff --git a/src/operator/custom/native_op-inl.h b/src/operator/custom/native_op-inl.h
index ebce18611b..d2fb1149f7 100644
--- a/src/operator/custom/native_op-inl.h
+++ b/src/operator/custom/native_op-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file native_op-inl.h
  * \brief
  * \author Junyuan Xie
diff --git a/src/operator/custom/native_op.cc b/src/operator/custom/native_op.cc
index 5dd35049d5..2b15f8484a 100644
--- a/src/operator/custom/native_op.cc
+++ b/src/operator/custom/native_op.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file native_op.cc
  * \brief
  * \author Junyuan Xie
diff --git a/src/operator/custom/native_op.cu b/src/operator/custom/native_op.cu
index ad8d65e3c2..9363214449 100644
--- a/src/operator/custom/native_op.cu
+++ b/src/operator/custom/native_op.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file native_op.cu
  * \brief
  * \author Junyuan Xie
diff --git a/src/operator/custom/ndarray_op-inl.h b/src/operator/custom/ndarray_op-inl.h
index b3a4662b66..20624d2d46 100644
--- a/src/operator/custom/ndarray_op-inl.h
+++ b/src/operator/custom/ndarray_op-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file ndarray_op-inl.h
  * \brief
  * \author Junyuan Xie
diff --git a/src/operator/custom/ndarray_op.cc b/src/operator/custom/ndarray_op.cc
index 48426baea8..9ad0d09e3b 100644
--- a/src/operator/custom/ndarray_op.cc
+++ b/src/operator/custom/ndarray_op.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file ndarray_op.cc
  * \brief
  * \author Junyuan Xie
@@ -84,9 +85,11 @@ void NDArrayOp<xpu>::Forward(const OpContext &ctx,
   }
 
   CHECK(param_.pinfo->forward(ptrs.size(), ptrs.data(), tags.data(), param_.pinfo->p_forward));
-  Engine::Get()->PushSync([ndcpy, ctx](RunContext rctx) {ctx.async_on_complete(); },
-                          ndctx, ndvar, {}, FnProperty::kNormal, 0,
-                          PROFILER_MESSAGE("NDArrayOpForward"));
+  Engine::Get()->PushAsync(
+      [ndcpy, ctx](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+        ctx.async_on_complete();
+        on_complete();
+      }, ndctx, ndvar, {}, FnProperty::kNormal, 0, PROFILER_MESSAGE("NDArrayOpForward"));
 }
 
 template<typename xpu>
@@ -131,9 +134,11 @@ void NDArrayOp<xpu>::Backward(const OpContext &ctx,
   }
 
   CHECK(param_.pinfo->backward(ptrs.size(), ptrs.data(), tags.data(), param_.pinfo->p_backward));
-  Engine::Get()->PushSync([ndcpy, ctx](RunContext rctx){ ctx.async_on_complete(); },
-                          ndctx, ndvar, {}, FnProperty::kNormal, 0,
-                          PROFILER_MESSAGE("NDArrayOpBackward"));
+  Engine::Get()->PushAsync(
+      [ndcpy, ctx](RunContext rctx, Engine::CallbackOnComplete on_complete){
+        ctx.async_on_complete();
+        on_complete();
+      }, ndctx, ndvar, {}, FnProperty::kNormal, 0, PROFILER_MESSAGE("NDArrayOpBackward"));
 }
 
 Operator* NDArrayOpProp::CreateOperator(Context ctx) const {
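The two PushAsync conversions above carry one subtlety worth spelling out: the lambda has to fire two distinct callbacks, and the capture list matters.

    // Roles of the pieces in the lambdas above:
    //   ndcpy capture             keeps the temporary NDArrays alive until
    //                             the engine actually executes the block
    //   ctx.async_on_complete()   signals the executor the async op is done
    //   on_complete()             releases ndvar for dependent operations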
diff --git a/src/operator/deconvolution-inl.h b/src/operator/deconvolution-inl.h
index 41fcf9bfa7..a1e3b906a1 100644
--- a/src/operator/deconvolution-inl.h
+++ b/src/operator/deconvolution-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file deconvolution-inl.h
  * \brief
  * \author Wei Wu
diff --git a/src/operator/deconvolution.cc b/src/operator/deconvolution.cc
index 6a59ff6588..45867f7859 100644
--- a/src/operator/deconvolution.cc
+++ b/src/operator/deconvolution.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file deconvolution.cc
  * \brief
  * \author Wei Wu
diff --git a/src/operator/deconvolution.cu b/src/operator/deconvolution.cu
index de7dff5569..2f0f37b6ec 100644
--- a/src/operator/deconvolution.cu
+++ b/src/operator/deconvolution.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file deconvolution.cu
  * \brief
  * \author Wei Wu
diff --git a/src/operator/dropout-inl.h b/src/operator/dropout-inl.h
index 7fcd7adf86..3071131ad6 100644
--- a/src/operator/dropout-inl.h
+++ b/src/operator/dropout-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file dropout-inl.h
  * \brief
  * \author Bing Xu
diff --git a/src/operator/dropout.cc b/src/operator/dropout.cc
index af65578ec6..bbf5e2dea2 100644
--- a/src/operator/dropout.cc
+++ b/src/operator/dropout.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file dropout.cc
  * \brief
  * \author Bing Xu
diff --git a/src/operator/dropout.cu b/src/operator/dropout.cu
index 5265d8013f..f416c58832 100644
--- a/src/operator/dropout.cu
+++ b/src/operator/dropout.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file dropout.cu
  * \brief
  * \author Bing Xu
diff --git a/src/operator/elemwise_op_common.h b/src/operator/elemwise_op_common.h
index 8216eacd80..e22e23cea5 100644
--- a/src/operator/elemwise_op_common.h
+++ b/src/operator/elemwise_op_common.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+* Copyright (c) 2016 by Contributors
 * \file elemwise_op_common.h
 * \brief common function used for broadcasting and reducing
 * \author Xingjian Shi
diff --git a/src/operator/fully_connected-inl.h b/src/operator/fully_connected-inl.h
index c507e4251f..0fe828aa6e 100644
--- a/src/operator/fully_connected-inl.h
+++ b/src/operator/fully_connected-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file fully_connect_op-inl.h
  * \brief fully connect operator and symbol
 */
diff --git a/src/operator/fully_connected.cc b/src/operator/fully_connected.cc
index 82c32a7d25..9a97816029 100644
--- a/src/operator/fully_connected.cc
+++ b/src/operator/fully_connected.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file fully_connected.cc
  * \brief fully connect operator
 */
diff --git a/src/operator/fully_connected.cu b/src/operator/fully_connected.cu
index 28a0307b70..279a378e2a 100644
--- a/src/operator/fully_connected.cu
+++ b/src/operator/fully_connected.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file fully_connected.cu
  * \brief fully connect operator
 */
diff --git a/src/operator/grid_generator-inl.h b/src/operator/grid_generator-inl.h
index 0be6e7806b..105630cfc2 100644
--- a/src/operator/grid_generator-inl.h
+++ b/src/operator/grid_generator-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file grid_generator-inl.h
  * \brief
  * The operator generate sampling grid
diff --git a/src/operator/grid_generator.cc b/src/operator/grid_generator.cc
index 411f856be0..ea6e66145c 100644
--- a/src/operator/grid_generator.cc
+++ b/src/operator/grid_generator.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file grid_generator.cc
  * \brief
  * \author Xu Dong
diff --git a/src/operator/grid_generator.cu b/src/operator/grid_generator.cu
index 7c0a80258d..b363bea0ad 100644
--- a/src/operator/grid_generator.cu
+++ b/src/operator/grid_generator.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file grid_generator.cu
  * \brief
  * \author Xu Dong
diff --git a/src/operator/identity_attach_KL_sparse_reg-inl.h b/src/operator/identity_attach_KL_sparse_reg-inl.h
index 2307914f62..591ea59563 100644
--- a/src/operator/identity_attach_KL_sparse_reg-inl.h
+++ b/src/operator/identity_attach_KL_sparse_reg-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file sparse_reg-inl.h
  * \brief
 */
diff --git a/src/operator/identity_attach_KL_sparse_reg.cc b/src/operator/identity_attach_KL_sparse_reg.cc
index 5e776774e0..df0919dc6c 100644
--- a/src/operator/identity_attach_KL_sparse_reg.cc
+++ b/src/operator/identity_attach_KL_sparse_reg.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file identity_attach_KL_sparse_reg.cc
  * \brief
 */
diff --git a/src/operator/identity_attach_KL_sparse_reg.cu b/src/operator/identity_attach_KL_sparse_reg.cu
index 0a11fb1673..2ba7916b2b 100644
--- a/src/operator/identity_attach_KL_sparse_reg.cu
+++ b/src/operator/identity_attach_KL_sparse_reg.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file identity_attach_KL_sparse_reg.cu
  * \brief
 */
diff --git a/src/operator/instance_norm-inl.h b/src/operator/instance_norm-inl.h
index 6e78f7628a..258c164450 100644
--- a/src/operator/instance_norm-inl.h
+++ b/src/operator/instance_norm-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file instance_norm-inl.h
  * \brief Reproducing paper Instance Normalization: The Missing Ingredient for
  * Fast Stylization, D. Ulyanov, A. Vedaldi, V. Lempitsky, 2016
diff --git a/src/operator/instance_norm.cc b/src/operator/instance_norm.cc
index 0666b4bd03..9305a74b0e 100644
--- a/src/operator/instance_norm.cc
+++ b/src/operator/instance_norm.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file instance_norm.cc
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/instance_norm.cu b/src/operator/instance_norm.cu
index 9f8cbea797..54e970fd72 100644
--- a/src/operator/instance_norm.cu
+++ b/src/operator/instance_norm.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file instance_norm.cu
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/l2_normalization-inl.h b/src/operator/l2_normalization-inl.h
index c1f17acbbc..cb8e740d7f 100644
--- a/src/operator/l2_normalization-inl.h
+++ b/src/operator/l2_normalization-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file l2_normalization_op-inl.h
  * \brief instance l2 Normalization op
 */
diff --git a/src/operator/l2_normalization.cc b/src/operator/l2_normalization.cc
index 6995a0d1e4..76e64c8d35 100644
--- a/src/operator/l2_normalization.cc
+++ b/src/operator/l2_normalization.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file l2_normalization.cc
  * \brief l2 normalization operator
 */
diff --git a/src/operator/l2_normalization.cu b/src/operator/l2_normalization.cu
index ae76278559..1c1c0e5ed0 100644
--- a/src/operator/l2_normalization.cu
+++ b/src/operator/l2_normalization.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file l2_normalization.cu
  * \brief l2 normalization operator
 */
diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h
index d228e3e67d..ab0ee8295d 100644
--- a/src/operator/leaky_relu-inl.h
+++ b/src/operator/leaky_relu-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file leaky_relu-inl.h
  * \brief leaky relu family operator
  * \author Bing Xu
diff --git a/src/operator/leaky_relu.cc b/src/operator/leaky_relu.cc
index da58bd94bd..aa89089175 100644
--- a/src/operator/leaky_relu.cc
+++ b/src/operator/leaky_relu.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file leaky_relu.cc
  * \brief
  * \author Bing Xu
diff --git a/src/operator/leaky_relu.cu b/src/operator/leaky_relu.cu
index b9b3a7b73f..9de237c573 100644
--- a/src/operator/leaky_relu.cu
+++ b/src/operator/leaky_relu.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file leaky_relu.cu
  * \brief
  * \author Bing Xu
diff --git a/src/operator/loss_binary_op-inl.h b/src/operator/loss_binary_op-inl.h
index 8add827252..1362997231 100644
--- a/src/operator/loss_binary_op-inl.h
+++ b/src/operator/loss_binary_op-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file loss_binary_op-inl.h
  * \brief Loss functions
  */
diff --git a/src/operator/loss_binary_op.cc b/src/operator/loss_binary_op.cc
index d0a77946ff..c1fedb3de6 100644
--- a/src/operator/loss_binary_op.cc
+++ b/src/operator/loss_binary_op.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file loss_binary_op.cc
  * \brief loss function that takes a data and label
 */
diff --git a/src/operator/loss_binary_op.cu b/src/operator/loss_binary_op.cu
index 8694b9f284..74ff563cf3 100644
--- a/src/operator/loss_binary_op.cu
+++ b/src/operator/loss_binary_op.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file loss_binary_op.cu
  * \brief loss function that takes a data and label
 */
diff --git a/src/operator/lrn-inl.h b/src/operator/lrn-inl.h
index 00879435a3..adfe467670 100644
--- a/src/operator/lrn-inl.h
+++ b/src/operator/lrn-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file lrn-inl.h
  * \brief
  * \author Bing Xu
diff --git a/src/operator/lrn.cc b/src/operator/lrn.cc
index 46f4fca486..9b3afd80cd 100644
--- a/src/operator/lrn.cc
+++ b/src/operator/lrn.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file lrn.cc
  * \brief
  * \author Bing Xu
diff --git a/src/operator/lrn.cu b/src/operator/lrn.cu
index 702f4b2fa9..ba872f1d26 100644
--- a/src/operator/lrn.cu
+++ b/src/operator/lrn.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file lrn.cu
  * \brief
  * \author Bing Xu
diff --git a/src/operator/make_loss-inl.h b/src/operator/make_loss-inl.h
index 3f4a99373c..b83e5b9b68 100644
--- a/src/operator/make_loss-inl.h
+++ b/src/operator/make_loss-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file make_loss-inl.h
  * \brief special layer for propagating loss
 */
diff --git a/src/operator/make_loss.cc b/src/operator/make_loss.cc
index 748357d243..14304d3cc2 100644
--- a/src/operator/make_loss.cc
+++ b/src/operator/make_loss.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file make_loss.cc
  * \brief special layer for propagating loss
 */
diff --git a/src/operator/make_loss.cu b/src/operator/make_loss.cu
index 7f508500f5..e1e217e361 100644
--- a/src/operator/make_loss.cu
+++ b/src/operator/make_loss.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file make_loss.cu
  * \brief special layer for propagating loss
 */
diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index 04db326496..a34c11791a 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file mshadow_op.h
  * \brief
  * \author Bing Xu
diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h
index e5c3b51410..c34d9c9083 100644
--- a/src/operator/mxnet_op.h
+++ b/src/operator/mxnet_op.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file mxnet_op.h
  * \brief
  * \author Junyuan Xie
diff --git a/src/operator/nn/im2col.h b/src/operator/nn/im2col.h
index 621b2451a1..256319dd1e 100644
--- a/src/operator/nn/im2col.h
+++ b/src/operator/nn/im2col.h
@@ -67,6 +67,7 @@
  *
  ***************** END Caffe Copyright Notice and Disclaimer ********************
  *
+ * Copyright (c) 2017 by Contributors
  * \file im2col.h
  * \brief Function definitions of converting an image to
  * column matrix based on kernel, padding, and dilation.
diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h
index 3bac865604..67412586c8 100644
--- a/src/operator/nn/pool.h
+++ b/src/operator/nn/pool.h
@@ -67,6 +67,7 @@
  *
  ***************** END Caffe Copyright Notice and Disclaimer ********************
  *
+ * Copyright (c) 2017 by Contributors
  * \file pool.h
  * \brief Function definitions of pooling 1/2/3-D images.
  * We adopted looping 2-D image pixels from Caffe and extended it to 1-D and 3-D cases.
diff --git a/src/operator/nn/softmax-inl.h b/src/operator/nn/softmax-inl.h
index e1150b14f6..2badecf3d0 100644
--- a/src/operator/nn/softmax-inl.h
+++ b/src/operator/nn/softmax-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file softmax-inl.h
  * \brief
 */
diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc
index 58c1a05124..e804c67c07 100644
--- a/src/operator/nn/softmax.cc
+++ b/src/operator/nn/softmax.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file softmax.cc
  * \brief CPU Implementation of softmax
  */
diff --git a/src/operator/nn/softmax.cu b/src/operator/nn/softmax.cu
index d5a843ddc0..4b9c04cdbe 100644
--- a/src/operator/nn/softmax.cu
+++ b/src/operator/nn/softmax.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file softmax.cu
  * \brief GPU Implementation of softmax
  */
diff --git a/src/operator/nnpack/nnpack_convolution-inl.h b/src/operator/nnpack/nnpack_convolution-inl.h
index 4a13426889..0e2c73693d 100644
--- a/src/operator/nnpack/nnpack_convolution-inl.h
+++ b/src/operator/nnpack/nnpack_convolution-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file nnpack_convolution-inl.h
  * \brief
  * \author Carwin
diff --git a/src/operator/nnpack/nnpack_fully_connected-inl.h b/src/operator/nnpack/nnpack_fully_connected-inl.h
index f85ddd89c7..d9412d20d0 100644
--- a/src/operator/nnpack/nnpack_fully_connected-inl.h
+++ b/src/operator/nnpack/nnpack_fully_connected-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file nnpack_fully_connected-inl.h
  * \brief
  * \author Wei Wu
diff --git a/src/operator/nnpack/nnpack_pooling-inl.h b/src/operator/nnpack/nnpack_pooling-inl.h
index 968ead1620..25b4783227 100644
--- a/src/operator/nnpack/nnpack_pooling-inl.h
+++ b/src/operator/nnpack/nnpack_pooling-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file nnpack_pooling-inl.h
  * \brief
  * \author Wei Wu
diff --git a/src/operator/nnpack/nnpack_util.cc b/src/operator/nnpack/nnpack_util.cc
index b873b591fa..7d075e0554 100644
--- a/src/operator/nnpack/nnpack_util.cc
+++ b/src/operator/nnpack/nnpack_util.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file nnpack_util.cc
  * \brief
  * \author Wei Wu
diff --git a/src/operator/nnpack/nnpack_util.h b/src/operator/nnpack/nnpack_util.h
index cde1880257..2edfb79ad4 100644
--- a/src/operator/nnpack/nnpack_util.h
+++ b/src/operator/nnpack/nnpack_util.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file nnpack_util.h
  * \brief
  * \author Carwin
diff --git a/src/operator/operator.cc b/src/operator/operator.cc
index 9117c1c128..6474cd0870 100644
--- a/src/operator/operator.cc
+++ b/src/operator/operator.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file operator.cc
  * \brief operator module of mxnet
  */
diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h
index d036355392..560d11e67e 100644
--- a/src/operator/operator_common.h
+++ b/src/operator/operator_common.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file  operator_common.h
  * \brief common internal header of most operators
  *   this header includes utility functions operator can use
@@ -207,6 +208,10 @@ inline bool dispatch_mode_assign(DispatchMode *y, const DispatchMode& x) {
   return true;
 }
 
+/*! \brief Register op name as an alias */
+#define MXNET_ADD_SPARSE_OP_ALIAS(__name$) \
+  .add_alias("_sparse_" #__name$)
+
 /*!
  * \brief macro assign shape to out if out is unknown otherwise check consistency
  *  Use macro so we can see the error file more clearly
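The new macro is plain chaining sugar: it pastes an add_alias call that registers a second, "_sparse_"-prefixed front-end name for the same operator. Expanded by the preprocessor:

    // MXNET_ADD_SPARSE_OP_ALIAS(sgd_update)  ==>  .add_alias("_sparse_sgd_update")
    //
    // so a registration such as (see optimizer_op.cc below)
    //   NNVM_REGISTER_OP(sgd_update)
    //   MXNET_ADD_SPARSE_OP_ALIAS(sgd_update)
    //   .describe(...) ...
    // makes the op reachable as both "sgd_update" and "_sparse_sgd_update".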
diff --git a/src/operator/operator_util.cc b/src/operator/operator_util.cc
index 25fa209a02..bae3cb6a29 100644
--- a/src/operator/operator_util.cc
+++ b/src/operator/operator_util.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file operator_util.cc
  *  Implementation of operator util.
  */
diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h
index 61b97ba60d..5aad48ed8d 100644
--- a/src/operator/optimizer_op-inl.h
+++ b/src/operator/optimizer_op-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file optimizer_op-inl.h
  * \brief Optimizer operators
  * \author Junyuan Xie
diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc
index 0382070010..080991bb8e 100644
--- a/src/operator/optimizer_op.cc
+++ b/src/operator/optimizer_op.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file optimizer_op.cc
  * \brief Optimizer operators
  * \author Junyuan Xie
@@ -36,6 +37,7 @@ DMLC_REGISTER_PARAMETER(RMSPropAlexParam);
 DMLC_REGISTER_PARAMETER(FtrlParam);
 
 NNVM_REGISTER_OP(sgd_update)
+MXNET_ADD_SPARSE_OP_ALIAS(sgd_update)
 .describe(R"code(Update function for Stochastic Gradient Descent (SDG) optimizer.
 
 It updates the weights using::
@@ -62,6 +64,7 @@ only the row slices whose indices appear in grad.indices are updated::
 .add_arguments(SGDParam::__FIELDS__());
 
 NNVM_REGISTER_OP(sgd_mom_update)
+MXNET_ADD_SPARSE_OP_ALIAS(sgd_mom_update)
 .describe(R"code(Momentum update function for Stochastic Gradient Descent (SDG) optimizer.
 
 Momentum update has better convergence rates on neural networks. Mathematically it looks
@@ -141,6 +144,7 @@ NNVM_REGISTER_OP(mp_sgd_mom_update)
 .add_arguments(SGDMomParam::__FIELDS__());
 
 NNVM_REGISTER_OP(adam_update)
+MXNET_ADD_SPARSE_OP_ALIAS(adam_update)
 .describe(R"code(Update function for Adam optimizer. Adam is seen as a generalization
 of AdaGrad.
 
@@ -280,6 +284,7 @@ to be 0.9 and the learning rate :math:`\eta` to be 0.0001.
 .add_arguments(RMSPropAlexParam::__FIELDS__());
 
 NNVM_REGISTER_OP(ftrl_update)
+MXNET_ADD_SPARSE_OP_ALIAS(ftrl_update)
 .describe(R"code(Update function for Ftrl optimizer.
 Referenced from *Ad Click Prediction: a View from the Trenches*, available at
 http://dl.acm.org/citation.cfm?id=2488200.
diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu
index dd3874f62c..5969d331b4 100644
--- a/src/operator/optimizer_op.cu
+++ b/src/operator/optimizer_op.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file optimizer_op.cu
  * \brief Optimizer operators
  * \author Junyuan Xie
diff --git a/src/operator/pad-inl.h b/src/operator/pad-inl.h
index 80f9e0bf92..520cd124c4 100644
--- a/src/operator/pad-inl.h
+++ b/src/operator/pad-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file pad-inl.h
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/pad.cc b/src/operator/pad.cc
index 468629a436..2332c93b8d 100644
--- a/src/operator/pad.cc
+++ b/src/operator/pad.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file pad.cc
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/pad.cu b/src/operator/pad.cu
index 98220b6c39..54242a485e 100644
--- a/src/operator/pad.cu
+++ b/src/operator/pad.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file pad.cu
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h
index fbc6981a75..caa6717c49 100644
--- a/src/operator/pooling-inl.h
+++ b/src/operator/pooling-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file pooling-inl.h
  * \brief
  * \author Bing Xu, Jun Wu
diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc
index 98a3e076fa..382e17166a 100644
--- a/src/operator/pooling.cc
+++ b/src/operator/pooling.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file pooling.cc
  * \brief
  * \author Bing Xu, Jun Wu
diff --git a/src/operator/pooling.cu b/src/operator/pooling.cu
index 950f099562..d6e093223c 100644
--- a/src/operator/pooling.cu
+++ b/src/operator/pooling.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file pooling.cu
  * \brief
  * \author Bing Xu, Jun Wu
diff --git a/src/operator/pooling_v1-inl.h b/src/operator/pooling_v1-inl.h
index e541298ed2..7070c0dd22 100644
--- a/src/operator/pooling_v1-inl.h
+++ b/src/operator/pooling_v1-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file pooling_v1-inl.h
  * \brief
  * \author Bing Xu
diff --git a/src/operator/pooling_v1.cc b/src/operator/pooling_v1.cc
index 40de745752..5b68a08db6 100644
--- a/src/operator/pooling_v1.cc
+++ b/src/operator/pooling_v1.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file pooling_v1.cc
  * \brief
  * \author Bing Xu
diff --git a/src/operator/pooling_v1.cu b/src/operator/pooling_v1.cu
index 4db22c1842..fccda40513 100644
--- a/src/operator/pooling_v1.cu
+++ b/src/operator/pooling_v1.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file pooling_v1.cu
  * \brief
  * \author Bing Xu
diff --git a/src/operator/random/multisample_op.cc b/src/operator/random/multisample_op.cc
index bacfe0199a..5f2af61f03 100644
--- a/src/operator/random/multisample_op.cc
+++ b/src/operator/random/multisample_op.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file multisample_op.cc
  * \brief CPU-implementation of multi-sampling operators
  */
diff --git a/src/operator/random/multisample_op.h b/src/operator/random/multisample_op.h
index f0851da172..38ccbb6925 100644
--- a/src/operator/random/multisample_op.h
+++ b/src/operator/random/multisample_op.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file sampling_op.h
  * \brief Function definitions of operators for sampling from multiple distributions
  */
diff --git a/src/operator/random/sample_multinomial_op.cc b/src/operator/random/sample_multinomial_op.cc
index 7032a6ec10..a513f9866e 100644
--- a/src/operator/random/sample_multinomial_op.cc
+++ b/src/operator/random/sample_multinomial_op.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file sample_multinomial_op.h
  * \brief Operator for sampling from multinomial distributions
  */
diff --git a/src/operator/random/sample_multinomial_op.cu b/src/operator/random/sample_multinomial_op.cu
index 5b59b2afd5..27f288834a 100644
--- a/src/operator/random/sample_multinomial_op.cu
+++ b/src/operator/random/sample_multinomial_op.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file sample_multinomial_op.h
  * \brief Operator for sampling from multinomial distributions
  */
diff --git a/src/operator/random/sample_multinomial_op.h b/src/operator/random/sample_multinomial_op.h
index 2b016329f3..48b9897aa4 100644
--- a/src/operator/random/sample_multinomial_op.h
+++ b/src/operator/random/sample_multinomial_op.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file sample_multinomial_op.h
  * \brief Operator for sampling from multinomial distributions
  */
diff --git a/src/operator/random/sample_op.cc b/src/operator/random/sample_op.cc
index 5b27afbb69..a2b332456f 100644
--- a/src/operator/random/sample_op.cc
+++ b/src/operator/random/sample_op.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file sample_op.cc
  * \brief CPU Implementation of sample op
  */
diff --git a/src/operator/random/sample_op.cu b/src/operator/random/sample_op.cu
index 5f43a62e49..7a593d0d36 100644
--- a/src/operator/random/sample_op.cu
+++ b/src/operator/random/sample_op.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file sample_op.cu
  * \brief GPU Implementation of sample op
  */
diff --git a/src/operator/random/sample_op.h b/src/operator/random/sample_op.h
index d9c3868533..240825bfff 100644
--- a/src/operator/random/sample_op.h
+++ b/src/operator/random/sample_op.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file sample_op.h
  * \brief Elementary sampling operators
  */
diff --git a/src/operator/regression_output-inl.h b/src/operator/regression_output-inl.h
index 0de312cff8..08b2f0a4a8 100644
--- a/src/operator/regression_output-inl.h
+++ b/src/operator/regression_output-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
 * \file regression_output-inl.h
  * \brief Regression output operator.
  */
diff --git a/src/operator/regression_output.cc b/src/operator/regression_output.cc
index d19f336d2a..0c70a86b26 100644
--- a/src/operator/regression_output.cc
+++ b/src/operator/regression_output.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file regression_output.cc
  * \brief regression output operator
 */
diff --git a/src/operator/regression_output.cu b/src/operator/regression_output.cu
index 64dcef3df6..255b020d20 100644
--- a/src/operator/regression_output.cu
+++ b/src/operator/regression_output.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file regression_output.cu
  * \brief regression output operator
 */
diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
index 28c8d610ce..b4735b8eec 100644
--- a/src/operator/rnn-inl.h
+++ b/src/operator/rnn-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file rnn-inl.h
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc
index 4c7954f3e5..908428b383 100644
--- a/src/operator/rnn.cc
+++ b/src/operator/rnn.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file rnn.cc
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu
index 0daee32abe..59517932b7 100644
--- a/src/operator/rnn.cu
+++ b/src/operator/rnn.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file rnn.cu
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/roi_pooling-inl.h b/src/operator/roi_pooling-inl.h
index 05a0ae41ab..2f83a8ff32 100644
--- a/src/operator/roi_pooling-inl.h
+++ b/src/operator/roi_pooling-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file roi_pooling-inl.h
  * \brief roi pooling operator and symbol
  * \author Kye-Hyeon Kim, Jian Guo
diff --git a/src/operator/roi_pooling.cc b/src/operator/roi_pooling.cc
index a7bd60872a..ced80fb87a 100644
--- a/src/operator/roi_pooling.cc
+++ b/src/operator/roi_pooling.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file roi_pooling.cc
  * \brief roi pooling operator
  * \author Ross Girshick, Kye-Hyeon Kim, Jian Guo
diff --git a/src/operator/roi_pooling.cu b/src/operator/roi_pooling.cu
index 80d38e476f..0f637b0e1b 100644
--- a/src/operator/roi_pooling.cu
+++ b/src/operator/roi_pooling.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file roi_pooling.cu
  * \brief roi pooling operator
  * \author Ross Girshick, Kye-Hyeon Kim, Jian Guo
diff --git a/src/operator/sequence_last-inl.h b/src/operator/sequence_last-inl.h
index b91c24c90b..f71b8cf8e3 100644
--- a/src/operator/sequence_last-inl.h
+++ b/src/operator/sequence_last-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file sequence_last-inl.h
  * \brief
 * \author Sebastian Bodenstein
diff --git a/src/operator/sequence_last.cc b/src/operator/sequence_last.cc
index 6c04bdd34d..d943cd68a3 100644
--- a/src/operator/sequence_last.cc
+++ b/src/operator/sequence_last.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file sequence_last.cc
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/sequence_last.cu b/src/operator/sequence_last.cu
index 9215b2478c..c63369d580 100644
--- a/src/operator/sequence_last.cu
+++ b/src/operator/sequence_last.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file sequence_last.cu
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/sequence_mask-inl.h b/src/operator/sequence_mask-inl.h
index 73e52f4497..7f53a0ba82 100644
--- a/src/operator/sequence_mask-inl.h
+++ b/src/operator/sequence_mask-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
 * \file sequence_mask-inl.h
  * \brief
 * \author Sebastian Bodenstein
diff --git a/src/operator/sequence_mask.cc b/src/operator/sequence_mask.cc
index 61086f7a7a..48a97680b5 100644
--- a/src/operator/sequence_mask.cc
+++ b/src/operator/sequence_mask.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file sequence_mask.cc
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/sequence_mask.cu b/src/operator/sequence_mask.cu
index 69cee24550..b82157a3b6 100644
--- a/src/operator/sequence_mask.cu
+++ b/src/operator/sequence_mask.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file sequence_mask.cu
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/sequence_op_common.h b/src/operator/sequence_op_common.h
index 9ad8353ecb..c2a1165d33 100644
--- a/src/operator/sequence_op_common.h
+++ b/src/operator/sequence_op_common.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file sequence_op_common.h
  * \brief common function used for sequence layers
  * \author Sebastian Bodenstein
diff --git a/src/operator/sequence_reverse-inl.h b/src/operator/sequence_reverse-inl.h
index 7cac10927f..47154011bc 100644
--- a/src/operator/sequence_reverse-inl.h
+++ b/src/operator/sequence_reverse-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*
+ * Copyright (c) 2016 by Contributors
  * \file sequence_reverse-inl.h
  * \brief
 * \author Sebastian Bodenstein
diff --git a/src/operator/sequence_reverse.cc b/src/operator/sequence_reverse.cc
index 61821d3945..9a2f6983f0 100644
--- a/src/operator/sequence_reverse.cc
+++ b/src/operator/sequence_reverse.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file sequence_reverse.cc
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/sequence_reverse.cu b/src/operator/sequence_reverse.cu
index c6cc3f66d0..531fde167c 100644
--- a/src/operator/sequence_reverse.cu
+++ b/src/operator/sequence_reverse.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file sequence_reverse.cu
  * \brief
  * \author Sebastian Bodenstein
diff --git a/src/operator/slice_channel-inl.h b/src/operator/slice_channel-inl.h
index 791b90e570..3b14a26ea6 100644
--- a/src/operator/slice_channel-inl.h
+++ b/src/operator/slice_channel-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file slice_channel-inl.h
  * \brief
  * \author Bing Xu
diff --git a/src/operator/slice_channel.cc b/src/operator/slice_channel.cc
index 7293ba6afc..7c633bb819 100644
--- a/src/operator/slice_channel.cc
+++ b/src/operator/slice_channel.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file slice_channel.cc
  * \brief
  * \author Bing Xu
diff --git a/src/operator/slice_channel.cu b/src/operator/slice_channel.cu
index eb1c9c8b6e..adc8741a5a 100644
--- a/src/operator/slice_channel.cu
+++ b/src/operator/slice_channel.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file slice_channel.cc
  * \brief
  * \author Bing Xu
diff --git a/src/operator/softmax_activation-inl.h b/src/operator/softmax_activation-inl.h
index b1b76930b4..1e65434acf 100644
--- a/src/operator/softmax_activation-inl.h
+++ b/src/operator/softmax_activation-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file softmax_activation-inl.h
  * \brief SoftmaxActivation operator
  * \author Junyuan Xie
diff --git a/src/operator/softmax_activation.cc b/src/operator/softmax_activation.cc
index 115b0a730c..23e4eb89db 100644
--- a/src/operator/softmax_activation.cc
+++ b/src/operator/softmax_activation.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file activation.cc
  * \brief softmax_activation op
  * \author Junyuan Xie
diff --git a/src/operator/softmax_activation.cu b/src/operator/softmax_activation.cu
index 5bebed2846..ad9628bbf2 100644
--- a/src/operator/softmax_activation.cu
+++ b/src/operator/softmax_activation.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file softmax_activation.cu
  * \brief
  * \author Junyuan Xie
diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h
index 7216c76dc2..9a4db2c969 100644
--- a/src/operator/softmax_output-inl.h
+++ b/src/operator/softmax_output-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file softmax_output-inl.h
  * \brief
  * \author Bing Xu
diff --git a/src/operator/softmax_output.cc b/src/operator/softmax_output.cc
index 52bb2a4007..27b3295654 100644
--- a/src/operator/softmax_output.cc
+++ b/src/operator/softmax_output.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file softmax_output.cc
  * \brief
  * \author Bing Xu
diff --git a/src/operator/softmax_output.cu b/src/operator/softmax_output.cu
index 8de5df6655..afcc8f4fc6 100644
--- a/src/operator/softmax_output.cu
+++ b/src/operator/softmax_output.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file softmax_output.cu
  * \brief
  * \author Bing Xu
diff --git a/src/operator/spatial_transformer-inl.h b/src/operator/spatial_transformer-inl.h
index e29ad49c4a..301c55c937 100644
--- a/src/operator/spatial_transformer-inl.h
+++ b/src/operator/spatial_transformer-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file spatial_transformer-inl.h
  * \brief
 *  Reproducing paper: Jaderberg M, Simonyan K, Zisserman A. "Spatial transformer networks"
diff --git a/src/operator/spatial_transformer.cc b/src/operator/spatial_transformer.cc
index 9149bc0e22..78f64a7059 100644
--- a/src/operator/spatial_transformer.cc
+++ b/src/operator/spatial_transformer.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file spatial_transformer.cc
  * \brief
  * \author Wei Wu
diff --git a/src/operator/spatial_transformer.cu b/src/operator/spatial_transformer.cu
index d5e4480dc1..27fe73ee26 100644
--- a/src/operator/spatial_transformer.cu
+++ b/src/operator/spatial_transformer.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2016 by Contributors
  * \file spatial_transformer.cu
  * \brief
  * \author Wei Wu
diff --git a/src/operator/special_functions-inl.h b/src/operator/special_functions-inl.h
index b9460a3e7f..f51cfeec9f 100644
--- a/src/operator/special_functions-inl.h
+++ b/src/operator/special_functions-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file special_functions-inl.h
  * \brief
  * \author Valentin Flunkert
diff --git a/src/operator/svm_output-inl.h b/src/operator/svm_output-inl.h
index 085d2b9d47..9ae0ced7a7 100644
--- a/src/operator/svm_output-inl.h
+++ b/src/operator/svm_output-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file svm_output-inl.h
  * \brief
  * \author Jonas Amaro
diff --git a/src/operator/svm_output.cc b/src/operator/svm_output.cc
index 766968dfaf..c84c2af284 100644
--- a/src/operator/svm_output.cc
+++ b/src/operator/svm_output.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file svm_output.cc
  * \brief
  * \author Jonas Amaro
diff --git a/src/operator/svm_output.cu b/src/operator/svm_output.cu
index 250df9147f..d9501071fd 100644
--- a/src/operator/svm_output.cu
+++ b/src/operator/svm_output.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file svm_output.cu
  * \brief
  * \author Jonas Amaro
diff --git a/src/operator/swapaxis-inl.h b/src/operator/swapaxis-inl.h
index 89c724556b..e4bdfd5a66 100644
--- a/src/operator/swapaxis-inl.h
+++ b/src/operator/swapaxis-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file swapaxis-inl.h
  * \brief
  * \author Ming Zhang
diff --git a/src/operator/swapaxis.cc b/src/operator/swapaxis.cc
index a6c3e8bff0..0b32db7b46 100644
--- a/src/operator/swapaxis.cc
+++ b/src/operator/swapaxis.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file swapaxis.cc
  * \brief
  * \author Ming Zhang
diff --git a/src/operator/swapaxis.cu b/src/operator/swapaxis.cu
index e9b105d71e..e622958a47 100644
--- a/src/operator/swapaxis.cu
+++ b/src/operator/swapaxis.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file swapaxis.cu
  * \brief
  * \author Ming Zhang
diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h
index df4f421157..1bfe68a771 100644
--- a/src/operator/tensor/broadcast_reduce-inl.h
+++ b/src/operator/tensor/broadcast_reduce-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015-2017 by Contributors
  * \file broadcast_reduce_kernel.h
  * \brief Function definition of elementwise unary operators
  */
diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h
index 8e8b0a1fbb..79f7c39c87 100644
--- a/src/operator/tensor/broadcast_reduce_op.h
+++ b/src/operator/tensor/broadcast_reduce_op.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file elementwise_unary_op-inl.h
  * \brief Function definition of elementwise unary operators
  */
diff --git a/src/operator/tensor/broadcast_reduce_op_index.cc b/src/operator/tensor/broadcast_reduce_op_index.cc
index 98cd73607b..dc07e67fb6 100644
--- a/src/operator/tensor/broadcast_reduce_op_index.cc
+++ b/src/operator/tensor/broadcast_reduce_op_index.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file broadcast_reduce_op.cc
  * \brief CPU Implementation of broadcast and reduce functions.
  */
diff --git a/src/operator/tensor/broadcast_reduce_op_index.cu b/src/operator/tensor/broadcast_reduce_op_index.cu
index defa35ea62..0d7b29dfa6 100644
--- a/src/operator/tensor/broadcast_reduce_op_index.cu
+++ b/src/operator/tensor/broadcast_reduce_op_index.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file broadcast_reduce_op.cu
  * \brief GPU Implementation of broadcast and reduce functions.
  */
diff --git a/src/operator/tensor/broadcast_reduce_op_value.cc b/src/operator/tensor/broadcast_reduce_op_value.cc
index 0d376c31e7..29e1f5b051 100644
--- a/src/operator/tensor/broadcast_reduce_op_value.cc
+++ b/src/operator/tensor/broadcast_reduce_op_value.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file broadcast_reduce_op.cc
  * \brief CPU Implementation of broadcast and reduce functions.
  */
@@ -72,7 +73,7 @@ Example::
   data = [[1,2,0],
           [3,0,1],
           [4,1,0]]
- 
+
   csr = cast_storage(data, 'csr')
 
   sum(csr, axis=0)
diff --git a/src/operator/tensor/broadcast_reduce_op_value.cu b/src/operator/tensor/broadcast_reduce_op_value.cu
index 2c216e7898..73c32f09cc 100644
--- a/src/operator/tensor/broadcast_reduce_op_value.cu
+++ b/src/operator/tensor/broadcast_reduce_op_value.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file broadcast_reduce_op.cu
  * \brief GPU Implementation of broadcast and reduce functions.
  */
diff --git a/src/operator/tensor/control_flow_op.cc b/src/operator/tensor/control_flow_op.cc
index bf08fe7e9d..9e1091effe 100644
--- a/src/operator/tensor/control_flow_op.cc
+++ b/src/operator/tensor/control_flow_op.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file control_flow_op.cc
  * \brief CPU Implementation of flow control
  */
diff --git a/src/operator/tensor/control_flow_op.cu b/src/operator/tensor/control_flow_op.cu
index da2c47247f..cc5198ddb1 100644
--- a/src/operator/tensor/control_flow_op.cu
+++ b/src/operator/tensor/control_flow_op.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file control_flow_op.cu
  * \brief
  */
diff --git a/src/operator/tensor/control_flow_op.h b/src/operator/tensor/control_flow_op.h
index c240247202..f1136c8e37 100644
--- a/src/operator/tensor/control_flow_op.h
+++ b/src/operator/tensor/control_flow_op.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file control_flow.h
  * \brief Function definitions of operators for controlling flow
  */
diff --git a/src/operator/tensor/dot-inl.cuh b/src/operator/tensor/dot-inl.cuh
index 2b346bfaf2..c546c4351a 100644
--- a/src/operator/tensor/dot-inl.cuh
+++ b/src/operator/tensor/dot-inl.cuh
@@ -454,13 +454,16 @@ inline void DotCsrDnsDnsImpl(const OpContext& ctx,
                              TBlob* ret) {
   if (kNullOp == req) return;
   CHECK_EQ(lhs.storage_type(), kCSRStorage);
-  if (!lhs.storage_initialized()) return;
+  mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
+  if (!lhs.storage_initialized()) {
+    Fill(s, *ret, req, 0);
+    return;
+  }
 
   using mshadow::cuda::kBaseThreadNum;
   using mxnet_op::Kernel;
   using mxnet_op::set_zero;
   using nnvm::dim_t;
-  mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
 
   const dim_t num_rows_l = lhs.shape()[0];
   const dim_t num_cols_r = rhs.shape_[1];
@@ -587,13 +590,16 @@ inline void DotCsrDnsRspImpl(const OpContext& ctx,
   CHECK_EQ(lhs.storage_type(), kCSRStorage);
   CHECK_EQ(ret->storage_type(), kRowSparseStorage);
   CHECK_EQ(req, kWriteTo);
-  if (!lhs.storage_initialized()) return;
+  mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
+  if (!lhs.storage_initialized()) {
+    FillZerosRspImpl(s, *ret);
+    return;
+  }
 
   using mshadow::Shape1;
   using mxnet_op::Kernel;
   using mxnet_op::set_zero;
   using nnvm::dim_t;
-  mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
 
   const TBlob data_l = lhs.data();
   const TBlob indptr_l = lhs.aux_data(csr::kIndPtr);
@@ -648,6 +654,10 @@ inline void DotCsrDnsRspImpl(const OpContext& ctx,
           dim_t nnr_out = 0;
           CUDA_CALL(cudaMemcpy(&nnr_out, &row_flg_out[num_cols_l-1], sizeof(dim_t),
                                cudaMemcpyDeviceToHost));
+          if (0 == nnr_out) {
+            FillZerosRspImpl(s, *ret);
+            return;
+          }
 
           // Allocate output matrix space
           ret->CheckAndAlloc({Shape1(nnr_out)});
@@ -702,14 +712,17 @@ inline void DotCsrRspRspImpl(const OpContext& ctx,
   CHECK_EQ(lhs.storage_type(), kCSRStorage);
   CHECK_EQ(rhs.storage_type(), kRowSparseStorage);
   CHECK_EQ(ret->storage_type(), kRowSparseStorage);
-  if (!lhs.storage_initialized() || !rhs.storage_initialized()) return;
+  mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
+  if (!lhs.storage_initialized() || !rhs.storage_initialized()) {
+    FillZerosRspImpl(s, *ret);
+    return;
+  }
   CHECK_EQ(req, kWriteTo);
 
   using mshadow::Shape1;
   using mxnet_op::Kernel;
   using mxnet_op::set_zero;
   using nnvm::dim_t;
-  mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
 
   const TBlob data_l = lhs.data();
   const TBlob indptr_l = lhs.aux_data(csr::kIndPtr);
@@ -767,6 +780,10 @@ inline void DotCsrRspRspImpl(const OpContext& ctx,
             dim_t nnr_out = 0;
             CUDA_CALL(cudaMemcpy(&nnr_out, &row_flg_out[num_cols_l-1], sizeof(dim_t),
                                  cudaMemcpyDeviceToHost));
+            if (0 == nnr_out) {
+              FillZerosRspImpl(s, *ret);
+              return;
+            }
 
             // Allocate output matrix space
             ret->CheckAndAlloc({mshadow::Shape1(nnr_out)});
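
A note on the four GPU hunks above, which share one pattern: the old code returned early when the CSR operand held no non-zero entries (`!lhs.storage_initialized()`), leaving the output buffer untouched even though a write was requested, so the caller could observe stale memory. The fix writes an explicit zero result first (`Fill` for dense outputs, `FillZerosRspImpl` for row-sparse ones). A toy model of the corrected control flow, not MXNet code:

    #include <algorithm>
    #include <vector>

    // An output written under kWriteTo semantics may hold stale values, so
    // "nothing to multiply" must still overwrite it with zeros.
    void dot_csr_dns(const std::vector<double>& lhs_vals, std::vector<double>* out) {
      if (lhs_vals.empty()) {                      // analogue of !storage_initialized()
        std::fill(out->begin(), out->end(), 0.0);  // the fix: explicitly write zeros
        return;                                    // early return is now safe
      }
      // ... the actual sparse-dense product would go here ...
    }
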
diff --git a/src/operator/tensor/dot-inl.h b/src/operator/tensor/dot-inl.h
index 26061cbab6..2432703291 100644
--- a/src/operator/tensor/dot-inl.h
+++ b/src/operator/tensor/dot-inl.h
@@ -30,8 +30,10 @@
 #include <algorithm>
 #include <utility>
 #include <type_traits>
+#include "./util/tensor_util-inl.h"
 #include "../mshadow_op.h"
 #include "../elemwise_op_common.h"
+#include "./init_op.h"
 #include "../mxnet_op.h"
 #ifdef __CUDACC__
 #include "./dot-inl.cuh"
@@ -363,19 +365,17 @@ struct DotCsrTransDnsDnsByRowBlocks {
 
 /*!
  * \brief CPU Kernel of dot(csr.T(), dns) = rsp
- * Parallelization by row blocks.
- * This kernel fills up the row_idx array of the rsp
- * with 1 for nonzero rows and 0 for zero rows.
- * The matrix will be compacted after this kernel call.
+ * Parallelization by row blocks which evenly partition the non-zero rows.
  */
 struct DotCsrTransDnsRspByRowBlocks {
   /*!
    * \brief
    * \param i the i-th thread
    */
-  template<typename DType, typename RType, typename IType, typename CType>
+  template<typename DType, typename IType, typename CType, typename RType>
   MSHADOW_CINLINE static void Map(int i,
                                   DType* out,
+                                  nnvm::dim_t* row_flg_sum,
                                   RType* row_idx,
                                   const DType* data_l,
                                   const IType* indptr_l,
@@ -383,21 +383,25 @@ struct DotCsrTransDnsRspByRowBlocks {
                                   const DType* data_r,
                                   const nnvm::dim_t seg_len,
                                   const nnvm::dim_t num_rows_l,
-                                  const nnvm::dim_t num_rows,
+                                  const nnvm::dim_t nnr,
                                   const nnvm::dim_t num_cols) {
     using nnvm::dim_t;
     const dim_t seg_start = i * seg_len;
-    if (seg_start >= num_rows) return;
+    if (seg_start >= nnr) return;
     const dim_t seg_end = (i + 1) * seg_len;
+    const dim_t col_start = row_idx[seg_start];
+    const dim_t col_end = seg_end >= nnr ? (row_idx[nnr-1] + 1) : row_idx[seg_end];
     for (dim_t j = 0; j < num_rows_l; ++j) {
       if (indptr_l[j] == indptr_l[j+1]) continue;
       const dim_t offset_r = j * num_cols;
       for (IType k = indptr_l[j]; k < indptr_l[j+1]; ++k) {
         const CType col_idx = col_idx_l[k];
-        if (col_idx < seg_start || col_idx >= seg_end) continue;
-        const dim_t offset_out = col_idx * num_cols;
-        row_idx[col_idx] = 1;
+        if (col_idx < col_start || col_idx >= col_end) continue;
+
+        const nnvm::dim_t rsp_row = row_flg_sum[col_idx] - 1;
+        const nnvm::dim_t offset_out = rsp_row * num_cols;
         const DType val = data_l[k];
+
         for (dim_t l = 0; l < num_cols; ++l) {
           out[offset_out+l] += data_r[offset_r+l] * val;
         }
@@ -535,11 +539,14 @@ inline void DotCsrDnsDnsImpl(const OpContext& ctx,
                              TBlob* ret) {
   if (kNullOp == req) return;
   CHECK_EQ(lhs.storage_type(), kCSRStorage);
-  if (!lhs.storage_initialized()) return;
+  mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
+  if (!lhs.storage_initialized()) {
+    Fill(s, *ret, req, 0);
+    return;
+  }
 
   using nnvm::dim_t;
 
-  mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
   const TBlob data_l = lhs.data();
   const TBlob indptr_l = lhs.aux_data(csr::kIndPtr);
   const TBlob col_idx_l = lhs.aux_data(csr::kIdx);
@@ -586,52 +593,66 @@ inline void DotCsrDnsRspImpl(const OpContext& ctx,
   if (kNullOp == req) return;
   CHECK_EQ(lhs.storage_type(), kCSRStorage);
   CHECK_EQ(ret->storage_type(), kRowSparseStorage);
-  if (!lhs.storage_initialized()) return;
+  mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
+  if (!lhs.storage_initialized()) {
+    FillZerosRspImpl(s, *ret);
+    return;
+  }
   CHECK_EQ(req, kWriteTo);
 
   using mxnet_op::set_zero;
   using nnvm::dim_t;
 
-  mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
   const TBlob data_l = lhs.data();
   const TBlob indptr_l = lhs.aux_data(csr::kIndPtr);
   const TBlob col_idx_l = lhs.aux_data(csr::kIdx);
   const TBlob& data_r = rhs;
 
-  // pre-allocate spaces for ret using the dense dimension size
-  ret->CheckAndAlloc({mshadow::Shape1(lhs.shape()[1])});
-  const TBlob data_out = ret->data();
-  const TBlob row_idx_out = ret->aux_data(rowsparse::kIdx);
-
   MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, {  // data type
     MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, {  // indptr type
       MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, {  // col idx type
-        MSHADOW_IDX_TYPE_SWITCH(row_idx_out.type_flag_, RType, {  // row idx type
+        MSHADOW_IDX_TYPE_SWITCH(ret->aux_type(rowsparse::kIdx), RType, {  // row idx type
+          const dim_t num_rows = lhs.shape()[1];
+          size_t workspace_size = 2 * (num_rows * sizeof(dim_t));
+          mshadow::Tensor<cpu, 1, char> workspace =
+            ctx.requested[0].get_space_typed<cpu, 1, char>(
+            mshadow::Shape1(workspace_size), s);
+          dim_t* row_flg = reinterpret_cast<dim_t*>(workspace.dptr_);
+          dim_t* prefix_sum = row_flg + num_rows;
+
+          Fill<false>(s, TBlob(row_flg, mshadow::Shape1(num_rows), cpu::kDevMask), kWriteTo, 0);
+          mxnet_op::Kernel<MarkRowFlgKernel, cpu>::Launch(s, lhs.aux_shape(csr::kIdx)[0], row_flg,
+            col_idx_l.dptr<CType>());
+
+          prefix_sum[0] = row_flg[0];
+          for (nnvm::dim_t i = 1; i < num_rows; i++) {
+            prefix_sum[i] = prefix_sum[i - 1] + row_flg[i];
+          }
+          dim_t nnr = prefix_sum[num_rows - 1];
+
+          if (nnr == 0) {
+            FillZerosRspImpl(s, *ret);
+            return;
+          }
+
+          ret->CheckAndAlloc({mshadow::Shape1(nnr)});
+          const TBlob& data_out = ret->data();
+          const TBlob& row_idx = ret->aux_data(rowsparse::kIdx);
+
           dim_t num_threads = data_out.Size();
           mxnet_op::Kernel<set_zero, cpu>::Launch(s, num_threads, data_out.dptr<DType>());
-          RType* row_idx = row_idx_out.dptr<RType>();
-          num_threads = row_idx_out.Size();
-          mxnet_op::Kernel<set_zero, cpu>::Launch(s, num_threads, row_idx);
-          num_threads = mxnet_op::get_num_threads<cpu>(data_out.shape_[0]);
-          dim_t seg_len = (data_out.shape_[0] + num_threads - 1) / num_threads;
+          RType* row_idx_out = row_idx.dptr<RType>();
+
+          mxnet_op::Kernel<FillRspRowIdxKernel, cpu>::Launch(s, num_rows,
+            row_idx_out, prefix_sum, num_rows);
+
+          num_threads = mxnet_op::get_num_threads<cpu>(nnr);
+          dim_t seg_len = (nnr + num_threads - 1) / num_threads;
           if (trans_lhs) {
             mxnet_op::Kernel<DotCsrTransDnsRspByRowBlocks, cpu>::Launch(s, num_threads,
-                data_out.dptr<DType>(), row_idx, data_l.dptr<DType>(),
-                indptr_l.dptr<IType>(), col_idx_l.dptr<CType>(), data_r.dptr<DType>(),
-                seg_len, lhs.shape()[0], data_out.shape_[0], data_out.shape_[1]);
-            dim_t nnr = 0;
-            nnr = mxnet::common::ParallelAccumulate(row_idx, ret->shape()[0], nnr);
-            ret->set_aux_shape(rowsparse::kIdx, mshadow::Shape1(nnr));
-            if (0 == nnr) return;
-            mshadow::Tensor<cpu, 2, DType> rsp_data = data_out.FlatTo2D<cpu, DType>(s);
-            dim_t idx = 0;
-            for (index_t i = 0; i < ret->shape()[0]; ++i) {
-              if (row_idx[i] > 0) {
-                row_idx[idx] = i;
-                mshadow::Copy(rsp_data[idx], rsp_data[i], s);
-                ++idx;
-              }
-            }
+              data_out.dptr<DType>(), prefix_sum, row_idx_out, data_l.dptr<DType>(),
+              indptr_l.dptr<IType>(), col_idx_l.dptr<CType>(), data_r.dptr<DType>(),
+              seg_len, lhs.shape()[0], nnr, ret->shape()[1]);
           } else {
             LOG(FATAL) << "DotCsrDnsRspImpl has not implemented dot(csr, dns)=rsp yet.";
           }
@@ -725,13 +746,16 @@ inline void DotCsrRspRspImpl(const OpContext& ctx,
   CHECK_EQ(lhs.storage_type(), kCSRStorage);
   CHECK_EQ(rhs.storage_type(), kRowSparseStorage);
   CHECK_EQ(ret->storage_type(), kRowSparseStorage);
-  if (!lhs.storage_initialized() || !rhs.storage_initialized()) return;
+  mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
+  if (!lhs.storage_initialized() || !rhs.storage_initialized()) {
+    FillZerosRspImpl(s, *ret);
+    return;
+  }
   CHECK_EQ(req, kWriteTo);
 
   using mxnet_op::set_zero;
   using nnvm::dim_t;
 
-  mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
   const TBlob data_l = lhs.data();
   const TBlob indptr_l = lhs.aux_data(csr::kIndPtr);
   const TBlob col_idx_l = lhs.aux_data(csr::kIdx);
@@ -764,8 +788,11 @@ inline void DotCsrRspRspImpl(const OpContext& ctx,
                 ret->shape()[0], ret->shape()[1], seg_len);
             dim_t nnr = 0;
             nnr = mxnet::common::ParallelAccumulate(row_idx, ret->shape()[0], nnr);
+            if (0 == nnr) {
+              FillZerosRspImpl(s, *ret);
+              return;
+            }
             ret->set_aux_shape(rowsparse::kIdx, mshadow::Shape1(nnr));
-            if (0 == nnr) return;
             mshadow::Tensor<cpu, 2, DType> rsp_data = data_out.FlatTo2D<cpu, DType>(s);
             dim_t idx = 0;
             for (index_t i = 0; i < ret->shape()[0]; ++i) {
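
The rewritten CPU path above also changes the allocation strategy for dot(csr.T, dns) = rsp: instead of allocating a dense-sized result, marking non-zero rows, and compacting afterwards, it marks a 0/1 flag per output row, takes an inclusive prefix sum, reads the non-zero-row count `nnr` off the last element, allocates exactly `nnr` rows, and maps original row `r` to compacted row `prefix_sum[r] - 1`. A self-contained sketch of that index arithmetic (toy code, not the MXNet kernels):

    #include <cstdio>
    #include <vector>

    int main() {
      // row_flg[r] == 1 iff output row r has any non-zero entry
      std::vector<long> row_flg = {1, 0, 1, 1, 0};
      std::vector<long> prefix(row_flg.size());
      prefix[0] = row_flg[0];
      for (size_t i = 1; i < row_flg.size(); ++i)
        prefix[i] = prefix[i - 1] + row_flg[i];      // inclusive prefix sum
      long nnr = prefix.back();                      // 3 non-zero rows
      std::vector<long> row_idx(nnr);                // row_idx array of the rsp output
      for (size_t r = 0; r < row_flg.size(); ++r)
        if (row_flg[r]) row_idx[prefix[r] - 1] = r;  // compacted slot for row r
      std::printf("nnr=%ld row_idx=%ld,%ld,%ld\n", nnr, row_idx[0], row_idx[1], row_idx[2]);
      // prints: nnr=3 row_idx=0,2,3
      return 0;
    }
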
diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h
index 1aab714625..211b567ba1 100644
--- a/src/operator/tensor/elemwise_binary_broadcast_op.h
+++ b/src/operator/tensor/elemwise_binary_broadcast_op.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file elementwise_binary_broadcast_op.h
  * \brief Function definition of elementwise unary operators
  */
@@ -249,8 +250,9 @@ void BinaryBroadcastBackwardUseIn(const nnvm::NodeAttrs& attrs,
                                   const std::vector<OpReqType>& req,
                                   const std::vector<TBlob>& outputs) {
   TShape new_lshape, new_rshape, new_oshape;
-  bool need_bc = BinaryBroadcastShapeCompact(outputs[0].shape_, outputs[1].shape_, inputs[0].shape_,
-                                             &new_lshape, &new_rshape, &new_oshape);
+  const bool need_bc = BinaryBroadcastShapeCompact(outputs[0].shape_,
+                                                   outputs[1].shape_, inputs[0].shape_,
+                                                   &new_lshape, &new_rshape, &new_oshape) != 0;
   if (!need_bc) {
     ElemwiseBinaryOp::BackwardUseIn<xpu, LOP, ROP>(attrs, ctx, inputs, req, outputs);
   } else {
diff --git a/src/operator/tensor/elemwise_binary_broadcast_op_basic.cc b/src/operator/tensor/elemwise_binary_broadcast_op_basic.cc
index 8c97849e20..04281087f0 100644
--- a/src/operator/tensor/elemwise_binary_broadcast_op_basic.cc
+++ b/src/operator/tensor/elemwise_binary_broadcast_op_basic.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cc
  * \brief CPU Implementation of unary function.
  */
diff --git a/src/operator/tensor/elemwise_binary_broadcast_op_basic.cu b/src/operator/tensor/elemwise_binary_broadcast_op_basic.cu
index bf69132cff..dd3c1b2e12 100644
--- a/src/operator/tensor/elemwise_binary_broadcast_op_basic.cu
+++ b/src/operator/tensor/elemwise_binary_broadcast_op_basic.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cu
  * \brief GPU Implementation of unary function.
  */
diff --git a/src/operator/tensor/elemwise_binary_broadcast_op_extended.cc b/src/operator/tensor/elemwise_binary_broadcast_op_extended.cc
index 42da19155e..fe7ad7619d 100644
--- a/src/operator/tensor/elemwise_binary_broadcast_op_extended.cc
+++ b/src/operator/tensor/elemwise_binary_broadcast_op_extended.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cc
  * \brief CPU Implementation of unary function.
  */
diff --git a/src/operator/tensor/elemwise_binary_broadcast_op_extended.cu b/src/operator/tensor/elemwise_binary_broadcast_op_extended.cu
index 2b7cc70b59..27a764cd92 100644
--- a/src/operator/tensor/elemwise_binary_broadcast_op_extended.cu
+++ b/src/operator/tensor/elemwise_binary_broadcast_op_extended.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cu
  * \brief GPU Implementation of unary function.
  */
diff --git a/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc b/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc
index 957b00b5e7..6d74f2d59d 100644
--- a/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc
+++ b/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cc
  * \brief CPU Implementation of unary function.
  */
diff --git a/src/operator/tensor/elemwise_binary_broadcast_op_logic.cu b/src/operator/tensor/elemwise_binary_broadcast_op_logic.cu
index 8673b4f1f1..4e80ae9572 100644
--- a/src/operator/tensor/elemwise_binary_broadcast_op_logic.cu
+++ b/src/operator/tensor/elemwise_binary_broadcast_op_logic.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cu
  * \brief GPU Implementation of unary function.
  */
diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h
index 9c8f180116..d54636c055 100644
--- a/src/operator/tensor/elemwise_binary_op.h
+++ b/src/operator/tensor/elemwise_binary_op.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_op.h
  * \brief Function definition of elementwise binary operators
  */
diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc
index b3be9e4c20..10e7fac5e9 100644
--- a/src/operator/tensor/elemwise_binary_op_basic.cc
+++ b/src/operator/tensor/elemwise_binary_op_basic.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cc
  * \brief CPU Implementation of unary function.
  */
@@ -35,6 +36,7 @@ MXNET_ADD_SPARSE_OP_ALIAS(elemwise_add)
 The storage type of ``elemwise_add`` output depends on storage types of inputs
 
    - elemwise_add(row_sparse, row_sparse) = row_sparse
+   - elemwise_add(csr, csr) = csr
    - otherwise, ``elemwise_add`` generates output with default storage
 
 )code")
@@ -69,7 +71,8 @@ MXNET_ADD_SPARSE_OP_ALIAS(elemwise_sub)
 The storage type of ``elemwise_sub`` output depends on storage types of inputs
 
    - elemwise_sub(row_sparse, row_sparse) = row_sparse
-   - otherwise, ``elemwise_add`` generates output with default storage
+   - elemwise_sub(csr, csr) = csr
+   - otherwise, ``elemwise_sub`` generates output with default storage
 
 )code")
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_sub"});
@@ -100,6 +103,7 @@ The storage type of ``elemwise_mul`` output depends on storage types of inputs
    - elemwise_mul(row_sparse, row_sparse) = row_sparse
    - elemwise_mul(default, row_sparse) = default
    - elemwise_mul(row_sparse, default) = default
+   - elemwise_mul(csr, csr) = csr
    - otherwise, ``elemwise_mul`` generates output with default storage
 
 )code")
@@ -138,7 +142,7 @@ MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(elemwise_div, mshadow::op::div
 MXNET_ADD_SPARSE_OP_ALIAS(elemwise_div)
 .describe(R"code(Divides arguments element-wise.
 
-The storage type of ``elemwise_dev`` output is always dense
+The storage type of ``elemwise_div`` output is always dense
 
 )code")
 .add_alias("_div").add_alias("_Div")
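
The new `elemwise_add(csr, csr) = csr` and `elemwise_sub(csr, csr) = csr` rules work because the result's sparsity pattern is the union of the operands' patterns, which can be built row by row by merging the two sorted column-index lists. A self-contained sketch of that merge for a single row (illustrative only; the real kernel iterates all rows and honors OpReqType):

    #include <vector>

    // Merge one CSR row of A and one of B into the corresponding row of A + B.
    // cols_* are sorted column indices; vals_* are the matching values.
    void add_csr_row(const std::vector<int>& cols_a, const std::vector<double>& vals_a,
                     const std::vector<int>& cols_b, const std::vector<double>& vals_b,
                     std::vector<int>* cols_out, std::vector<double>* vals_out) {
      size_t i = 0, j = 0;
      while (i < cols_a.size() || j < cols_b.size()) {
        if (j == cols_b.size() || (i < cols_a.size() && cols_a[i] < cols_b[j])) {
          cols_out->push_back(cols_a[i]); vals_out->push_back(vals_a[i]); ++i;  // only in A
        } else if (i == cols_a.size() || cols_b[j] < cols_a[i]) {
          cols_out->push_back(cols_b[j]); vals_out->push_back(vals_b[j]); ++j;  // only in B
        } else {                                                                // in both
          cols_out->push_back(cols_a[i]); vals_out->push_back(vals_a[i] + vals_b[j]);
          ++i; ++j;
        }
      }
    }
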
diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu
index d9ca8c6a17..9b55e2fd76 100644
--- a/src/operator/tensor/elemwise_binary_op_basic.cu
+++ b/src/operator/tensor/elemwise_binary_op_basic.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cu
  * \brief GPU Implementation of unary function.
  */
diff --git a/src/operator/tensor/elemwise_binary_op_extended.cc b/src/operator/tensor/elemwise_binary_op_extended.cc
index d0037b9b7e..26d429c533 100644
--- a/src/operator/tensor/elemwise_binary_op_extended.cc
+++ b/src/operator/tensor/elemwise_binary_op_extended.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cc
  * \brief CPU Implementation of unary function.
  */
diff --git a/src/operator/tensor/elemwise_binary_op_extended.cu b/src/operator/tensor/elemwise_binary_op_extended.cu
index 60156214e9..54eceb2f8a 100644
--- a/src/operator/tensor/elemwise_binary_op_extended.cu
+++ b/src/operator/tensor/elemwise_binary_op_extended.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cu
  * \brief GPU Implementation of unary function.
  */
diff --git a/src/operator/tensor/elemwise_binary_op_logic.cc b/src/operator/tensor/elemwise_binary_op_logic.cc
index f88e5a8faa..5d328b56b5 100644
--- a/src/operator/tensor/elemwise_binary_op_logic.cc
+++ b/src/operator/tensor/elemwise_binary_op_logic.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cc
  * \brief CPU Implementation of unary function.
  */
diff --git a/src/operator/tensor/elemwise_binary_op_logic.cu b/src/operator/tensor/elemwise_binary_op_logic.cu
index b9904d1ebc..be5b722045 100644
--- a/src/operator/tensor/elemwise_binary_op_logic.cu
+++ b/src/operator/tensor/elemwise_binary_op_logic.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cu
  * \brief GPU Implementation of unary function.
  */
diff --git a/src/operator/tensor/elemwise_binary_scalar_op.h b/src/operator/tensor/elemwise_binary_scalar_op.h
index 27d8ed343c..cdf14055cf 100644
--- a/src/operator/tensor/elemwise_binary_scalar_op.h
+++ b/src/operator/tensor/elemwise_binary_scalar_op.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.h
  * \brief Function definition of elementwise binary scalar operators
  */
diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc
index 742c142053..2d6662ef2b 100644
--- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc
+++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cc
  * \brief CPU Implementation of unary function.
  */
diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cu b/src/operator/tensor/elemwise_binary_scalar_op_basic.cu
index 32e51d3990..21be0a0e12 100644
--- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cu
+++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cu
  * \brief GPU Implementation of unary function.
  */
diff --git a/src/operator/tensor/elemwise_binary_scalar_op_extended.cc b/src/operator/tensor/elemwise_binary_scalar_op_extended.cc
index cef571d6f3..0f00e4893d 100644
--- a/src/operator/tensor/elemwise_binary_scalar_op_extended.cc
+++ b/src/operator/tensor/elemwise_binary_scalar_op_extended.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cc
  * \brief CPU Implementation of unary function.
  */
diff --git a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu
index 4fb717f8e2..96884724d4 100644
--- a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu
+++ b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cu
  * \brief GPU Implementation of unary function.
  */
diff --git a/src/operator/tensor/elemwise_binary_scalar_op_logic.cc b/src/operator/tensor/elemwise_binary_scalar_op_logic.cc
index 134920cc0c..61f1dd0d1c 100644
--- a/src/operator/tensor/elemwise_binary_scalar_op_logic.cc
+++ b/src/operator/tensor/elemwise_binary_scalar_op_logic.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cc
  * \brief CPU Implementation of unary function.
  */
diff --git a/src/operator/tensor/elemwise_binary_scalar_op_logic.cu b/src/operator/tensor/elemwise_binary_scalar_op_logic.cu
index dcf61c052d..91bcaa8de4 100644
--- a/src/operator/tensor/elemwise_binary_scalar_op_logic.cu
+++ b/src/operator/tensor/elemwise_binary_scalar_op_logic.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file elemwise_binary_scalar_op.cu
  * \brief GPU Implementation of unary function.
  */
diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc
index 771efe5a2f..041a0be007 100644
--- a/src/operator/tensor/elemwise_sum.cc
+++ b/src/operator/tensor/elemwise_sum.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file elemwise_sum.cc
  * \brief elementwise sum operator
 */
diff --git a/src/operator/tensor/elemwise_sum.cu b/src/operator/tensor/elemwise_sum.cu
index 8976c49989..21a80f6264 100644
--- a/src/operator/tensor/elemwise_sum.cu
+++ b/src/operator/tensor/elemwise_sum.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file elemwise_sum.cu
  * \brief elementwise sum operator
 */
diff --git a/src/operator/tensor/elemwise_sum.h b/src/operator/tensor/elemwise_sum.h
index 3d6d725111..acf73e722b 100644
--- a/src/operator/tensor/elemwise_sum.h
+++ b/src/operator/tensor/elemwise_sum.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file elemwise_sum.h
  * \brief elementwise sum
  * \author Bing Xu
diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h
index 6fbde05c46..82ecf4f5ad 100644
--- a/src/operator/tensor/elemwise_unary_op.h
+++ b/src/operator/tensor/elemwise_unary_op.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file elementwise_unary_op.h
  * \brief Function definition of elementwise unary operators
  */
@@ -416,10 +417,6 @@ void CastCompute(const nnvm::NodeAttrs& attrs,
     })                                                              \
   .add_argument("data", "NDArray-or-Symbol", "The input array.")
 
-/*! \brief Register scalar op name as an alias */
-#define MXNET_ADD_SPARSE_OP_ALIAS(__name$) \
-  .add_alias("_sparse_" #__name$)
-
 /*! \brief Unary compute, with FComputeEx for csr and rsp available  */
 #define MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$)                     \
   MXNET_OPERATOR_REGISTER_UNARY(__name$)                                                           \
diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc
index 273ebec488..7d885ad473 100644
--- a/src/operator/tensor/indexing_op.cc
+++ b/src/operator/tensor/indexing_op.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file indexing_op.cc
  * \brief
  * \author Siyi Li, Chi Zhang
@@ -26,6 +27,115 @@
 #include "./indexing_op.h"
 namespace mxnet {
 namespace op {
+
+template<>
+void SparseEmbeddingOpForwardRspImpl<cpu>(mshadow::Stream<cpu>* s,
+                                          const TBlob& data,
+                                          const NDArray& weight,
+                                          const OpReqType req,
+                                          const TBlob& output) {
+  if (req == kNullOp) return;
+  using namespace rowsparse;
+  using namespace mxnet_op;
+  // zeros weight
+  if (req == kWriteTo && !weight.storage_initialized()) {
+    size_t out_size = output.shape_.Size();
+    MSHADOW_TYPE_SWITCH(output.type_flag_, DType, {
+      Fill<false>(s, TBlob(output.dptr<DType>(), mshadow::Shape1(out_size),
+          cpu::kDevMask), kWriteTo, 0);
+    })
+    return;
+  }
+  // check out-of-bound indices
+  bool is_valid = true;
+  MSHADOW_TYPE_SWITCH(data.type_flag_, DType, {
+    DType min = 0;
+    DType max = static_cast<DType>(weight.shape()[0] - 1);
+    // check with single thread is faster since data is small
+    DType* data_ptr = data.dptr<DType>();
+    size_t data_size = data.shape_.Size();
+    for (size_t i = 0; i < data_size; i++) {
+      if (data_ptr[i] > max || data_ptr[i] < min) is_valid = false;
+    }
+  })
+  CHECK(is_valid) << "SparseEmbedding input contains data out of bound";
+  // the weight is actually dense
+  if (weight.aux_shape(kIdx)[0] == weight.shape()[0]) {
+    EmbeddingOpForwardDnsImpl<cpu>(s, data, weight.data(), req, output);
+  } else {
+    EmbeddingOpForwardRspImpl<cpu>(s, data, weight, req, output);
+  }
+}
+
+
+template<>
+inline void SparseEmbeddingOpBackwardRspImpl<cpu>(const OpContext& ctx,
+                                                  const TBlob& ograd,
+                                                  const TBlob& data,
+                                                  const OpReqType req,
+                                                  const NDArray& output) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  using namespace mshadow::expr;
+  using namespace rowsparse;
+  using nnvm::dim_t;
+  if (req == kNullOp) return;
+  CHECK_EQ(req, kWriteTo) << "SparseEmbedding layer doesn't support "
+                          << "weight gradient calculation with req != write";
+
+  // Request temporary storage for marking non-zero rows and prefix sum
+  Stream<cpu> *s = ctx.get_stream<cpu>();
+  dim_t num_rows = output.shape()[0];
+  dim_t row_length = output.shape()[1];
+  // TODO(haibin) request less storage to save space in the future
+  size_t workspace_size = 2 * (num_rows * sizeof(dim_t));
+  Tensor<cpu, 1, char> workspace =
+    ctx.requested[embedding::kTempSpace].get_space_typed<cpu, 1, char>(
+      Shape1(workspace_size), s);
+  dim_t* row_flg = reinterpret_cast<dim_t*>(workspace.dptr_);
+  dim_t* prefix_sum = row_flg + num_rows;
+  dim_t data_size = static_cast<dim_t>(data.shape_.Size());
+
+  MSHADOW_TYPE_SWITCH(data.type_flag_, IType, {
+    MSHADOW_SGL_DBL_TYPE_SWITCH(ograd.type_flag_, DType, {
+      MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, {
+        // mark row flags
+        Fill<false>(s, TBlob(row_flg, Shape1(num_rows), cpu::kDevMask), kWriteTo, 0);
+        Kernel<MarkRowFlgKernel, cpu>::Launch(s, data_size, row_flg, data.dptr<IType>());
+        // calculate inclusive prefix sum
+        // TODO(haibin) ideally this is should be done in parallel
+        prefix_sum[0] = row_flg[0];
+        for (dim_t i = 1; i < num_rows; i++) {
+          prefix_sum[i] = prefix_sum[i - 1] + row_flg[i];
+        }
+        // total number of non-zero rows
+        dim_t nnr = prefix_sum[num_rows - 1];
+        if (nnr == 0) {
+          FillZerosRspImpl(s, output);
+          return;
+        }
+        output.CheckAndAlloc({Shape1(nnr)});
+        RType* grad_row_idx = output.aux_data(kIdx).dptr<RType>();
+        // fill row_idx array of output matrix, using the row_flg values
+        Kernel<FillRspRowIdxKernel, cpu>::Launch(s, num_rows,
+            grad_row_idx, prefix_sum, num_rows);
+        // prefill with zeros
+        DType* grad_data = output.data().dptr<DType>();
+        Fill<false>(s, TBlob(grad_data, Shape1(nnr * row_length),
+            cpu::kDevMask), kWriteTo, 0);
+        // add the final gradients
+        const int num_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+        dim_t segment_len = (nnr + num_threads - 1) / num_threads;
+        Kernel<AddTakeGradRspKernel, cpu>::Launch(s, num_threads, grad_data, prefix_sum,
+                                                  ograd.dptr<DType>(), row_length,
+                                                  data.dptr<IType>(), data_size, segment_len,
+                                                  num_rows);
+      });
+    });
+  });
+}
+
+
 DMLC_REGISTER_PARAMETER(EmbeddingParam);
 DMLC_REGISTER_PARAMETER(TakeParam);
 DMLC_REGISTER_PARAMETER(OneHotParam);
@@ -116,8 +226,7 @@ The storage type of weight must be `row_sparse`, and the gradient of the weight
 .. Note::
 
     `SparseEmbedding` is designed for the use case where `input_dim` is very large (e.g. 100k).
-    The `row_sparse` weight cannot be used in a `BucketingModule`.
-    The operator is only available on CPU.
+    The operator is available on both CPU and GPU.
 
 Examples::
 
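
The new CPU backward above produces a row-sparse weight gradient with the same prefix-sum machinery: every index that occurs in `data` contributes its output-gradient row to compacted gradient row `prefix_sum[idx] - 1`, and duplicate indices accumulate into the same row. A toy sketch of that scatter-add (illustrative names, not MXNet's API):

    #include <vector>

    // grad: nnr x row_length, zero-initialized beforehand.
    // ograd: data.size() x row_length output gradients, row-major.
    void scatter_add_grad(const std::vector<long>& data,        // looked-up indices
                          const std::vector<long>& prefix_sum,  // inclusive sum of row flags
                          const std::vector<double>& ograd,
                          long row_length,
                          std::vector<double>* grad) {
      for (size_t i = 0; i < data.size(); ++i) {
        long rsp_row = prefix_sum[data[i]] - 1;                 // compacted row for this index
        for (long l = 0; l < row_length; ++l)
          (*grad)[rsp_row * row_length + l] += ograd[i * row_length + l];
      }
    }
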
diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu
index 2cddd006a6..f029f02099 100644
--- a/src/operator/tensor/indexing_op.cu
+++ b/src/operator/tensor/indexing_op.cu
@@ -18,20 +18,177 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file indexing_op.cu
  * \brief
  * \author Siyi Li, Chi Zhang
 */
 
 #include "./indexing_op.h"
+#include "./util/tensor_util-inl.cuh"
+
 namespace mxnet {
 namespace op {
+
+/*! \brief If there are out-of-bound indices, out will be assigned to 1.
+ */
+
+struct is_valid_check {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, int32_t* out, const DType* data,
+                                  const DType min, const DType max) {
+    if (data[i] < min || data[i] > max) *out = 1;
+  }
+};
+
+
+struct AddTakeGradRspGPUKernel {
+  template<typename DType, typename IType>
+  __device__ __forceinline__ static void Map(int tid,
+                                             DType* out,
+                                             const nnvm::dim_t* prefix_sum,
+                                             const IType* data,
+                                             const DType* ograd,
+                                             const nnvm::dim_t row_length) {
+    using nnvm::dim_t;
+    const dim_t data_i = tid / row_length;
+    const dim_t grad_i = tid % row_length;
+    const dim_t irow = static_cast<dim_t>(data[data_i]);
+    const dim_t rsp_row = prefix_sum[irow] - 1;
+    const DType val = ograd[data_i * row_length + grad_i];
+    atomicAdd(static_cast<DType *>(&(out[rsp_row*row_length+grad_i])), val);
+  }
+};
+
+template<>
+void SparseEmbeddingOpForwardRspImpl<gpu>(mshadow::Stream<gpu>* s,
+                                          const TBlob& data,
+                                          const NDArray& weight,
+                                          const OpReqType req,
+                                          const TBlob& output) {
+  if (req == kNullOp) return;
+  using namespace rowsparse;
+  using namespace mxnet_op;
+  // zeros weight
+  if (req == kWriteTo && !weight.storage_initialized()) {
+    size_t out_size = output.shape_.Size();
+    MSHADOW_TYPE_SWITCH(output.type_flag_, DType, {
+      Fill<false>(s, TBlob(output.dptr<DType>(), mshadow::Shape1(out_size),
+          gpu::kDevMask), kWriteTo, 0);
+    })
+    return;
+  }
+  // check out-of-bound indices
+  int32_t is_valid = 0;
+  MSHADOW_TYPE_SWITCH(data.type_flag_, DType, {
+    DType min = 0;
+    DType max = static_cast<DType>(weight.shape()[0] - 1);
+    DType* data_ptr = data.dptr<DType>();
+    size_t data_size = data.shape_.Size();
+    int32_t* is_valid_ptr = NULL;
+    CUDA_CALL(cudaMalloc(&is_valid_ptr, sizeof(int32_t)));
+    Kernel<set_zero, gpu>::Launch(s, 1, is_valid_ptr);
+    Kernel<is_valid_check, gpu>::Launch(s, data_size, is_valid_ptr, data_ptr, min, max);
+    CUDA_CALL(cudaMemcpy(&is_valid, is_valid_ptr, sizeof(int32_t),
+              cudaMemcpyDeviceToHost));
+    CUDA_CALL(cudaFree(is_valid_ptr));  // release the temporary flag buffer
+  })
+  CHECK_EQ(is_valid, 0) << "SparseEmbedding input contains data out of bound";
+  // the weight is actually dense
+  if (weight.aux_shape(kIdx)[0] == weight.shape()[0]) {
+    EmbeddingOpForwardDnsImpl<gpu>(s, data, weight.data(), req, output);
+  } else {
+    EmbeddingOpForwardRspImpl<gpu>(s, data, weight, req, output);
+  }
+}
+
+
+template<>
+inline void SparseEmbeddingOpBackwardRspImpl<gpu>(const OpContext& ctx,
+                                                  const TBlob& ograd,
+                                                  const TBlob& data,
+                                                  const OpReqType req,
+                                                  const NDArray& output) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  using namespace mshadow::expr;
+  using namespace rowsparse;
+  using nnvm::dim_t;
+  if (req == kNullOp) return;
+  CHECK_EQ(req, kWriteTo) << "SparseEmbedding layer doesn't support "
+                          << "weight gradient calculation with req != write";
+
+  // Request temporary storage for marking non-zero rows and prefix sum
+  Stream<gpu> *s = ctx.get_stream<gpu>();
+  dim_t num_rows = output.shape()[0];
+  dim_t row_length = output.shape()[1];
+  dim_t data_size = static_cast<dim_t>(data.shape_.Size());
+  dim_t num_threads;
+
+  MSHADOW_TYPE_SWITCH(data.type_flag_, IType, {
+    MSHADOW_SGL_DBL_TYPE_SWITCH(ograd.type_flag_, DType, {
+      MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, {
+        dim_t* prefix_sum = NULL;
+        void* d_temp_storage = NULL;
+        size_t temp_storage_bytes = 0;
+        cub::DeviceScan::InclusiveSum(d_temp_storage,
+                                      temp_storage_bytes,
+                                      prefix_sum,
+                                      prefix_sum,
+                                      num_rows,
+                                      Stream<gpu>::GetStream(s));
+        Tensor<gpu, 1, char> workspace = ctx.requested[0]
+            .get_space_typed<gpu, 1, char>(Shape1(num_rows * sizeof(dim_t) +
+                                           temp_storage_bytes), s);
+        prefix_sum = reinterpret_cast<dim_t*>(workspace.dptr_);
+        d_temp_storage = workspace.dptr_ + num_rows*sizeof(dim_t);
+        num_threads = num_rows;
+        Fill<false>(s, TBlob(prefix_sum, Shape1(num_threads), gpu::kDevMask), kWriteTo, 0);
+        Kernel<MarkRowFlgKernel, gpu>::Launch(s, data_size, prefix_sum, data.dptr<IType>());
+
+        cub::DeviceScan::InclusiveSum(d_temp_storage,
+                                      temp_storage_bytes,
+                                      prefix_sum,
+                                      prefix_sum,
+                                      num_rows,
+                                      mshadow::Stream<gpu>::GetStream(s));
+        dim_t nnr = 0;
+        CUDA_CALL(cudaMemcpy(&nnr, &prefix_sum[num_rows-1], sizeof(dim_t),
+            cudaMemcpyDeviceToHost));
+
+        if (nnr == 0) {
+          FillZerosRspImpl(s, output);
+          return;
+        }
+        output.CheckAndAlloc({Shape1(nnr)});
+        RType* grad_row_idx = output.aux_data(kIdx).dptr<RType>();
+        // fill row_idx array of output matrix, using the row_flg values
+        Kernel<FillRspRowIdxKernel, gpu>::Launch(s, num_rows,
+            grad_row_idx, prefix_sum, num_rows);
+        // prefill with zeros
+        DType* grad_data = output.data().dptr<DType>();
+        Fill<false>(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask),
+            kWriteTo, 0);
+        // add the final gradients
+        num_threads = row_length * data_size;
+        Kernel<AddTakeGradRspGPUKernel, gpu>::Launch(s, num_threads, grad_data, prefix_sum,
+            data.dptr<IType>(), ograd.dptr<DType>(), row_length);
+      });
+    });
+  });
+}
+
 NNVM_REGISTER_OP(Embedding)
 .set_attr<FCompute>("FCompute<gpu>", EmbeddingOpForward<gpu>);
 
+NNVM_REGISTER_OP(_contrib_SparseEmbedding)
+.set_attr<FComputeEx>("FComputeEx<gpu>", SparseEmbeddingOpForwardEx<gpu>);
+
 NNVM_REGISTER_OP(_backward_Embedding)
 .set_attr<FCompute>("FCompute<gpu>", EmbeddingOpBackward<gpu>);
 
+NNVM_REGISTER_OP(_backward_SparseEmbedding)
+.set_attr<FComputeEx>("FComputeEx<gpu>", SparseEmbeddingOpBackwardEx<gpu>);
+
 NNVM_REGISTER_OP(take)
 .set_attr<FCompute>("FCompute<gpu>", TakeOpForward<gpu>);
 
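The GPU backward pass above compacts the dense row space into a row-sparse
gradient in three steps: mark every row referenced by the input indices, take
an inclusive prefix sum over the flags, then read the number of non-zero rows
(nnr) from the last prefix-sum entry and scatter the surviving row ids. A
minimal CPU-side sketch of the same compaction idea, standalone C++ and not
part of the patch:

    #include <cstdint>
    #include <vector>

    // Compact the rows referenced by 'data' (values in [0, num_rows)) into a
    // row-sparse index array, mirroring MarkRowFlgKernel + InclusiveSum +
    // FillRspRowIdxKernel above.
    std::vector<int64_t> CompactRows(const std::vector<int64_t>& data,
                                     int64_t num_rows) {
      std::vector<int64_t> prefix_sum(num_rows, 0);
      for (int64_t idx : data) prefix_sum[idx] = 1;   // mark referenced rows
      for (int64_t i = 1; i < num_rows; ++i)          // inclusive prefix sum
        prefix_sum[i] += prefix_sum[i - 1];
      std::vector<int64_t> row_idx(prefix_sum[num_rows - 1]);  // nnr entries
      for (int64_t i = 0; i < num_rows; ++i) {
        // row i survives iff its flag raised the prefix sum at position i
        if (prefix_sum[i] > (i == 0 ? 0 : prefix_sum[i - 1]))
          row_idx[prefix_sum[i] - 1] = i;
      }
      return row_idx;
    }

On the GPU the scan runs through cub::DeviceScan::InclusiveSum, which is why
the code above calls it twice: the first call with a null buffer only queries
temp_storage_bytes, and the second performs the actual scan.
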
diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h
index 684794bcd9..b0f06de9ae 100644
--- a/src/operator/tensor/indexing_op.h
+++ b/src/operator/tensor/indexing_op.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file indexing_op.h
  * \brief
  * \author Bing Xu, Siyi Li, Chi Zhang
@@ -187,9 +188,8 @@ inline bool SparseEmbeddingOpForwardStorageType(const nnvm::NodeAttrs& attrs,
   const int& weight_stype = in_attrs->at(embedding::kWeight);
   int& out_stype = out_attrs->at(embedding::kOut);
   bool dispatched = false;
-  const bool invalid_ctx = dev_mask != mshadow::cpu::kDevMask;
   if (!dispatched && data_stype == kDefaultStorage &&
-      weight_stype == kRowSparseStorage && !invalid_ctx) {
+      weight_stype == kRowSparseStorage) {
     // dns, rsp -> dns
     dispatched = storage_type_assign(&out_stype, kDefaultStorage,
                                      dispatch_mode, DispatchMode::kFComputeEx);
@@ -215,9 +215,8 @@ inline bool SparseEmbeddingOpBackwardStorageType(const nnvm::NodeAttrs& attrs,
   int& data_grad_stype = out_attrs->at(0);
   int& weight_grad_stype = out_attrs->at(1);
   bool dispatched = false;
-  const bool invalid_ctx = dev_mask != mshadow::cpu::kDevMask;
   if (!dispatched && ograd_stype == kDefaultStorage &&
-      data_stype == kDefaultStorage && !invalid_ctx) {
+      data_stype == kDefaultStorage) {
     // dns, dns -> dns, rsp
     if (type_assign(&data_grad_stype, kDefaultStorage) &&
         type_assign(&weight_grad_stype, kRowSparseStorage) &&
@@ -336,8 +335,8 @@ struct TakeRspKernel {
   }
 };
 
-inline void EmbeddingOpForwardRspImpl(mshadow::Stream<mshadow::cpu>* s,
-                                      const cpu& cpu_dev,
+template<typename xpu>
+inline void EmbeddingOpForwardRspImpl(mshadow::Stream<xpu>* s,
                                       const TBlob& data,
                                       const NDArray& weight,
                                       const OpReqType req,
@@ -351,7 +350,7 @@ inline void EmbeddingOpForwardRspImpl(mshadow::Stream<mshadow::cpu>* s,
           size_t data_size = data.shape_.Size();
           // only using the second dim since weight.ndim() == 2
           const nnvm::dim_t row_length = weight.shape()[1];
-          Kernel<TakeRspKernel<req_t>, cpu>::Launch(s, data_size, data.dptr<IType>(),
+          Kernel<TakeRspKernel<req_t>, xpu>::Launch(s, data_size, data.dptr<IType>(),
                                                     output.dptr<DType>(),
                                                     weight.aux_data(kIdx).dptr<RType>(),
                                                     weight.data().dptr<DType>(),
@@ -369,39 +368,7 @@ void SparseEmbeddingOpForwardRspImpl(mshadow::Stream<xpu>* s,
                                      const TBlob& data,
                                      const NDArray& weight,
                                      const OpReqType req,
-                                     const TBlob& output) {
-  if (req == kNullOp) return;
-  CHECK((std::is_same<xpu, mshadow::cpu>::value)) << "SparseEmbedding is only implemented for CPU";
-  using namespace rowsparse;
-  using namespace mxnet_op;
-  // zeros weight
-  if (req == kWriteTo && !weight.storage_initialized()) {
-    size_t out_size = output.shape_.Size();
-    MSHADOW_TYPE_SWITCH(output.type_flag_, DType, {
-      Kernel<set_zero, xpu>::Launch(s, out_size, output.dptr<DType>());
-    })
-    return;
-  }
-  // check out-of-bound indices
-  bool is_valid = true;
-  MSHADOW_TYPE_SWITCH(data.type_flag_, DType, {
-    DType min = 0;
-    DType max = static_cast<DType>(weight.shape()[0] - 1);
-    // check with single thread is faster since data is small
-    DType* data_ptr = data.dptr<DType>();
-    size_t data_size = data.shape_.Size();
-    for (size_t i = 0; i < data_size; i++) {
-      if (data_ptr[i] > max || data_ptr[i] < min) is_valid = false;
-    }
-  })
-  CHECK(is_valid) << "SparseEmbedding input contains data out of bound";
-  // the weight is actually dense
-  if (weight.aux_shape(kIdx)[0] == weight.shape()[0]) {
-    EmbeddingOpForwardDnsImpl(s, data, weight.data(), req, output);
-  } else {
-    EmbeddingOpForwardRspImpl(s, xpu(), data, weight, req, output);
-  }
-}
+                                     const TBlob& output);
 
 template<typename xpu>
 void EmbeddingOpForward(const nnvm::NodeAttrs& attrs,
@@ -603,71 +570,12 @@ struct AddTakeGradRspKernel {
   }
 };
 
+template<typename xpu>
 inline void SparseEmbeddingOpBackwardRspImpl(const OpContext& ctx,
-                                             const cpu& cpu_dev,
                                              const TBlob& ograd,
                                              const TBlob& data,
                                              const OpReqType req,
-                                             const NDArray& output) {
-  using namespace mshadow;
-  using namespace mxnet_op;
-  using namespace mshadow::expr;
-  using namespace rowsparse;
-  using nnvm::dim_t;
-  if (req == kNullOp) return;
-  CHECK_EQ(req, kWriteTo) << "SparseEmbedding layer doesn't support "
-                          << "weight gradient calculation with req != write";
-
-  // Request temporary storage for marking non-zero rows and prefix sum
-  Stream<cpu> *s = ctx.get_stream<cpu>();
-  dim_t num_rows = output.shape()[0];
-  dim_t row_length = output.shape()[1];
-  // TODO(haibin) request less storage to save space in the future
-  size_t workspace_size = 2 * (num_rows * sizeof(dim_t));
-  Tensor<cpu, 1, char> workspace =
-    ctx.requested[embedding::kTempSpace].get_space_typed<cpu, 1, char>(
-      Shape1(workspace_size), s);
-  dim_t* row_flg = reinterpret_cast<dim_t*>(workspace.dptr_);
-  dim_t* prefix_sum = row_flg + num_rows;
-  dim_t data_size = static_cast<dim_t>(data.shape_.Size());
-
-  MSHADOW_TYPE_SWITCH(data.type_flag_, IType, {
-    MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, {
-      MSHADOW_TYPE_SWITCH(output.aux_type(kIdx), RType, {
-        // mark row flags
-        Fill<false>(s, TBlob(row_flg, mshadow::Shape1(num_rows), cpu::kDevMask), kWriteTo, 0);
-        Kernel<MarkRowFlgKernel, cpu>::Launch(s, data_size, row_flg, data.dptr<IType>());
-        // calculate inclusive prefix sum
-        // TODO(haibin) ideally this is should be done in parallel
-        prefix_sum[0] = row_flg[0];
-        for (dim_t i = 1; i < num_rows; i++) {
-          prefix_sum[i] = prefix_sum[i - 1] + row_flg[i];
-        }
-        // total number of non-zero rows
-        dim_t nnr = prefix_sum[num_rows - 1];
-        if (nnr == 0) {
-          FillZerosRspImpl(s, output);
-          return;
-        }
-        output.CheckAndAlloc({Shape1(nnr)});
-        RType* grad_row_idx = output.aux_data(kIdx).dptr<RType>();
-        // fill row_idx array of output matrix, using the row_flg values
-        Kernel<FillRspRowIdxKernel, cpu>::Launch(s, num_rows,
-               grad_row_idx, prefix_sum, num_rows);
-        // prefill with zeros
-        DType* grad_data = output.data().dptr<DType>();
-        Kernel<set_zero, cpu>::Launch(s, nnr * row_length, grad_data);
-        // add the final gradients
-        const int num_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
-        dim_t segment_len = (nnr + num_threads - 1) / num_threads;
-        Kernel<AddTakeGradRspKernel, cpu>::Launch(s, num_threads, grad_data, prefix_sum,
-                                                  ograd.dptr<DType>(), row_length,
-                                                  data.dptr<IType>(), data_size, segment_len,
-                                                  num_rows);
-      });
-    });
-  });
-}
+                                             const NDArray& output);
 
 template<typename xpu>
 void SparseEmbeddingOpBackwardEx(const nnvm::NodeAttrs& attrs,
@@ -687,8 +595,8 @@ void SparseEmbeddingOpBackwardEx(const nnvm::NodeAttrs& attrs,
           << "SparseEmbedding layer doesn't support calculate data gradient";
   if (data.storage_type() == kDefaultStorage && ograd.storage_type() == kDefaultStorage &&
       weight_grad.storage_type() == kRowSparseStorage) {
-    SparseEmbeddingOpBackwardRspImpl(ctx, xpu(), ograd.data(), data.data(),
-                                     req[embedding::kWeight], weight_grad);
+    SparseEmbeddingOpBackwardRspImpl<xpu>(ctx, ograd.data(), data.data(),
+                                          req[embedding::kWeight], weight_grad);
   } else {
     LOG(FATAL) << "Not implemented: " << operator_string(attrs, ctx, inputs, req, outputs);
   }
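The header change above strips the CPU-only bodies down to bare declarations
so that each backend can supply its own full specialization: the gpu versions
appear in indexing_op.cu earlier in this diff, and the cpu versions
presumably move into indexing_op.cc. A minimal sketch of that dispatch
pattern, with illustrative names and signatures:

    // header (sketch): declare the primary template, define no body
    struct cpu {};  // stand-ins for the mshadow::cpu / mshadow::gpu tags
    struct gpu {};

    template <typename xpu>
    void SparseEmbeddingFwdImpl(const float* data, float* out);

    // .cc file (sketch): CPU specialization, built by the host compiler
    template <>
    void SparseEmbeddingFwdImpl<cpu>(const float* data, float* out) {
      // serial / OpenMP path
    }

    // .cu file (sketch): GPU specialization, built by nvcc
    template <>
    void SparseEmbeddingFwdImpl<gpu>(const float* data, float* out) {
      // CUDA kernel launches
    }

Each translation unit sees only the declaration plus its own specialization,
so CUDA-only code never has to compile in the host-only build.
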
diff --git a/src/operator/tensor/init_op.cc b/src/operator/tensor/init_op.cc
index 52e488acd9..5be4a2f421 100644
--- a/src/operator/tensor/init_op.cc
+++ b/src/operator/tensor/init_op.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file init_op.cc
  * \brief CPU Implementation of init op
  */
diff --git a/src/operator/tensor/init_op.cu b/src/operator/tensor/init_op.cu
index 4c16a94353..aeea2895b0 100644
--- a/src/operator/tensor/init_op.cu
+++ b/src/operator/tensor/init_op.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file init_op.cu
  * \brief GPU Implementation of init op
  */
diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h
index c621f6e9a6..1d30c88011 100644
--- a/src/operator/tensor/init_op.h
+++ b/src/operator/tensor/init_op.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file init_op.h
  * \brief Function definition of initialization op
  */
diff --git a/src/operator/tensor/la_op.cc b/src/operator/tensor/la_op.cc
index 58a15ae1e1..7083efe2f1 100644
--- a/src/operator/tensor/la_op.cc
+++ b/src/operator/tensor/la_op.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file la_op.cc
  * \brief CPU-Operators for advanced linear algebra.
  */
diff --git a/src/operator/tensor/la_op.h b/src/operator/tensor/la_op.h
index a3231391b6..3d411b2d71 100644
--- a/src/operator/tensor/la_op.h
+++ b/src/operator/tensor/la_op.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file la_op.h
  * \brief Operators for advanced linear algebra.
  */
diff --git a/src/operator/tensor/la_op_inline.h b/src/operator/tensor/la_op_inline.h
index f372162a83..a508eb7736 100644
--- a/src/operator/tensor/la_op_inline.h
+++ b/src/operator/tensor/la_op_inline.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file la_op_inline.h
  * \brief Operators for advanced linear algebra.
  */
diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h
index 1f25b94ff3..367f8de053 100644
--- a/src/operator/tensor/matrix_op-inl.h
+++ b/src/operator/tensor/matrix_op-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file matrix_op-inl.h
  * \brief Function definition of matrix related operators
  */
@@ -502,7 +503,7 @@ struct SliceDimTwoCsrAssign {
   /*!
    * \brief This function slices a CSRNDArray on axis one between begin_col and end_col
    * \param i           loop index
-   * \param out_idx     output csr ndarray column indices    
+   * \param out_idx     output csr ndarray column indices
    * \param out_data    output csr ndarray data
    * \param out_indptr  output csr ndarray row index pointer
    * \param in_idx      input csr ndarray column indices
diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index cba9efd1a9..1bba69a439 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file matrix_op.cc
  * \brief CPU Implementation of matrix operations
  */
@@ -247,7 +248,7 @@ will return a new array with shape ``(2,1,3,4)``.
 .add_arguments(ExpandDimParam::__FIELDS__());
 
 NNVM_REGISTER_OP(slice)
-.add_alias("_sparse_slice")
+MXNET_ADD_SPARSE_OP_ALIAS(slice)
 .add_alias("crop")
 .describe(R"code(Slices a region of the array.
 
@@ -395,6 +396,7 @@ NNVM_REGISTER_OP(_backward_slice_axis)
 .set_attr<FCompute>("FCompute<cpu>", SliceAxisGrad_<cpu>);
 
 NNVM_REGISTER_OP(clip)
+MXNET_ADD_SPARSE_OP_ALIAS(clip)
 .describe(R"code(Clips (limits) the values in an array.
 
 Given an interval, values outside the interval are clipped to the interval edges.
diff --git a/src/operator/tensor/matrix_op.cu b/src/operator/tensor/matrix_op.cu
index 237b87296c..30eaf23b10 100644
--- a/src/operator/tensor/matrix_op.cu
+++ b/src/operator/tensor/matrix_op.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file matrix_op.cu
  * \brief GPU Implementation of matrix operations
  */
@@ -47,7 +48,7 @@ NNVM_REGISTER_OP(_backward_slice)
 NNVM_REGISTER_OP(_slice_assign)
 .set_attr<FCompute>("FCompute<gpu>", SliceAssignOpForward<gpu>);
 
-NNVM_REGISTER_OP(_crop_assign_scalar)
+NNVM_REGISTER_OP(_slice_assign_scalar)
 .set_attr<FCompute>("FCompute<gpu>", SliceAssignScalarOpForward<gpu>);
 
 NNVM_REGISTER_OP(slice_axis)
diff --git a/src/operator/tensor/ordering_op-inl.h b/src/operator/tensor/ordering_op-inl.h
index b491b64cc9..606406dfe0 100644
--- a/src/operator/tensor/ordering_op-inl.h
+++ b/src/operator/tensor/ordering_op-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
  * \file ordering_op-inl.h
  * \brief Function definition of matrix related operators
  */
diff --git a/src/operator/tensor/ordering_op.cc b/src/operator/tensor/ordering_op.cc
index 22712a82b4..ebd7c62ec8 100644
--- a/src/operator/tensor/ordering_op.cc
+++ b/src/operator/tensor/ordering_op.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2016 by Contributors
- * \file ordering.cc
+ * \file ordering_op.cc
  * \brief CPU Implementation of the ordering operations
  */
diff --git a/src/operator/tensor/ordering_op.cu b/src/operator/tensor/ordering_op.cu
index 8e40b4a350..d78361448d 100644
--- a/src/operator/tensor/ordering_op.cu
+++ b/src/operator/tensor/ordering_op.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
- * \file matrix_op.cu
- * \brief GPU Implementation of matrix operations
+ * \file ordering_op.cu
+ * \brief GPU Implementation of the ordering operations
  */
diff --git a/src/operator/tensor/sort_op.h b/src/operator/tensor/sort_op.h
index a0425a5afe..3fa95bb660 100644
--- a/src/operator/tensor/sort_op.h
+++ b/src/operator/tensor/sort_op.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2017 by Contributors
  * \file sort_op.h
  * \brief SortByKey function
  */
diff --git a/src/operator/tensor/square_sum-inl.h b/src/operator/tensor/square_sum-inl.h
index 7ce5b1e1b0..a052ad96cf 100644
--- a/src/operator/tensor/square_sum-inl.h
+++ b/src/operator/tensor/square_sum-inl.h
@@ -179,7 +179,7 @@ struct SquareSumRspKernel<req, 1, true> {
   }
 };
 
-template<int req, int axis, int ograd_stype = kDefaultStorage>
+template<int req, int axis, int ograd_stype = kDefaultStorage, bool is_data_full_rsp = false>
 struct SquareSumRspGradKernel;
 
 template<int req>
@@ -224,11 +224,10 @@ struct SquareSumRspGradKernel<req, 1> {
 
 /*!
  * Note: This kernel assumes that the ograd and in_data
- * are all rsp and have equal row_idx array, or
- * in_data is a full rsp.
+ * are all rsp and have equal row_idx array.
  */
 template<int req>
-struct SquareSumRspGradKernel<req, 1, kRowSparseStorage> {
+struct SquareSumRspGradKernel<req, 1, kRowSparseStorage, false> {
   /*!
    * \param i index of igrad.data()
    * \param in_grad_row_idx row_idx of the gradient of the op's input
@@ -243,10 +242,36 @@ struct SquareSumRspGradKernel<req, 1, kRowSparseStorage> {
                                   const DType* in_data, const int64_t num_cols) {
     const int64_t row = i / num_cols;
     in_grad_row_idx[row] = out_grad_row_idx[row];
-    KERNEL_ASSIGN(in_grad[i], req, 2*in_data[i]*out_grad[row]);
+    KERNEL_ASSIGN(in_grad[i], req, 2 * in_data[i] * out_grad[row]);
   }
 };
 
+/*!
+ * Note: This kernel assumes that the ograd and in_data
+ * are all rsp and in_data is a full rsp.
+ */
+template<int req>
+struct SquareSumRspGradKernel<req, 1, kRowSparseStorage, true> {
+  /*!
+   * \param i index of igrad.data()
+   * \param in_grad_row_idx row_idx of the gradient of the op's input
+   * \param in_grad gradient of the op's input
+   * \param out_grad_row_idx row_idx of the gradient of the op's output
+   * \param out_grad gradient of the op's output
+   * \param in_data op's input
+   */
+  template<typename IType, typename DType>
+  MSHADOW_XINLINE static void Map(int i, IType* in_grad_row_idx, DType* in_grad,
+                                  const IType* out_grad_row_idx, const DType* out_grad,
+                                  const DType* in_data, const int64_t num_cols) {
+    const int64_t row = i / num_cols;
+    const int64_t row_dns = out_grad_row_idx[row];
+    in_grad_row_idx[row] = row_dns;
+    KERNEL_ASSIGN(in_grad[i], req, 2 * in_data[row_dns * num_cols + i % num_cols] * out_grad[row]);
+  }
+};
+
+
 template<typename xpu>
 void SquareSumRspImpl(const nnvm::NodeAttrs& attrs,
                       mshadow::Stream<xpu>* s,
@@ -334,6 +359,12 @@ void SquareSumRspImpl(const nnvm::NodeAttrs& attrs,
   }
 }
 
+/*!\brief
+ * This function only supports the following three situations:
+ * 1. ograd is a dns and input is an rsp
+ * 2. ograd and input are both rsp and have the same row_idx array
+ * 3. ograd and input are both rsp and input is a full rsp
+ */
 template<typename xpu>
 void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs,
                           mshadow::Stream<xpu>* s,
@@ -350,23 +381,21 @@ void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(input.storage_type(), kRowSparseStorage);
   CHECK_EQ(igrad->storage_type(), kRowSparseStorage);
   CHECK_EQ(req, kWriteTo);
-  if (!input.storage_initialized()) {
+  if (!input.storage_initialized()
+      || (ograd.storage_type() == kRowSparseStorage && !ograd.storage_initialized())) {
     FillZerosRspImpl(s, *igrad);
     return;
   }
 
   using namespace mxnet_op;
-  // TODO(junwu) change the input of CheckAndAlloc
-  // if we want to support differen row idx arrays
-  // for ograd and input when they are both row-sparse ndarrays
-  igrad->CheckAndAlloc({input.aux_shape(rowsparse::kIdx)});
   const int64_t num_cols = input.storage_shape()[1];
-  const TBlob& igrad_data = igrad->data();
-  const TBlob igrad_row_idx = igrad->aux_data(rowsparse::kIdx);
   const TBlob& ograd_data = ograd.data();
   const TBlob& in_data = input.data();
   const TBlob in_row_idx = input.aux_data(rowsparse::kIdx);
   if (ograd.storage_type() == kDefaultStorage) {
+    igrad->CheckAndAlloc({input.aux_shape(rowsparse::kIdx)});
+    const TBlob& igrad_data = igrad->data();
+    const TBlob igrad_row_idx = igrad->aux_data(rowsparse::kIdx);
     if (0 == param.axis[0]) {  // forward is sum per column
       MSHADOW_TYPE_SWITCH(igrad_data.type_flag_, DType, {
         MSHADOW_IDX_TYPE_SWITCH(igrad_row_idx.type_flag_, IType, {
@@ -396,18 +425,28 @@ void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs,
     CHECK_EQ(ograd.shape().ndim(), 2U);
     const TBlob ograd_row_idx = ograd.aux_data(rowsparse::kIdx);
     CHECK(ograd_row_idx.Size() == in_row_idx.Size() || in_row_idx.Size() == in_data.shape_[0]);
+    igrad->CheckAndAlloc({ograd.aux_shape(rowsparse::kIdx)});
+    const TBlob& igrad_data = igrad->data();
+    const TBlob igrad_row_idx = igrad->aux_data(rowsparse::kIdx);
     MSHADOW_IDX_TYPE_SWITCH(igrad_row_idx.type_flag_, IType, {
       if (std::is_same<xpu, cpu>::value) {
-        const IType* first1 = ograd_row_idx.dptr<IType>();
-        const IType* last1 = first1 + ograd_row_idx.Size();
-        const IType* first2 = in_row_idx.dptr<IType>();
         // when ograd_row_idx and in_row_idx have the same size and input is not a full rsp
         // ograd_row_idx and in_row_idx are expected to have the same elements
-        if (ograd_row_idx.Size() == in_row_idx.Size() && in_row_idx.Size() != in_data.shape_[0]) {
+        if (in_row_idx.Size() != input.shape()[0]) {  // if input data is not a full rsp
+          CHECK_EQ(ograd_row_idx.Size(), in_row_idx.Size()) << "SquareSumRspGradImpl only supports"
+                                                               " equal ograd_row_idx and"
+                                                               " input_row_idx when ograd and"
+                                                               " input are both row-sparse and"
+                                                               " input data is not a full"
+                                                               " row-sparse matrix";
+          const IType* first1 = ograd_row_idx.dptr<IType>();
+          const IType* last1 = first1 + ograd_row_idx.Size();
+          const IType* first2 = in_row_idx.dptr<IType>();
           CHECK(std::equal(first1, last1, first2)) << "SquareSumRspGradImpl only supports"
                                                       " equal ograd_row_idx and input_row_idx"
                                                       " when ograd and input are both"
-                                                      " row-sparse";
+                                                      " row-sparse and input data is not a full"
+                                                      " row-sparse matrix";
         }
       } else {
         LOG(FATAL) << "SquareSumRspGradImpl has not implemented GPU version when"
@@ -415,10 +454,17 @@ void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs,
       }
       MSHADOW_TYPE_SWITCH(igrad_data.type_flag_, DType, {
         MXNET_ASSIGN_REQ_SWITCH(req, req_type, {
-          Kernel<SquareSumRspGradKernel<req_type, 1, kRowSparseStorage>, xpu>::Launch(
-              s, igrad_data.Size(), igrad_row_idx.dptr<IType>(),
-              igrad_data.dptr<DType>(), ograd_row_idx.dptr<IType>(),
-              ograd_data.dptr<DType>(), in_data.dptr<DType>(), num_cols);
+          if (in_row_idx.Size() != input.shape()[0]) {  // input data is not a full rsp
+            Kernel<SquareSumRspGradKernel<req_type, 1, kRowSparseStorage, false>, xpu>::Launch(
+                s, igrad_data.Size(), igrad_row_idx.dptr<IType>(),
+                igrad_data.dptr<DType>(), ograd_row_idx.dptr<IType>(),
+                ograd_data.dptr<DType>(), in_data.dptr<DType>(), num_cols);
+          } else {  // input data is a full rsp
+            Kernel<SquareSumRspGradKernel<req_type, 1, kRowSparseStorage, true>, xpu>::Launch(
+                s, igrad_data.Size(), igrad_row_idx.dptr<IType>(),
+                igrad_data.dptr<DType>(), ograd_row_idx.dptr<IType>(),
+                ograd_data.dptr<DType>(), in_data.dptr<DType>(), num_cols);
+          }
         })
       })
     })
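To make the new full-rsp kernel concrete: for output element i, with
r = i / num_cols and c = i % num_cols, the gradient gathers from the input at
the dense row named by ograd's index array, i.e.
in_grad[i] = 2 * in_data[out_grad_row_idx[r] * num_cols + c] * out_grad[r].
For example, with num_cols = 2 and out_grad_row_idx = {0, 3}, element i = 3
has r = 1 and c = 1, so it reads in_data[3 * 2 + 1]. This indirection is
exactly what the equal-row-idx kernel above can skip, since there in_data and
ograd share the same row numbering.
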
diff --git a/src/operator/upsampling-inl.h b/src/operator/upsampling-inl.h
index 77ea13bd6c..bac9709f4e 100644
--- a/src/operator/upsampling-inl.h
+++ b/src/operator/upsampling-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file upsampling-inl.h
  * \brief
  * \author Bing Xu
diff --git a/src/operator/upsampling.cc b/src/operator/upsampling.cc
index 653b5709f1..8942e35ab3 100644
--- a/src/operator/upsampling.cc
+++ b/src/operator/upsampling.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
- * \file upsampling_nearest.cc
+ * \file upsampling.cc
  * \brief
  * \author Bing Xu
diff --git a/src/operator/upsampling.cu b/src/operator/upsampling.cu
index 8152535233..f83535a2b2 100644
--- a/src/operator/upsampling.cu
+++ b/src/operator/upsampling.cu
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
- * \file upsampling_nearest.cc
+ * \file upsampling.cu
  * \brief
  * \author Bing Xu
diff --git a/src/optimizer/sgd-inl.h b/src/optimizer/sgd-inl.h
index 01a330bece..3c0224d280 100644
--- a/src/optimizer/sgd-inl.h
+++ b/src/optimizer/sgd-inl.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file sgd-inl.h
  * \brief Operator interface of mxnet.
  * \author Junyuan Xie
diff --git a/src/resource.cc b/src/resource.cc
index 4c2dbee33f..d1038dc57c 100644
--- a/src/resource.cc
+++ b/src/resource.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file resource.cc
  * \brief Implementation of resource manager.
  */
@@ -112,14 +113,14 @@ class ResourceManagerImpl : public ResourceManager {
 
   // request resources
   Resource Request(Context ctx, const ResourceRequest &req) override {
-    if (ctx.dev_mask() == cpu::kDevMask) {
+    if (ctx.dev_mask() == Context::kCPU) {
       switch (req.type) {
         case ResourceRequest::kRandom: return cpu_rand_->resource;
         case ResourceRequest::kTempSpace: return cpu_space_->GetNext();
         default: LOG(FATAL) << "Unknown supported type " << req.type;
       }
     } else {
-      CHECK_EQ(ctx.dev_mask(), gpu::kDevMask);
+      CHECK_EQ(ctx.dev_mask(), Context::kGPU);
 #if MSHADOW_USE_CUDA
       switch (req.type) {
         case ResourceRequest::kRandom: {
@@ -186,9 +187,11 @@ class ResourceManagerImpl : public ResourceManager {
     inline void Seed(uint32_t global_seed) {
       uint32_t seed = ctx.dev_id + global_seed * kRandMagic;
       mshadow::Random<xpu> *r = prnd;
-      Engine::Get()->PushSync([r, seed](RunContext rctx) {
+      Engine::Get()->PushAsync(
+        [r, seed](RunContext rctx, Engine::CallbackOnComplete on_complete) {
           r->set_stream(rctx.get_stream<xpu>());
           r->Seed(seed);
+          on_complete();
         }, ctx, {}, {resource.var},
         FnProperty::kNormal, 0, PROFILER_MESSAGE("ResourceRandomSetSeed"));
     }
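The resource.cc change migrates the seed operation from PushSync to
PushAsync, threading an explicit completion callback through the function.
Calling on_complete() at the end, as here, reproduces the synchronous
behaviour; the value of the async form is that completion may also be
deferred past the function's return. Roughly, PushSync(fn, ...) behaves like
the following adapter (a sketch of the relationship, not engine source):

    Engine::Get()->PushAsync(
        [fn](RunContext rctx, Engine::CallbackOnComplete on_complete) {
          fn(rctx);        // the old synchronous body
          on_complete();   // completion is now explicit and may be deferred
        }, ctx, const_vars, mutable_vars,
        FnProperty::kNormal, 0, PROFILER_MESSAGE("MyOp"));
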
diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h
index ead00dafbf..f0dd61f01a 100644
--- a/src/storage/cpu_device_storage.h
+++ b/src/storage/cpu_device_storage.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file cpu_device_storage.h
  * \brief CPU storage implementation.
  */
diff --git a/src/storage/cpu_shared_storage_manager.h b/src/storage/cpu_shared_storage_manager.h
new file mode 100644
index 0000000000..d623cf2c7b
--- /dev/null
+++ b/src/storage/cpu_shared_storage_manager.h
@@ -0,0 +1,174 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef MXNET_STORAGE_CPU_SHARED_STORAGE_MANAGER_H_
+#define MXNET_STORAGE_CPU_SHARED_STORAGE_MANAGER_H_
+
+#if MXNET_USE_CUDA
+  #include <cuda_runtime.h>
+#endif  // MXNET_USE_CUDA
+#include <mxnet/base.h>
+
+#ifndef _WIN32
+#include <sys/mman.h>
+#include <sys/fcntl.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#endif  // _WIN32
+
+#include <unordered_map>
+#include <vector>
+#include <atomic>
+#include <iostream>
+#include <mutex>
+#include <new>
+#include <string>
+#include <limits>
+#include <random>
+#include <sstream>
+#include <cstring>
+#include <cerrno>
+
+#include "./storage_manager.h"
+#include "../common/cuda_utils.h"
+
+
+namespace mxnet {
+namespace storage {
+/*!
+ * \brief Storage manager for cpu shared memory
+ */
+class CPUSharedStorageManager final : public StorageManager {
+ public:
+  /*!
+   * \brief Default constructor.
+   */
+  CPUSharedStorageManager() : rand_gen_(std::random_device()()) {}
+  /*!
+   * \brief Default destructor.
+   */
+  ~CPUSharedStorageManager() {
+    for (const auto& kv : pool_) {
+      FreeImpl(kv.second);
+    }
+  }
+
+  void Alloc(Storage::Handle* handle) override;
+  void Free(Storage::Handle handle) override {
+    pool_.erase(handle.dptr);
+    FreeImpl(handle);
+  }
+
+  void DirectFree(Storage::Handle handle) override {
+    Free(handle);
+  }
+
+  void IncrementRefCount(const Storage::Handle& handle) {
+    std::atomic<int>* counter = reinterpret_cast<std::atomic<int>*>(
+        static_cast<char*>(handle.dptr) - alignment_);
+    ++(*counter);
+  }
+
+  int DecrementRefCount(const Storage::Handle& handle) {
+    std::atomic<int>* counter = reinterpret_cast<std::atomic<int>*>(
+        static_cast<char*>(handle.dptr) - alignment_);
+    return --(*counter);
+  }
+
+ private:
+  static constexpr size_t alignment_ = 16;
+
+  std::mutex mutex_;
+  std::mt19937 rand_gen_;
+  std::unordered_map<void*, Storage::Handle> pool_;
+
+  void FreeImpl(const Storage::Handle& handle);
+
+  std::string SharedHandleToString(int shared_pid, int shared_id) {
+    std::stringstream name;
+    name << "/mx_" << std::hex << shared_pid << "_" << std::hex << shared_id;
+    return name.str();
+  }
+  DISALLOW_COPY_AND_ASSIGN(CPUSharedStorageManager);
+};  // class CPUSharedStorageManager
+
+void CPUSharedStorageManager::Alloc(Storage::Handle* handle) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  std::uniform_int_distribution<> dis(0, std::numeric_limits<int>::max());
+  int fid = -1;
+  bool is_new = false;
+  size_t size = handle->size + alignment_;
+  void* ptr = nullptr;
+#ifdef _WIN32
+  LOG(FATAL) << "Shared memory is not supported on Windows yet.";
+#else
+  if (handle->shared_id == -1 && handle->shared_pid == -1) {
+    is_new = true;
+    handle->shared_pid = getpid();
+    for (int i = 0; i < 10; ++i) {
+      handle->shared_id = dis(rand_gen_);
+      auto filename = SharedHandleToString(handle->shared_pid, handle->shared_id);
+      fid = shm_open(filename.c_str(), O_EXCL|O_CREAT|O_RDWR, 0666);
+      if (fid != -1) break;
+    }
+  } else {
+    auto filename = SharedHandleToString(handle->shared_pid, handle->shared_id);
+    fid = shm_open(filename.c_str(), O_RDWR, 0666);
+  }
+
+  if (fid == -1) {
+    LOG(FATAL) << "Failed to open shared memory. shm_open failed with error "
+               << strerror(errno);
+  }
+
+  if (is_new) CHECK_EQ(ftruncate(fid, size), 0);
+
+  ptr = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fid, 0);
+  CHECK_NE(ptr, MAP_FAILED)
+      << "Failed to map shared memory. mmap failed with error " << strerror(errno);
+#endif  // _WIN32
+
+  if (is_new) {
+    new (ptr) std::atomic<int>(1);
+  }
+  handle->dptr = static_cast<char*>(ptr) + alignment_;
+  pool_[handle->dptr] = *handle;
+}
+
+void CPUSharedStorageManager::FreeImpl(const Storage::Handle& handle) {
+  int count = DecrementRefCount(handle);
+  CHECK_GE(count, 0);
+#ifdef _WIN32
+  LOG(FATAL) << "Shared memory is not supported on Windows yet.";
+#else
+  CHECK_EQ(munmap(static_cast<char*>(handle.dptr) - alignment_,
+                  handle.size + alignment_), 0)
+      << "Failed to unmap shared memory. munmap failed with error "
+      << strerror(errno);
+
+  if (count == 0) {
+    auto filename = SharedHandleToString(handle.shared_pid, handle.shared_id);
+    CHECK_EQ(shm_unlink(filename.c_str()), 0)
+        << "Failed to unlink shared memory. shm_unlink failed with error "
+        << strerror(errno);
+  }
+#endif  // _WIN32
+}
+
+}  // namespace storage
+}  // namespace mxnet
+
+#endif  // MXNET_STORAGE_CPU_SHARED_STORAGE_MANAGER_H_
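A second process can attach to a region created by CPUSharedStorageManager by
rebuilding the same /mx_<pid>_<id> name from the handle's shared_pid and
shared_id and mapping it. A minimal standalone POSIX sketch of the attach
side (error handling elided; the function name is illustrative):

    #include <fcntl.h>
    #include <sys/mman.h>
    #include <unistd.h>
    #include <cstddef>
    #include <sstream>

    // 'size' must include the alignment_ prefix that holds the refcount.
    void* AttachShared(int shared_pid, int shared_id, size_t size) {
      std::stringstream name;
      name << "/mx_" << std::hex << shared_pid << "_" << std::hex << shared_id;
      int fid = shm_open(name.str().c_str(), O_RDWR, 0666);
      if (fid == -1) return nullptr;
      void* ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fid, 0);
      close(fid);  // the mapping keeps the shared object alive
      return ptr == MAP_FAILED ? nullptr : ptr;
    }

The attaching side should then go through SharedIncrementRefCount, so that
FreeImpl in either process only shm_unlinks the name once the count reaches
zero.
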
diff --git a/src/storage/gpu_device_storage.h b/src/storage/gpu_device_storage.h
index 3c4f732c80..f902306cb9 100644
--- a/src/storage/gpu_device_storage.h
+++ b/src/storage/gpu_device_storage.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file gpu_device_storage.h
  * \brief GPU storage implementation.
  */
diff --git a/src/storage/naive_storage_manager.h b/src/storage/naive_storage_manager.h
index 731f374bbf..b05b242a79 100644
--- a/src/storage/naive_storage_manager.h
+++ b/src/storage/naive_storage_manager.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file naive_storage_manager.h
  * \brief Naive storage manager.
  */
@@ -44,11 +45,11 @@ class NaiveStorageManager final : public StorageManager {
    * \brief Default destructor.
    */
   ~NaiveStorageManager() = default;
-  void* Alloc(size_t size) override;
-  void Free(void* ptr, size_t) override;
+  void Alloc(Storage::Handle* handle) override;
+  void Free(Storage::Handle handle) override;
 
-  void DirectFree(void* ptr, size_t size) override {
-    DeviceStorage::Free(ptr);
+  void DirectFree(Storage::Handle handle) override {
+    DeviceStorage::Free(handle.dptr);
   }
 
  private:
@@ -56,13 +57,13 @@ class NaiveStorageManager final : public StorageManager {
 };  // class NaiveStorageManager
 
 template <class DeviceStorage>
-void* NaiveStorageManager<DeviceStorage>::Alloc(size_t size) {
-  return DeviceStorage::Alloc(size);
+void NaiveStorageManager<DeviceStorage>::Alloc(Storage::Handle* handle) {
+  handle->dptr = DeviceStorage::Alloc(handle->size);
 }
 
 template <class DeviceStorage>
-void NaiveStorageManager<DeviceStorage>::Free(void* ptr, size_t) {
-  DeviceStorage::Free(ptr);
+void NaiveStorageManager<DeviceStorage>::Free(Storage::Handle handle) {
+  DeviceStorage::Free(handle.dptr);
 }
 
 }  // namespace storage
diff --git a/src/storage/pinned_memory_storage.h b/src/storage/pinned_memory_storage.h
index 69e05f7cf9..0ba7f20c9d 100644
--- a/src/storage/pinned_memory_storage.h
+++ b/src/storage/pinned_memory_storage.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
- * \file cpu_device_storage.h
+ * \file pinned_memory_storage.h
  * \brief CPU storage with pinned memory
  */
diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h
index b2c6633a80..7defa54b69 100644
--- a/src/storage/pooled_storage_manager.h
+++ b/src/storage/pooled_storage_manager.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file pooled_storage_manager.h
  * \brief Storage manager with a memory pool.
  */
@@ -58,12 +59,12 @@ class GPUPooledStorageManager final : public StorageManager {
     ReleaseAll();
   }
 
-  void* Alloc(size_t raw_size) override;
-  void Free(void* ptr, size_t raw_size) override;
+  void Alloc(Storage::Handle* handle) override;
+  void Free(Storage::Handle handle) override;
 
-  void DirectFree(void* ptr, size_t raw_size) override {
-    cudaError_t err = cudaFree(ptr);
-    size_t size = raw_size + NDEV;
+  void DirectFree(Storage::Handle handle) override {
+    cudaError_t err = cudaFree(handle.dptr);
+    size_t size = handle.size + NDEV;
     // ignore unloading error, as memory has already been recycled
     if (err != cudaSuccess && err != cudaErrorCudartUnloading) {
       LOG(FATAL) << "CUDA: " << cudaGetErrorString(err);
@@ -86,9 +87,9 @@ class GPUPooledStorageManager final : public StorageManager {
   DISALLOW_COPY_AND_ASSIGN(GPUPooledStorageManager);
 };  // class GPUPooledStorageManager
 
-void* GPUPooledStorageManager::Alloc(size_t raw_size) {
+void GPUPooledStorageManager::Alloc(Storage::Handle* handle) {
   std::lock_guard<std::mutex> lock(mutex_);
-  size_t size = raw_size + NDEV;
+  size_t size = handle->size + NDEV;
   auto&& reuse_it = memory_pool_.find(size);
   if (reuse_it == memory_pool_.end() || reuse_it->second.size() == 0) {
     size_t free, total;
@@ -102,26 +103,29 @@ void* GPUPooledStorageManager::Alloc(size_t raw_size) {
       LOG(FATAL) << "cudaMalloc failed: " << cudaGetErrorString(e);
     }
     used_memory_ += size;
-    return ret;
+    handle->dptr = ret;
   } else {
     auto&& reuse_pool = reuse_it->second;
     auto ret = reuse_pool.back();
     reuse_pool.pop_back();
-    return ret;
+    handle->dptr = ret;
   }
 }
 
-void GPUPooledStorageManager::Free(void* ptr, size_t raw_size) {
+void GPUPooledStorageManager::Free(Storage::Handle handle) {
   std::lock_guard<std::mutex> lock(mutex_);
-  size_t size = raw_size + NDEV;
+  size_t size = handle.size + NDEV;
   auto&& reuse_pool = memory_pool_[size];
-  reuse_pool.push_back(ptr);
+  reuse_pool.push_back(handle.dptr);
 }
 
 void GPUPooledStorageManager::ReleaseAll() {
   for (auto&& i : memory_pool_) {
     for (auto&& j : i.second) {
-      DirectFree(j, i.first - NDEV);
+      Storage::Handle handle;
+      handle.dptr = j;
+      handle.size = i.first - NDEV;
+      DirectFree(handle);
     }
   }
   memory_pool_.clear();
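GPUPooledStorageManager's reuse policy is a size-keyed free list: Free never
returns memory to CUDA, it parks the pointer in the bucket for its padded
size, and Alloc checks that bucket before calling cudaMalloc. Stripped of
locking and CUDA, the core idea reduces to this sketch (host memory standing
in for device memory):

    #include <cstddef>
    #include <unordered_map>
    #include <vector>

    std::unordered_map<size_t, std::vector<void*>> memory_pool;

    void* PoolAlloc(size_t size) {
      auto it = memory_pool.find(size);
      if (it != memory_pool.end() && !it->second.empty()) {
        void* p = it->second.back();  // reuse a freed block of the same size
        it->second.pop_back();
        return p;
      }
      return ::operator new(size);    // stand-in for cudaMalloc
    }

    void PoolFree(void* ptr, size_t size) {
      memory_pool[size].push_back(ptr);  // park it; nothing returns to the system
    }

ReleaseAll is what finally hands the blocks back, which is why the handle
refactor above has it reconstruct each block's size as i.first - NDEV before
calling DirectFree.
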
diff --git a/src/storage/storage.cc b/src/storage/storage.cc
index fa15a44b4f..ce40daa904 100644
--- a/src/storage/storage.cc
+++ b/src/storage/storage.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  */
 #include <mxnet/storage.h>
 #include <mshadow/tensor.h>
@@ -26,6 +27,7 @@
 #include "./storage_manager.h"
 #include "./naive_storage_manager.h"
 #include "./pooled_storage_manager.h"
+#include "./cpu_shared_storage_manager.h"
 #include "./cpu_device_storage.h"
 #include "./pinned_memory_storage.h"
 #include "../common/cuda_utils.h"
@@ -36,9 +38,10 @@ namespace mxnet {
 // consider change storage as a pure abstract class
 class StorageImpl : public Storage {
  public:
-  Handle Alloc(size_t size, Context ctx) override;
+  void Alloc(Handle* handle) override;
   void Free(Handle handle) override;
   void DirectFree(Handle handle) override;
+  void SharedIncrementRefCount(Handle handle) override;
   StorageImpl() {}
   virtual ~StorageImpl() = default;
 
@@ -51,12 +54,13 @@ class StorageImpl : public Storage {
 
   static void ActivateDevice(Context ctx) {
     switch (ctx.dev_type) {
-      case Context::kCPU: break;
+      case Context::kCPU:
+      case Context::kCPUShared: break;
       case Context::kGPU:
       case Context::kCPUPinned: {
 #if MXNET_USE_CUDA
           if (num_gpu_device > 0) {
-            CUDA_CALL(cudaSetDevice(ctx.dev_id));
+            CUDA_CALL(cudaSetDevice(ctx.real_dev_id()));
           }
 #endif  // MXNET_USE_CUDA
           break;
@@ -73,20 +77,21 @@ class StorageImpl : public Storage {
 int StorageImpl::num_gpu_device = 0;
 #endif  // MXNET_USE_CUDA
 
-Storage::Handle StorageImpl::Alloc(size_t size, Context ctx) {
+void StorageImpl::Alloc(Storage::Handle* handle) {
   // space already recycled, ignore request
-  Handle hd;
-  hd.ctx = ctx;
-  hd.size = size;
-  auto&& device = storage_managers_.at(ctx.dev_type);
+  auto&& device = storage_managers_.at(handle->ctx.dev_type);
   std::shared_ptr<storage::StorageManager> manager = device.Get(
-      ctx.dev_id, [ctx]() {
+      handle->ctx.real_dev_id(), [handle]() {
         storage::StorageManager *ptr = nullptr;
-        switch (ctx.dev_type) {
+        switch (handle->ctx.dev_type) {
           case Context::kCPU: {
             ptr = new storage::NaiveStorageManager<storage::CPUDeviceStorage>();
             break;
           }
+          case Context::kCPUShared: {
+            ptr = new storage::CPUSharedStorageManager();
+            break;
+          }
           case Context::kCPUPinned: {
 #if MXNET_USE_CUDA
             num_gpu_device = 0;
@@ -114,38 +119,47 @@ Storage::Handle StorageImpl::Alloc(size_t size, Context ctx) {
 #endif  // MXNET_USE_CUDA
             break;
           }
-          default: LOG(FATAL) <<  "Unimplemented device " << ctx.dev_type;
+          default: LOG(FATAL) <<  "Unimplemented device " << handle->ctx.dev_type;
         }
         return ptr;
       });
-  this->ActivateDevice(ctx);
-  hd.dptr = manager->Alloc(size);
-  return hd;
+
+  this->ActivateDevice(handle->ctx);
+  manager->Alloc(handle);
 }
 
 void StorageImpl::Free(Storage::Handle handle) {
   const Context &ctx = handle.ctx;
   auto&& device = storage_managers_.at(ctx.dev_type);
   std::shared_ptr<storage::StorageManager> manager = device.Get(
-      ctx.dev_id, []() {
+      ctx.real_dev_id(), []() {
         LOG(FATAL) <<  "Cannot Free space to a device you have not allocated";
         return nullptr;
       });
   this->ActivateDevice(ctx);
-  manager->Free(handle.dptr, handle.size);
+  manager->Free(handle);
 }
 
 void StorageImpl::DirectFree(Storage::Handle handle) {
   const Context &ctx = handle.ctx;
   auto&& device = storage_managers_.at(ctx.dev_type);
   std::shared_ptr<storage::StorageManager> manager = device.Get(
-      ctx.dev_id, []() {
+      ctx.real_dev_id(), []() {
         LOG(FATAL) <<  "Cannot Free space to a device you have not allocated";
         return nullptr;
       });
   this->ActivateDevice(ctx);
-  // directly free ths data.
-  manager->DirectFree(handle.dptr, handle.size);
+  manager->DirectFree(handle);
+}
+
+void StorageImpl::SharedIncrementRefCount(Storage::Handle handle) {
+  CHECK_EQ(handle.ctx.dev_type, Context::kCPUShared);
+  auto&& device = storage_managers_.at(Context::kCPUShared);
+  auto manager = device.Get(0, []() {
+      LOG(FATAL) << "Cannot increment ref count before allocating any shared memory.";
+      return nullptr;
+    });
+  dynamic_cast<storage::CPUSharedStorageManager*>(manager.get())->IncrementRefCount(handle);
 }
 
 std::shared_ptr<Storage> Storage::_GetSharedRef() {
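With this refactor the allocator's contract inverts: the caller fills in ctx
and size on a Handle, and Alloc writes the results (dptr, plus shared_pid and
shared_id for kCPUShared) back into it, rather than returning a handle by
value. A sketch of the new call shape, assuming the usual Storage::Get()
singleton accessor:

    Storage::Handle hd;
    hd.ctx = Context::CPU();
    hd.size = 1024;
    Storage::Get()->Alloc(&hd);  // fills hd.dptr (and shared ids for kCPUShared)
    // ... use hd.dptr ...
    Storage::Get()->Free(hd);    // the handle carries everything Free needs
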
diff --git a/src/storage/storage_manager.h b/src/storage/storage_manager.h
index 924d2ed48b..15a2c7ecff 100644
--- a/src/storage/storage_manager.h
+++ b/src/storage/storage_manager.h
@@ -18,12 +18,15 @@
  */
 
 /*!
+ * Copyright (c) 2015 by Contributors
  * \file storage_manager.h
  * \brief Storage manager.
  */
+
 #ifndef MXNET_STORAGE_STORAGE_MANAGER_H_
 #define MXNET_STORAGE_STORAGE_MANAGER_H_
 
+#include <mxnet/storage.h>
 #include <cstddef>
 
 namespace mxnet {
@@ -39,19 +42,19 @@ class StorageManager {
-   * \param size Size to allocate.
-   * \return Pointer to the storage.
+   * \param handle Handle struct to fill in; the context and size are read
+   *        from it and the allocated pointer is written to handle->dptr.
    */
-  virtual void* Alloc(size_t size) = 0;
+  virtual void Alloc(Storage::Handle* handle) = 0;
   /*!
    * \brief Deallocation.
-   * \param ptr Pointer to deallocate.
-   * \param size Size of the storage.
+   * \param handle Handle struct describing the storage to deallocate,
+   *        as previously filled in by Alloc.
    */
-  virtual void Free(void* ptr, size_t size) = 0;
+  virtual void Free(Storage::Handle handle) = 0;
   /*!
    * \brief Direct de-allocation.
-   * \param ptr Pointer to deallocate.
-   * \param size Size of the storage.
+   * \param handle Handle struct describing the storage to free
+   *        immediately, bypassing any pooling or caching.
    */
-  virtual void DirectFree(void* ptr, size_t size) = 0;
+  virtual void DirectFree(Storage::Handle handle) = 0;
   /*!
    * \brief Destructor.
    */
diff --git a/tests/cpp/engine/threaded_engine_test.cc b/tests/cpp/engine/threaded_engine_test.cc
index 58b7e57a50..be60ecfc53 100644
--- a/tests/cpp/engine/threaded_engine_test.cc
+++ b/tests/cpp/engine/threaded_engine_test.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file threaded_engine_test.cc
  * \brief threaded engine tests
 */
diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h
index c454c95847..1bcd0e2df4 100644
--- a/tests/cpp/include/test_core_op.h
+++ b/tests/cpp/include/test_core_op.h
@@ -611,6 +611,50 @@ class CoreOpProp {
 template<typename DType>
 using CoreOperatorRunner = test::OperatorRunner<CoreOpProp, CoreOpExecutor<DType>>;
 
+
+/*!
+ * \brief Run a core op forward and backward
+ * \tparam DType Data type
+ * \param isGPU true if the operation is to be run on the GPU
+ * \param verbose true to print the input and output arrays at each stage
+ * \param op_kwargs Operator parameters
+ * \param shapes Input shapes for the operator
+ * \param op_name Operator name as registered with nnvm
+ * \param backward_op_name Backward operator name as registered with nnvm.
+ *        If blank, the runner will attempt to determine the backward operator; if it fails,
+ *        an exception will be thrown.
+ *        If the string is [none], then no backward operator will be created or executed
+ */
+template<typename DType = float>
+inline void BasicRunCoreOpBidirectional(const bool isGPU,
+                                        bool verbose,
+                                        const kwargs_t& op_kwargs,
+                                        const std::vector<TShape>& shapes,
+                                        const char *op_name,
+                                        const char *backward_op_name = "") {
+  test::op::CoreOpExecutor<DType> op(isGPU, shapes);
+  op.set_verbose(false);
+
+  op.Init(op.ArgsWithOpName(op_kwargs, op_name, backward_op_name));
+
+  if (verbose) {
+    PRINT_NDARRAYS(op.ctx().run_ctx, op.inputs());
+    PRINT_NDARRAYS(op.ctx().run_ctx, op.outputs());
+  }
+  op.Execute();
+  if (verbose) {
+    PRINT_NDARRAYS(op.ctx().run_ctx, op.outputs());
+  }
+  if (op.HasBackward()) {
+    if (verbose) {
+      PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_inputs());
+      PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_outputs());
+    }
+    op.ExecuteBackward();
+    if (verbose) {
+      PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_outputs());
+    }
+  }
+}
+
 }  // namespace op
 }  // namespace test
 }  // namespace mxnet
diff --git a/tests/cpp/include/test_op.h b/tests/cpp/include/test_op.h
index cbafe14152..bddade0830 100644
--- a/tests/cpp/include/test_op.h
+++ b/tests/cpp/include/test_op.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file test_op.h
  * \brief operator unit test utility functions
  * \author Chris Olivier
diff --git a/tests/cpp/include/test_op_runner.h b/tests/cpp/include/test_op_runner.h
index eb259997cd..3b06b1a29f 100644
--- a/tests/cpp/include/test_op_runner.h
+++ b/tests/cpp/include/test_op_runner.h
@@ -145,8 +145,14 @@ class OperatorRunner {
     std::stringstream ss;
     ss << "Timing: " << COUNT << " iterations of " << count << " calls";
     if (timing_shapes[0].ndim()) {
-      // TODO(cjolivier01): Print all shapes (if they differ)
-      ss << ", shape = " << timing_shapes[0] << std::endl << std::flush;
+      ss << ", shape = ";
+      for (size_t i = 0, n = timing_shapes.size(); i < n; ++i) {
+        if (i) {
+          ss << ", ";
+        }
+        ss << timing_shapes[i];
+      }
+      ss << std::endl << std::flush;
     }
     std::cout << ss.str();
 
diff --git a/tests/cpp/include/test_perf.h b/tests/cpp/include/test_perf.h
index 7971ed7985..672b28a426 100644
--- a/tests/cpp/include/test_perf.h
+++ b/tests/cpp/include/test_perf.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file test_perf.h
  * \brief operator unit test utility functions
  * \author Chris Olivier
diff --git a/tests/cpp/include/test_util.h b/tests/cpp/include/test_util.h
index 33ca3c47d0..edfa2d0660 100644
--- a/tests/cpp/include/test_util.h
+++ b/tests/cpp/include/test_util.h
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file test_util.h
  * \brief unit test performance analysis functions
  * \author Chris Olivier
diff --git a/tests/cpp/misc/memory_test.cc b/tests/cpp/misc/memory_test.cc
index a36f7f93ae..8f4e8c25e8 100644
--- a/tests/cpp/misc/memory_test.cc
+++ b/tests/cpp/misc/memory_test.cc
@@ -79,7 +79,7 @@ TEST(MEMORY_TEST, MemsetAndMemcopyPerformance) {
 
       start = test::perf::getNannoTickCount();
       #pragma omp parallel for num_threads(GetOMPThreadCount())
-      for (int i = 0; i < test_size; ++i) {
+      for (int i = 0; i < static_cast<int>(test_size); ++i) {
         src[i] = 42;
       }
       const uint64_t omp_set_time = test::perf::getNannoTickCount() - start;
@@ -94,7 +94,7 @@ TEST(MEMORY_TEST, MemsetAndMemcopyPerformance) {
 
       start = test::perf::getNannoTickCount();
       #pragma omp parallel for num_threads(GetOMPThreadCount())
-      for (int i = 0; i < test_size; ++i) {
+      for (int i = 0; i < static_cast<int>(test_size); ++i) {
         dest[i] = src[i];
       }
       const uint64_t omp_copy_time = test::perf::getNannoTickCount() - start;
diff --git a/tests/cpp/operator/batchnorm_test.cc b/tests/cpp/operator/batchnorm_test.cc
index 24b5600a71..8f53ee5588 100644
--- a/tests/cpp/operator/batchnorm_test.cc
+++ b/tests/cpp/operator/batchnorm_test.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file batchnorm_test.cc
  * \brief batchnorm operator unit test utility functions
  * \author Chris Olivier
@@ -1424,7 +1425,7 @@ static TShape MakeShape(const std::vector<index_t>& shape,
   CHECK_LT(channelAxis, shape.size() + 1);
   const index_t dim = index_t(shape.size()) + 1;
   TShape newShape(dim);
-  for (size_t x = 0; x < channelAxis; ++x) {
+  for (size_t x = 0; x < static_cast<size_t>(channelAxis); ++x) {
     newShape[x] = index_t(shape[x]);
   }
   newShape[channelAxis] = index_t(channelCount);
diff --git a/tests/cpp/operator/broadcast_perf.cc b/tests/cpp/operator/broadcast_perf.cc
index 6986c4d27e..5edba0b6d1 100644
--- a/tests/cpp/operator/broadcast_perf.cc
+++ b/tests/cpp/operator/broadcast_perf.cc
@@ -31,34 +31,35 @@ using namespace mxnet;
 
 using kwargs_t = test::op::kwargs_t;
 
-template<typename DType = float>
-static void RunCoreOpBidirectional(const bool isGPU,
-                                   const kwargs_t& op_kwargs,
-                                   const char *op_name,
-                                   const char *backward_op_name = "") {
-  const std::vector<TShape> shapes = { {2, 3}, {2, 1} };
-  test::op::CoreOpExecutor<DType> op(isGPU, shapes);
-  op.set_verbose(false);
-
-  op.Init(op.ArgsWithOpName(op_kwargs, op_name, backward_op_name));
-
-  PRINT_NDARRAYS(op.ctx().run_ctx, op.inputs());
-  PRINT_NDARRAYS(op.ctx().run_ctx, op.outputs());
-  op.Execute();
-  PRINT_NDARRAYS(op.ctx().run_ctx, op.outputs());
-  if (op.HasBackward()) {
-    PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_inputs());
-    PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_outputs());
-    op.ExecuteBackward();
-    PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_outputs());
-  }
-}
-
 /*!
  * \brief Generic bidirectional sanity test
  */
 TEST(BROADCAST_PERF, ExecuteBidirectional) {
-  RunCoreOpBidirectional(false, {}, "broadcast_add", "_backward_broadcast_add");
+  test::op::BasicRunCoreOpBidirectional(false, true, {},
+                                        { {2, 3}, {2, 1} },
+                                        "broadcast_add", "_backward_broadcast_add");
+}
+
+static const std::vector<std::vector<TShape>> broadcast_shapes() {
+  std::vector<std::vector<TShape>> shapes;
+  if (test::performance_run) {
+    shapes = {
+      { {28,  28},  {28, 1} },
+      { {64,  28},  {1, 28} },
+      { {28,  28, 28},  {28, 28, 1} },
+      { {128, 128}, {1, 128} },
+      { {1024, 12, 256}, {1024, 1, 1} },
+      { {2560, 1280}, {2560, 1} }
+    };
+  } else {
+    shapes = {
+      // Non-performance dataset acts as a sanity test
+      { {28,  28},  {28, 1} },
+      { {128, 128}, {128, 1} },
+      { {28,  28, 28},  {28, 28, 1} }
+    };
+  }
+  return shapes;
 }
 
 template<typename DType = float>
@@ -74,20 +75,7 @@ static void RunCoreOpTimingTest(const bool isGPU,
   runner.RunBidirectional(false, { {2, 3}, {2, 1} }, kwargs, 1);
 
   // Do the performance runs
-  std::vector<std::vector<TShape>> shapes;
-  if (test::performance_run) {
-    shapes = {
-      { {28,  28},  {28, 1} },
-      { {18,  32} , {18, 1} },
-      { {128, 128}, {128, 1} },
-      { {2560, 1280}, {2560, 1} }
-    };
-  } else {
-    shapes = {
-      { {28,  28},  {28, 1} },
-      { {128, 128}, {128, 1} }
-    };
-  }
+  std::vector<std::vector<TShape>> shapes = broadcast_shapes();
   const char *pu = isGPU ? "GPU" : "CPU";
   for (const std::vector<TShape> &shape : shapes) {
     runner.TimingTest(std::string(op_name) + " Operator " + pu, isGPU, false, kwargs,
diff --git a/tests/cpp/operator/krprod_test.cc b/tests/cpp/operator/krprod_test.cc
index 31b8ab9dd7..26c2661bc3 100644
--- a/tests/cpp/operator/krprod_test.cc
+++ b/tests/cpp/operator/krprod_test.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2017 by Contributors
  *  \file krprod_test.cc
  *  \brief Test Khatri-Rao product
  *  \author Jencir Lee
diff --git a/tests/cpp/storage/storage_test.cc b/tests/cpp/storage/storage_test.cc
index 8af3984eb4..269480b83c 100644
--- a/tests/cpp/storage/storage_test.cc
+++ b/tests/cpp/storage/storage_test.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file storage_test.cc
  * \brief cpu/gpu storage tests
 */
diff --git a/tests/cpp/test_main.cc b/tests/cpp/test_main.cc
index eaf9e3c219..fff1ca2ebd 100644
--- a/tests/cpp/test_main.cc
+++ b/tests/cpp/test_main.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ * Copyright (c) 2017 by Contributors
  * \file test_main.cc
  * \brief operator unit test utility functions
  * \author Chris Olivier
diff --git a/tests/nightly/TestDoc/doc_spell_checker.py b/tests/nightly/TestDoc/doc_spell_checker.py
index a7b8b250c9..a33807e3d5 100644
--- a/tests/nightly/TestDoc/doc_spell_checker.py
+++ b/tests/nightly/TestDoc/doc_spell_checker.py
@@ -92,7 +92,7 @@ def check_doc(file_content, spell_checker, spell_check_ret):
     """
     spell_checker.set_text(file_content)
     for error in spell_checker:
-        if spell_check_ret.has_key(error.word):
+        if error.word in spell_check_ret:
             spell_check_ret[error.word] += 1
         else:
             spell_check_ret[error.word] = 1
diff --git a/tests/nightly/dist_device_sync_kvstore.py b/tests/nightly/dist_device_sync_kvstore.py
new file mode 100644
index 0000000000..75b48f42c5
--- /dev/null
+++ b/tests/nightly/dist_device_sync_kvstore.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+sys.path.insert(0, "../../python/")
+import mxnet as mx
+import numpy as np
+import numpy.random as rnd
+import time
+
+def check_diff_to_scalar(A, x, rank=None):
+    """ assert A == x"""
+    assert(np.sum(np.abs((A - x).asnumpy())) == 0), (rank, A.asnumpy(), x)
+
+# setup
+keys = ['3', '5', '7']
+init_test_keys = [str(i) for i in range(200,300)]
+init_test_keys_big = [str(i) for i in range(300,400)]
+init_test_keys_device = [str(i) for i in range(400,500)]
+init_test_keys_device_big = [str(i) for i in range(500,600)]
+
+rate = 2
+shape = (2, 3)
+big_shape = (1200, 1200)        # bigger than MXNET_KVSTORE_BIGARRAY_BOUND
+
+kv = mx.kv.create('dist_device_sync')
+
+def init_kv():
+    # init kv dns keys
+    kv.init(keys, [mx.nd.ones(shape)] * len(keys))
+    kv.init('99', mx.nd.ones(big_shape))
+    # worker info
+    my_rank = kv.rank
+    nworker = kv.num_workers
+    # init updater on servers
+    kv.set_optimizer(mx.optimizer.create('test', rescale_grad=rate))
+    return kv, my_rank, nworker
+
+def test_sync_push_pull():
+    kv, my_rank, nworker = init_kv()
+    num_gpus = 2
+    def check_default_keys(kv, my_rank, nworker):
+        nrepeat = 3
+        # check pull after each push in the loop, because the behavior of
+        # consecutive pushes offers no guarantees
+        for i in range(nrepeat):
+            scale = my_rank + 1
+            kv.push('3', [mx.nd.ones(shape, ctx=mx.gpu(j)) * scale for j in range(num_gpus)])
+            kv.push('99', [mx.nd.ones(big_shape, ctx=mx.gpu(j)) * scale for j in range(num_gpus)])
+            num = (nworker + 1) * nworker * rate * num_gpus / 2 * (i + 1) + 1
+            val = mx.nd.zeros(shape)
+            kv.pull('3', out=val)
+            check_diff_to_scalar(val, num)
+            val2 = mx.nd.zeros(big_shape)
+            kv.pull('99', out=val2)
+            check_diff_to_scalar(val2, num)
+
+    check_default_keys(kv, my_rank, nworker)
+    print('worker ' + str(my_rank) + ' is done')
+
+def test_sync_init():
+    def check_init(kv, cur_keys, cur_shape, device=False):
+        ctx = mx.gpu(0) if device else mx.cpu()
+        val = [mx.nd.zeros(cur_shape, ctx) for i in cur_keys]
+        for i in range(len(cur_keys)):
+            expected = i
+            kv.init(cur_keys[i], [mx.nd.ones(cur_shape, ctx) * i])
+            kv.pull(cur_keys[i], out=val[i])
+            check_diff_to_scalar(val[i], expected)
+    check_init(kv, init_test_keys, shape)
+    check_init(kv, init_test_keys_big, big_shape)
+    check_init(kv, init_test_keys_device, shape, device=True)
+    check_init(kv, init_test_keys_device_big, big_shape, device=True)
+    my_rank = kv.rank
+    print('worker ' + str(my_rank) + ' is initialized')
+
+if __name__ == "__main__":
+    test_sync_init()
+    test_sync_push_pull()
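
The expected value checked above follows from the synchronous push/pull
arithmetic: each worker of rank r pushes scale = r + 1 from each of num_gpus
devices, the server sums across workers and rescales by rate (the
rescale_grad of the 'test' optimizer), and the result accumulates on top of
the initial ones. A small sketch of that arithmetic, assuming the 'test'
optimizer simply adds the rescaled aggregate to the stored value, which is
what the formula in check_default_keys implies (worker counts are
hypothetical, for illustration only):

    nworker, num_gpus, rate = 2, 2, 2
    for i in range(3):                                     # nrepeat pushes
        pushed = sum(rank + 1 for rank in range(nworker))  # nworker*(nworker+1)/2
        num = rate * num_gpus * pushed * (i + 1) + 1       # + 1 for the initial ones
        assert num == (nworker + 1) * nworker * rate * num_gpus / 2 * (i + 1) + 1
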
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 4126a8e33d..13b547eb47 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -37,6 +37,7 @@
 from test_sparse_ndarray import test_create_csr, test_create_row_sparse, test_sparse_nd_slice
 from test_sparse_ndarray import test_create_sparse_nd_empty, test_create_sparse_nd_from_sparse
 from test_sparse_ndarray import test_create_sparse_nd_from_dense, test_create_sparse_nd_infer_shape
+from test_sparse_ndarray import test_sparse_nd_check_format
 from test_sparse_operator import *
 from test_ndarray import *
 
diff --git a/tests/python/unittest/test_engine.py b/tests/python/unittest/test_engine.py
new file mode 100644
index 0000000000..29b7b822b3
--- /dev/null
+++ b/tests/python/unittest/test_engine.py
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import nose
+import mxnet as mx
+
+def test_bulk():
+    with mx.engine.bulk(10):
+        x = mx.nd.ones((10,))
+        x *= 2
+        x += 1
+        x.wait_to_read()
+        x += 1
+        assert (x.asnumpy() == 4).all()
+        for i in range(100):
+            x += 1
+    assert (x.asnumpy() == 104).all()
+
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule()
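
For background, hedged as a reading of the test above rather than a spec:
mx.engine.bulk(10) appears to batch up to 10 imperative operations before
handing them to the execution engine, and a synchronization point such as
wait_to_read() or asnumpy() forces everything queued so far to run. A minimal
usage sketch along those assumed semantics:

    import mxnet as mx

    with mx.engine.bulk(10):      # batch up to 10 queued ops (assumed semantics)
        x = mx.nd.ones((4,))
        x += 1                    # queued; may not have executed yet
    print(x.asnumpy())            # sync point: forces execution, prints all 2s
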
diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py
index 397fbbd33e..63c5d28b7c 100644
--- a/tests/python/unittest/test_gluon_data.py
+++ b/tests/python/unittest/test_gluon_data.py
@@ -17,6 +17,7 @@
 
 import os
 import tarfile
+import unittest
 import mxnet as mx
 import numpy as np
 from mxnet import gluon
@@ -92,6 +93,20 @@ def test_image_folder_dataset():
     assert len(dataset.items) == 16
 
 
+class Dataset(gluon.data.Dataset):
+    def __len__(self):
+        return 100
+    def __getitem__(self, key):
+        return mx.nd.full((10,), key)
+
+@unittest.skip("Somehow fails with MKL. Cannot reproduce locally")
+def test_multi_worker():
+    data = Dataset()
+    loader = gluon.data.DataLoader(data, batch_size=1, num_workers=5)
+    for i, batch in enumerate(loader):
+        assert (batch.asnumpy() == i).all()
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
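
The new test above also doubles as a minimal example of the gluon Dataset
protocol: a Dataset only needs __len__ and __getitem__, and
DataLoader(num_workers=N) fans the indexing out to N worker processes. A
single-process sketch of the same contract (num_workers=0 keeps loading in
the main process, sidestepping the MKL issue noted in the skip reason; the
class name is illustrative):

    import mxnet as mx
    from mxnet import gluon

    class RangeDataset(gluon.data.Dataset):
        def __len__(self):
            return 4
        def __getitem__(self, idx):
            return mx.nd.full((2,), idx)

    loader = gluon.data.DataLoader(RangeDataset(), batch_size=1, num_workers=0)
    for i, batch in enumerate(loader):
        assert (batch.asnumpy() == i).all()
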
diff --git a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py
index 2c70358397..fa314e0f8b 100644
--- a/tests/python/unittest/test_io.py
+++ b/tests/python/unittest/test_io.py
@@ -162,10 +162,10 @@ def test_NDArrayIter_csr():
     csr, _ = rand_sparse_ndarray(shape, 'csr')
     dns = csr.asnumpy()
 
-    # CSRNDArray with last_batch_handle not equal to 'discard' will throw NotImplementedError 
+    # CSRNDArray with last_batch_handle not equal to 'discard' will throw NotImplementedError
     assertRaises(NotImplementedError, mx.io.NDArrayIter, {'data': csr}, dns, batch_size,
                  last_batch_handle='pad')
-    
+
     # CSRNDArray with shuffle
     csr_iter = iter(mx.io.NDArrayIter({'csr_data': csr, 'dns_data': dns}, dns, batch_size,
                     shuffle=True, last_batch_handle='discard'))
@@ -247,7 +247,7 @@ def check_libSVMIter_news_data():
 
     check_libSVMIter_synthetic()
     check_libSVMIter_news_data()
-    
+
 @unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/7826")
 def test_CSVIter():
     def check_CSVIter_synthetic():
diff --git a/tests/python/unittest/test_kvstore.py b/tests/python/unittest/test_kvstore.py
index fc9e3be3c4..174d577556 100644
--- a/tests/python/unittest/test_kvstore.py
+++ b/tests/python/unittest/test_kvstore.py
@@ -300,4 +300,4 @@ def check_invalid_key_types_list(kv, key):
     import nose
     nose.runmodule()
 
-    
+
diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py
index 722ba9885c..a8fb99dfc1 100644
--- a/tests/python/unittest/test_module.py
+++ b/tests/python/unittest/test_module.py
@@ -117,7 +117,7 @@ def sym_gen(seq_len):
                 fc  = mx.symbol.FullyConnected(data=fc, weight=weight, bias=bias,
                                                name='dev2_fc_%d' % i, num_hidden=num_hidden)
             sym = mx.symbol.SoftmaxOutput(fc, label, name='softmax')
-        
+
         return sym, ('data',), ('label',)
 
     mod = mx.mod.BucketingModule(sym_gen=sym_gen, default_bucket_key=10, context=[mx.cpu(0)],
diff --git a/tests/python/unittest/test_multi_device_exec.py b/tests/python/unittest/test_multi_device_exec.py
index 0a2739d9bb..aa279b1837 100644
--- a/tests/python/unittest/test_multi_device_exec.py
+++ b/tests/python/unittest/test_multi_device_exec.py
@@ -20,6 +20,17 @@
 import mxnet as mx
 
 def test_ctx_group():
+    def check_ctx_group(group2ctx, grad_req, mlp, set_stage1):
+        texec = mlp.simple_bind(mx.cpu(0),
+                                group2ctx=group2ctx,
+                                data=(1,200), grad_req=grad_req)
+
+        for arr, name in zip(texec.arg_arrays, mlp.list_arguments()):
+            if name in set_stage1:
+                assert arr.context == group2ctx['stage1']
+            else:
+                assert arr.context == group2ctx['stage2']
+
     with mx.AttrScope(ctx_group='stage1'):
         data = mx.symbol.Variable('data')
         fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
@@ -40,15 +51,14 @@ def test_ctx_group():
         'stage2' : mx.cpu(2)
     }
 
-    texec = mlp.simple_bind(mx.cpu(0),
-                            group2ctx=group2ctx,
-                            data=(1,200))
+    # generate reqs with null
+    grad_req_with_null = {}
+    for arg in mlp.list_arguments():
+        grad_req_with_null[arg] = 'null' if arg == 'data' else 'write'
 
-    for arr, name in zip(texec.arg_arrays, mlp.list_arguments()):
-        if name in set_stage1:
-            assert arr.context == group2ctx['stage1']
-        else:
-            assert arr.context == group2ctx['stage2']
+    grad_reqs = ['write', grad_req_with_null]
+    for grad_req in grad_reqs:
+        check_ctx_group(group2ctx, grad_req, mlp, set_stage1)
 
 def test_ctx_group_sparse():
     with mx.AttrScope(ctx_group='stage1'):
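
The refactor above exercises both forms that simple_bind accepts for
grad_req: a single string applied to every argument, or a per-argument dict,
where 'null' skips gradient computation and storage for that input. A compact
sketch of the dict form (the symbol here is illustrative):

    import mxnet as mx

    data = mx.sym.Variable('data')
    fc = mx.sym.FullyConnected(data=data, name='fc', num_hidden=8)
    # 'null' for the input data, 'write' for the learnable parameters
    grad_req = {arg: ('null' if arg == 'data' else 'write')
                for arg in fc.list_arguments()}
    exe = fc.simple_bind(mx.cpu(0), data=(1, 4), grad_req=grad_req)
    # gradient buffers are allocated only for the 'write' arguments
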
diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py
index 5bdadc4a29..8e1f68fd62 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -586,7 +586,7 @@ def gt_topk(dat, axis, ret_typ, k, is_ascend):
     nd_ret_argsort = mx.nd.argsort(a_nd, axis=None, is_ascend=False).asnumpy()
     gt = gt_topk(a_npy, axis=None, ret_typ="indices", k=5*5*5*5, is_ascend=False)
     assert_almost_equal(nd_ret_argsort, gt)
-    
+
     # test topk with a big shape
     a = mx.nd.arange(0, 54686454, step=1, repeat=1)
     assert_almost_equal(a.topk(k=54686454).asnumpy(), a.asnumpy()[::-1])
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 93dc4a0534..55a3a57218 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -1043,6 +1043,7 @@ def test_convolution_grouping():
         np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-4)
 
 
+@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/8712")
 def test_depthwise_convolution():
     for num_base in [1, 4, 16, 32, 64]:
         for kernel in [(3,3), (5,5)]:
@@ -1267,6 +1268,18 @@ def test_bneq(a, b):
 
 
 def test_broadcast_binary_op():
+    def check_bmaxmin_gradient(test_sym, x, y, delta, rtol, atol):
+        """This function ensures that checking the numerical gradient of
+        broadcast_max/min is not crossing the boundary y=x where there
+        is no gradient definition at those sigularities."""
+        x_max = np.max(x)
+        y = x_max + 2 * delta + np.random.random(y.shape)
+        check_numeric_gradient(test_sym, [x, y], numeric_eps=delta, rtol=rtol, atol=atol)
+
+        x_min = np.min(x)
+        y = x_min - 2 * delta - np.random.random(y.shape)
+        check_numeric_gradient(test_sym, [x, y], numeric_eps=delta, rtol=rtol, atol=atol)
+
     a = mx.sym.Variable('a')
     b = mx.sym.Variable('b')
 
@@ -1316,13 +1329,15 @@ def test_bmax(a, b):
         c = mx.sym.broadcast_maximum(a, b)
         check_binary_op_forward(c, lambda x, y: np.maximum(x, y), gen_broadcast_data, mx_nd_func=mx.nd.maximum)
         # pass idx=200 to gen_broadcast_data so that generated ndarrays' sizes are not too big
-        check_numeric_gradient(c, gen_broadcast_data(idx=200), rtol=1e-2, atol=1e-3)
+        data = gen_broadcast_data(idx=200)
+        check_bmaxmin_gradient(c, data[0], data[1], 0.001, 1e-2, 1e-3)
 
     def test_bmin(a, b):
         c = mx.sym.broadcast_minimum(a, b)
         check_binary_op_forward(c, lambda x, y: np.minimum(x, y), gen_broadcast_data, mx_nd_func=mx.nd.minimum)
         # pass idx=200 to gen_broadcast_data so that generated ndarrays' sizes are not too big
-        check_numeric_gradient(c, gen_broadcast_data(idx=200), rtol=1e-2, atol=1e-3)
+        data = gen_broadcast_data(idx=200)
+        check_bmaxmin_gradient(c, data[0], data[1], 0.001, 1e-2, 1e-3)
 
     test_bplus(a, b)
     test_bminus(a, b)
@@ -3570,10 +3585,18 @@ def test_rcbrt_op():
 def test_custom_op():
     class Sqr(mx.operator.CustomOp):
         def forward(self, is_train, req, in_data, out_data, aux):
-            self.assign(out_data[0], req[0], in_data[0]*in_data[0])
+            if in_data[0].stype == 'default':
+                aux[0][:] = 1
+                self.assign(out_data[0], req[0], in_data[0]*in_data[0])
+            else:
+                self.assign(out_data[0], req[0], mx.nd.sparse.square(in_data[0]))
+                if in_data[0].stype == 'csr':
+                    assert(isinstance(in_data[0], mx.nd.sparse.CSRNDArray))
 
         def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
             self.assign(in_grad[0], req[0], 2*in_data[0]*out_grad[0])
+            if in_data[0].stype == 'default':
+                assert (aux[0].asnumpy() == 1).all()
 
     @mx.operator.register("sqr")
     class SqrProp(mx.operator.CustomOpProp):
@@ -3586,33 +3609,49 @@ def list_arguments(self):
         def list_outputs(self):
             return ['output']
 
+        def list_auxiliary_states(self):
+            return ['aux']
+
         def infer_shape(self, in_shape):
-            return in_shape, [in_shape[0]], []
+            return in_shape, [in_shape[0]], [in_shape[0]]
 
         def infer_type(self, in_type):
-            return in_type, [in_type[0]], []
+            return in_type, [in_type[0]], [in_type[0]]
+
+        def infer_storage_type(self, in_stype):
+            if in_stype[0] == 'default':
+                return ['default'], ['default'], ['default']
+            return ['csr'], ['csr'], ['csr']
+
+        def infer_storage_type_backward(self, in_stype):
+            if in_stype[1] == 'default':
+                return ['default', 'default', 'default'], ['default'], ['default']
+            return ['default', 'csr', 'csr'], ['csr'], ['csr']
 
         def create_operator(self, ctx, shapes, dtypes):
             return Sqr()
 
     data = mx.symbol.Variable('data')
-    op = mx.symbol.Custom(data=data, name='sqr', op_type='sqr')
+    aux = mx.symbol.Variable('aux')
+    op = mx.symbol.Custom(data=data, aux=aux, name='sqr', op_type='sqr')
     x = mx.nd.array(np.random.uniform(-1, 1, size=(4, 10)))
-    check_numeric_gradient(op, [x])
+    aux = mx.nd.zeros_like(x)
+    check_numeric_gradient(op, [x], [aux])
 
-    data = mx.symbol.Variable('data')
     data = mx.symbol.cast(data, dtype='float64')
-    op = mx.symbol.Custom(data=data, name='sqr', op_type='sqr')
     op = mx.symbol.cast(op, dtype='float32')
-    x = mx.nd.array(np.random.uniform(-1, 1, size=(4, 10)))
-    check_numeric_gradient(op, [x])
+    check_numeric_gradient(op, [x], [aux])
 
-    dx = mx.nd.zeros_like(x)
-    mx.contrib.autograd.mark_variables([x], [dx])
+    x = x.tostype('csr')
+    aux = mx.nd.zeros_like(x)
+    x.attach_grad()
     with mx.contrib.autograd.train_section():
-        y = mx.nd.Custom(x, op_type='sqr')
+        y = mx.nd.Custom(x, aux, op_type='sqr')
         y.backward()
-
+    mx.nd.waitall()
+    assert (x.grad.stype == 'csr')
+    assert (y.stype == 'csr')
+    assert (aux.stype == 'csr')
 
 def test_psroipooling():
     for num_rois in [1, 2]:
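
A note on why check_bmaxmin_gradient shifts y before calling
check_numeric_gradient: central differences evaluate the symbol at inputs
perturbed by +/- delta, so if any element pair sits within delta of the
boundary y = x, broadcast_maximum/minimum can switch branches between the two
evaluations and the finite-difference estimate becomes meaningless. The
helper keeps every pair at least 2*delta apart so no perturbation can cross
the boundary. A quick numpy illustration of the spacing argument:

    import numpy as np

    delta = 1e-3
    x = np.random.uniform(-1, 1, (3, 4))
    # same construction as the helper: y above every x by at least 2*delta
    y = x.max() + 2 * delta + np.random.random(x.shape)
    assert (np.abs(x - y) > delta).all()   # no +/- delta step can cross y = x
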
diff --git a/tests/python/unittest/test_sparse_ndarray.py b/tests/python/unittest/test_sparse_ndarray.py
index 7576050f55..e59e476601 100644
--- a/tests/python/unittest/test_sparse_ndarray.py
+++ b/tests/python/unittest/test_sparse_ndarray.py
@@ -22,7 +22,7 @@
 from mxnet.base import mx_real_t
 from numpy.testing import assert_allclose
 import numpy.random as rnd
-
+from common import assertRaises
 from mxnet.ndarray.sparse import RowSparseNDArray, CSRNDArray
 
 
@@ -233,6 +233,7 @@ def check_binary(fn, stype):
             oshape = np.random.randint(1, 6, size=(ndim,))
             bdim = 2
             lshape = list(oshape)
+            # one for broadcast op, another for elemwise op
             rshape = list(oshape[ndim-bdim:])
             for i in range(bdim):
                 sep = np.random.uniform(0, 1)
@@ -736,19 +737,92 @@ def test_powerlaw_generator(csr_arr, final_row=1):
     test_powerlaw_generator(csr_arr_big, final_row=4)
     test_powerlaw_generator(csr_arr_square, final_row=6)
 
+def test_sparse_nd_fluent():
+    def check_fluent_regular(stype, func, kwargs, shape=(5, 17), equal_nan=False):
+        with mx.name.NameManager():
+            data = mx.nd.random_uniform(shape=shape, ctx=default_context()).tostype(stype)
+            regular = getattr(mx.ndarray, func)(data, **kwargs)
+            fluent = getattr(data, func)(**kwargs)
+            if isinstance(regular, list):
+                for r, f in zip(regular, fluent):
+                    assert almost_equal(r.asnumpy(), f.asnumpy(), equal_nan=equal_nan)
+            else:
+                assert almost_equal(regular.asnumpy(), fluent.asnumpy(), equal_nan=equal_nan)
+
+    common_func = ['zeros_like', 'square']
+    rsp_func = ['round', 'rint', 'fix', 'floor', 'ceil', 'trunc',
+                'abs', 'sign', 'sin', 'degrees', 'radians', 'expm1']
+    for func in common_func:
+        check_fluent_regular('csr', func, {})
+    for func in common_func + rsp_func:
+        check_fluent_regular('row_sparse', func, {})
+
+    rsp_func = ['arcsin', 'arctan', 'tan', 'sinh', 'tanh',
+                'arcsinh', 'arctanh', 'log1p', 'sqrt', 'relu']
+    for func in rsp_func:
+        check_fluent_regular('row_sparse', func, {}, equal_nan=True)
+
+    check_fluent_regular('csr', 'slice', {'begin': (2, 5), 'end': (4, 7)}, shape=(5, 17))
+    check_fluent_regular('row_sparse', 'clip', {'a_min': -0.25, 'a_max': 0.75})
+
+    for func in ['sum', 'mean']:
+        check_fluent_regular('csr', func, {'axis': 0})
+
+
 def test_sparse_nd_exception():
     """ test invalid sparse operator will throw a exception """
     a = mx.nd.ones((2,2))
-    assert_exception(mx.nd.sparse.retain, mx.base.MXNetError,
-                     a, invalid_arg="garbage_value")
-    assert_exception(mx.nd.sparse.csr_matrix, ValueError,
-                     a, shape=(3,2))
-    assert_exception(mx.nd.sparse.csr_matrix, ValueError,
-                     (2,2), shape=(3,2))
-    assert_exception(mx.nd.sparse.row_sparse_array, ValueError,
-                     (2,2), shape=(3,2))
-    assert_exception(mx.nd.sparse.zeros, ValueError,
-                     "invalid_stype", (2,2))
+    assertRaises(mx.base.MXNetError, mx.nd.sparse.retain, a, invalid_arg="garbage_value")
+    assertRaises(ValueError, mx.nd.sparse.csr_matrix, a, shape=(3,2))
+    assertRaises(ValueError, mx.nd.sparse.csr_matrix, (2,2), shape=(3,2))
+    assertRaises(ValueError, mx.nd.sparse.row_sparse_array, (2,2), shape=(3,2))
+    assertRaises(ValueError, mx.nd.sparse.zeros, "invalid_stype", (2,2))
+
+def test_sparse_nd_check_format():
+    """ test check_format for sparse ndarray """
+    shape = rand_shape_2d()
+    stypes = ["csr", "row_sparse"]
+    for stype in stypes:
+        arr, _ = rand_sparse_ndarray(shape, stype)
+        arr.check_format()
+        arr = mx.nd.sparse.zeros(stype, shape)
+        arr.check_format()
+    # CSR format indptr values must be non-decreasing and must not exceed the number of stored values
+    shape = (3, 4)
+    data_list = [7, 8, 9]
+    indices_list = [0, 2, 1]
+    indptr_list = [0, 5, 2, 3]
+    a = mx.nd.sparse.csr_matrix((data_list, indices_list, indptr_list), shape=shape)
+    assertRaises(mx.base.MXNetError, a.check_format)
+    # CSR format indices should be in ascending order per row
+    indices_list = [2, 1, 1]
+    indptr_list = [0, 2, 2, 3]
+    a = mx.nd.sparse.csr_matrix((data_list, indices_list, indptr_list), shape=shape)
+    assertRaises(mx.base.MXNetError, a.check_format)
+    # CSR format indptr should end with a value equal to the size of the indices array
+    indices_list = [1, 2, 1]
+    indptr_list = [0, 2, 2, 4]
+    a = mx.nd.sparse.csr_matrix((data_list, indices_list, indptr_list), shape=shape)
+    assertRaises(mx.base.MXNetError, a.check_format)
+    # CSR format indptr values should not be negative
+    indices_list = [0, 2, 1]
+    indptr_list = [0, -2, 2, 3]
+    a = mx.nd.sparse.csr_matrix((data_list, indices_list, indptr_list), shape=shape)
+    assertRaises(mx.base.MXNetError, a.check_format)
+    # Row Sparse format indices should be less than the number of rows
+    shape = (3, 2)
+    data_list = [[1, 2], [3, 4]]
+    indices_list = [1, 4]
+    a = mx.nd.sparse.row_sparse_array((data_list, indices_list), shape=shape)
+    assertRaises(mx.base.MXNetError, a.check_format)
+    # Row Sparse format indices should be in ascending order
+    indices_list = [1, 0]
+    a = mx.nd.sparse.row_sparse_array((data_list, indices_list), shape=shape)
+    assertRaises(mx.base.MXNetError, a.check_format)
+    # Row Sparse format indices should not be negative
+    indices_list = [1, -2]
+    a = mx.nd.sparse.row_sparse_array((data_list, indices_list), shape=shape)
+    assertRaises(mx.base.MXNetError, a.check_format)
 
 
 if __name__ == '__main__':
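
For contrast with the failing cases above, a well-formed CSR triple for the
same (3, 4) shape passes check_format; the invariants are that indptr has
length rows + 1, starts at 0, is non-decreasing, and ends at len(data), and
that indices are non-negative, smaller than the column count, and ascending
within each row. A sketch whose values mirror the test data:

    import mxnet as mx

    data    = [7, 8, 9]
    indices = [0, 2, 1]
    indptr  = [0, 1, 2, 3]       # one stored element per row
    a = mx.nd.sparse.csr_matrix((data, indices, indptr), shape=(3, 4))
    a.check_format()             # passes: all invariants hold
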
diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py
index 0db9f451dd..a08b6187bc 100644
--- a/tests/python/unittest/test_sparse_operator.py
+++ b/tests/python/unittest/test_sparse_operator.py
@@ -1195,6 +1195,7 @@ def check_cast_storage(shape, density, from_stype, to_stype, check_numeric_grad=
             check_cast_storage((dim0, rnd.randint(512, 1024)), d, 'default', 'row_sparse',
                                check_numeric_grad=False)
 
+
 def test_sparse_dot():
     def test_dot_csr(lhs_shape, rhs_shape, rhs_stype, trans_lhs, lhs_density, rhs_density):
         lhs_nd = rand_ndarray(lhs_shape, 'csr', density=lhs_density, shuffle_csr_indices=False)
@@ -1222,18 +1223,38 @@ def test_dot_csr(lhs_shape, rhs_shape, rhs_stype, trans_lhs, lhs_density, rhs_de
                                 grad_req={'lhs': 'null', 'rhs': 'write'},
                                 rtol=1e-3, atol=1e-4)
 
+    def test_sparse_dot_zero_output(lhs_shape, trans_lhs, rhs_num_cols):
+        """Test for nnr_out = 0. Before the fix, the test would fail."""
+        lhs = mx.nd.zeros(lhs_shape)
+        irow = np.random.randint(0, lhs_shape[0])
+        icol = np.random.randint(0, lhs_shape[1])
+        lhs[irow, icol] = 1.0
+        if trans_lhs:
+            rhs = rand_ndarray(shape=(lhs_shape[0], rhs_num_cols), stype='default')
+            rhs[irow, :] = 0
+        else:
+            rhs = rand_ndarray(shape=(lhs_shape[1], rhs_num_cols), stype='default')
+            rhs[icol, :] = 0
+        dns_out = mx.nd.dot(lhs, rhs, transpose_a=trans_lhs)
+        assert mx.nd.sum(mx.nd.abs(dns_out)).asscalar() == 0
+        sps_out = mx.nd.sparse.dot(lhs.tostype('csr'), rhs.tostype('row_sparse'), transpose_a=trans_lhs)
+        assert same(dns_out.asnumpy(), sps_out.asnumpy())
+
     density = [1.00, 0.50, 0.01]
     for lhs_d in density:
         lhs_shape = rand_shape_2d(50, 200)
         rhs_d = 1
-        test_dot_csr(lhs_shape, (lhs_shape[1], 1), 'default', False, lhs_d, rhs_d) # test gpu SpMV
-        test_dot_csr(lhs_shape, (lhs_shape[0], 1), 'default', True , lhs_d, rhs_d) # (vector kernel)
-        test_dot_csr(lhs_shape, (lhs_shape[1], rnd.randint(5, 10)), 'default', False, lhs_d, rhs_d) # test gpu SpMM
-        test_dot_csr(lhs_shape, (lhs_shape[0], rnd.randint(5, 10)), 'default', True , lhs_d, rhs_d) # (scalar kernel)
+        test_dot_csr(lhs_shape, (lhs_shape[1], 1), 'default', False, lhs_d, rhs_d)  # test gpu SpMV
+        test_dot_csr(lhs_shape, (lhs_shape[0], 1), 'default', True,  lhs_d, rhs_d)  # (vector kernel)
+        test_dot_csr(lhs_shape, (lhs_shape[1], rnd.randint(5, 10)), 'default', False, lhs_d, rhs_d)  # test gpu SpMM
+        test_dot_csr(lhs_shape, (lhs_shape[0], rnd.randint(5, 10)), 'default', True, lhs_d, rhs_d)  # (scalar kernel)
         for rhs_d in density:
             test_dot_csr(lhs_shape, (lhs_shape[1], rnd.randint(1, 10)), 'row_sparse', False, lhs_d, rhs_d)
             test_dot_csr(lhs_shape, (lhs_shape[0], rnd.randint(1, 10)), 'row_sparse', True, lhs_d, rhs_d)
 
+    test_sparse_dot_zero_output(rand_shape_2d(50, 200), False, 40)
+    test_sparse_dot_zero_output(rand_shape_2d(50, 200), True, 40)
+
 
 def test_sparse_slice():
     def check_csr_slice(shape, slice_input):
@@ -1353,6 +1374,7 @@ def check_sparse_function(name, mxnet_func, forward_numpy_call, backward_numpy_c
                           lambda output, outg: outg * assign_each(output, lambda x: x * (1.0 - x)),
                           backward_is_use_output=True)
 
+
 def test_sparse_nd_zeros():
     def check_sparse_nd_zeros(stype, shape):
         zero = mx.nd.zeros(shape)
@@ -1364,6 +1386,7 @@ def check_sparse_nd_zeros(stype, shape):
     check_sparse_nd_zeros('csr', shape)
     check_sparse_nd_zeros('default', shape)
 
+
 def test_sparse_nd_zeros_like():
     def check_sparse_nd_zeros_like(stype, shape):
         zero = mx.nd.zeros(shape, stype=stype)
@@ -1374,6 +1397,7 @@ def check_sparse_nd_zeros_like(stype, shape):
     check_sparse_nd_zeros_like('row_sparse', shape)
     check_sparse_nd_zeros_like('csr', shape)
 
+
 def test_sparse_axis_operations():
     def test_variations(func_name):
         dim0 = 30
@@ -1403,13 +1427,14 @@ def test_fallback(func_name, axis=0, keepdims=True, exclude=True):
     test_variations(mx.nd.mean)
     test_fallback(mx.nd.mean, axis=0, keepdims=True, exclude=True)
 
+
 def test_sparse_square_sum():
     if default_context().device_type == 'cpu':
         dim0 = 30
         dim1 = 30
         axes = [0, 1]
         keepdims = [False, True]
-        densities = [0, 0.01, 0.1, 0.2, 0.5]
+        densities = [0, 0.01, 0.2, 0.5, 1.0]
         for density in densities:
             shape = rand_shape_2d(dim0, dim1)
             rsp = rand_ndarray(shape, 'row_sparse', density)
@@ -1428,11 +1453,11 @@ def test_sparse_square_sum():
                     rsp_data = mx.sym.Variable('data', stype='row_sparse')
                     test = mx.symbol._internal._square_sum(rsp_data, axis=axis, keepdims=keepdim)
 
-                    # check symbolic backward since ograd can be a rsp
+                    # check symbolic backward since ograd can be an rsp
                     # and cannot be checked through check_numeric_gradient
                     # because it will add a loss layer as the output layer
                     # which makes ograd of the square_sum dense
-                    if axis == 1 and keepdims:
+                    if axis == 1 and keepdim:
                         dns_data = mx.sym.Variable('data')
                         baseline = mx.sym.sum(mx.sym.square(dns_data), axis=axis, keepdims=keepdim)
                         igrad_expected = mx.nd.empty(dns.shape)
@@ -1440,13 +1465,29 @@ def test_sparse_square_sum():
                                                       args_grad=[igrad_expected])
                         baseline_exec.forward(is_train=True)
                         baseline_exec.backward([ret_expected])
-                        check_symbolic_backward(test, [rsp], [ret], [igrad_expected.asnumpy()],
+                        # check backward when ograd is row sparse
+                        check_symbolic_backward(test, [rsp], [ret_expected.tostype('row_sparse')],
+                                                [igrad_expected.asnumpy()], grad_stypes={'data': 'row_sparse'})
+
+                        # check backward when ograd is dense
+                        # the output stype of square_sum is determined at symbol-binding stage.
+                        # The ograd stype of the last layer is the same as that layer's output
+                        # stype, so one more layer is added after square_sum to trigger the
+                        # kernel for a default-stype ograd in the square_sum op.
+                        baseline1 = baseline + 1
+                        baseline_exec1 = baseline1.bind(default_context(), args=[dns],
+                                                        args_grad=[igrad_expected])
+                        baseline_exec1.forward(is_train=True)
+                        baseline_exec1.backward([ret_expected])
+                        test1 = test + 1
+                        check_symbolic_backward(test1, [rsp], [ret_expected], [igrad_expected.asnumpy()],
                                                 grad_stypes={'data': 'row_sparse'})
 
                     # check numeric gradient
                     check_numeric_gradient(test, [rsp], grad_stype_dict={'data': 'row_sparse'},
                                            atol=1e-2, rtol=0.1)
 
+
 def test_sparse_storage_fallback():
     """ test operators which don't implement FComputeEx or FStatefulComputeEx """
     def check_broadcast_add(shape, lhs_stype, rhs_stype):
@@ -1515,6 +1556,7 @@ def check_operator_with_temp_resource(shape, stype):
             check_softmax_with_shape(lhs, rhs, shape, preserve_shape=False)
             check_softmax_with_shape(rhs, rhs, shape, preserve_shape=True)
 
+
 def test_sparse_elementwise_sum():
     def check_sparse_elementwise_sum_with_shape(stype, shape, n):
         # forward
@@ -1545,45 +1587,46 @@ def check_sparse_elementwise_sum_with_shape(stype, shape, n):
         shape = tuple(np.random.randint(5, 10, size=dim))
         check_sparse_elementwise_sum_with_shape('row_sparse', shape, np.random.randint(1, 9))
 
+
 def test_sparse_embedding():
     ''' test sparse embedding op on cpu '''
     def check_sparse_embedding(executor, weight_ref, data_onehot, grad, density):
         # update weight based on density
         weight[:] = rand_ndarray(weight.shape, 'row_sparse', density=density)
         # check forward
-        exe_test.forward(is_train=True)
-        assert_almost_equal(exe_test.outputs[0].asnumpy(), np.dot(data_onehot, weight.asnumpy()))
+        executor.forward(is_train=True)
+        assert_almost_equal(executor.outputs[0].asnumpy(), np.dot(data_onehot, weight.asnumpy()))
         # check backward
         executor.backward([grad])
         assert_almost_equal(grad_map["embed_weight"].asnumpy(), np.dot(data_onehot.T, grad.asnumpy()))
 
-    if default_context().device_type == 'cpu':
-        densities = [0, 0.5, 1]
-        in_dim = 50
-        out_dim = 3
-        batch = 8
-        # init executor
-        data = mx.sym.Variable("data")
-        weight = mx.sym.Variable("embed_weight", stype='row_sparse')
-        embed = mx.sym.contrib.SparseEmbedding(data=data, weight=weight, input_dim=in_dim,
-                                               output_dim=out_dim, name="embed")
-        grad_req = {'data': 'null', 'embed_weight': 'write'}
-        exe_test = embed.simple_bind(default_context(), grad_req=grad_req, data=(batch,))
-        arg_map = dict(zip(embed.list_arguments(), exe_test.arg_arrays))
-        grad_map = dict(zip(embed.list_arguments(), exe_test.grad_arrays))
-        # init data
-        np_data = np.random.randint(low=0, high=in_dim, size=batch)
-        np_onehot = np.zeros((batch, in_dim))
-        np_onehot[np.arange(batch), np_data] = 1.0
-        arg_map["data"][:] = np_data
-        # init grad
-        np_grad = np.random.uniform(-1, 1, exe_test.outputs[0].shape)
-        grad = mx.nd.sparse.zeros('row_sparse', np_grad.shape)
-        grad[:] = np_grad
-        # weight
-        weight = arg_map["embed_weight"]
-        for density in densities:
-            check_sparse_embedding(exe_test, weight, np_onehot, grad, density)
+    densities = [0, 0.5, 1]
+    in_dim = 50
+    out_dim = 3
+    batch = 8
+    # init executor
+    data = mx.sym.Variable("data")
+    weight = mx.sym.Variable("embed_weight", stype='row_sparse')
+    embed = mx.sym.contrib.SparseEmbedding(data=data, weight=weight, input_dim=in_dim,
+                                           output_dim=out_dim, name="embed")
+    grad_req = {'data': 'null', 'embed_weight': 'write'}
+    exe_test = embed.simple_bind(default_context(), grad_req=grad_req, data=(batch,))
+    arg_map = dict(zip(embed.list_arguments(), exe_test.arg_arrays))
+    grad_map = dict(zip(embed.list_arguments(), exe_test.grad_arrays))
+    # init data
+    np_data = np.random.randint(low=0, high=in_dim, size=batch)
+    np_onehot = np.zeros((batch, in_dim))
+    np_onehot[np.arange(batch), np_data] = 1.0
+    arg_map["data"][:] = np_data
+    # init grad
+    np_grad = np.random.uniform(-1, 1, exe_test.outputs[0].shape)
+    grad = mx.nd.sparse.zeros('row_sparse', np_grad.shape)
+    grad[:] = np_grad
+    # weight
+    weight = arg_map["embed_weight"]
+    for density in densities:
+        check_sparse_embedding(exe_test, weight, np_onehot, grad, density)
+
 
 def test_scatter_ops():
     def csr_get_seen_points(name, csr_array, verbose=False):
@@ -1729,6 +1772,7 @@ def check_scatter_ops(name, shape, lhs_stype, rhs_stype, forward_mxnet_call, for
                           lambda l, r: l + r,
                           rhs_is_scalar=True, verbose=False, density=0.5)
 
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
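
The nnr_out = 0 corner case in test_sparse_dot_zero_output above is worth
spelling out: when the only nonzero column of the csr lhs lines up with an
all-zero row of the rhs, the product is exactly zero, so the row_sparse
result has zero non-zero rows. A standalone sketch of that construction
(shapes are illustrative):

    import mxnet as mx
    import numpy as np

    lhs = mx.nd.zeros((5, 6))
    lhs[2, 3] = 1.0                     # single nonzero at column 3
    rhs = mx.nd.array(np.random.uniform(size=(6, 4)))
    rhs[3, :] = 0                       # zero out the row that column 3 hits
    out = mx.nd.sparse.dot(lhs.tostype('csr'), rhs.tostype('row_sparse'))
    assert mx.nd.sum(mx.nd.abs(out)).asscalar() == 0
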
diff --git a/tools/accnn/rank_selection.py b/tools/accnn/rank_selection.py
index 66937b2859..c5c026114a 100644
--- a/tools/accnn/rank_selection.py
+++ b/tools/accnn/rank_selection.py
@@ -81,7 +81,7 @@ def get_ranksel(model, ratio):
         if nxt_c > EC:
           continue
         nxt_v = dp[now][now_c] + math.log(S[i][d])
-        if dp[nxt].has_key(nxt_c):
+        if nxt_c in dp[nxt]:
           if nxt_v > dp[nxt][nxt_c]:
             dp[nxt][nxt_c] = nxt_v
             dpc[i][nxt_c] = (d,now_c)
diff --git a/tools/accnn/utils.py b/tools/accnn/utils.py
index 25fb188956..2795f8558f 100644
--- a/tools/accnn/utils.py
+++ b/tools/accnn/utils.py
@@ -20,6 +20,7 @@
 import json
 import ast
 
+
 def load_model(args):
   devs = mx.cpu() if args.gpus == None else [mx.gpu(int(i)) for i in args.gpus.split(',')]
   return mx.model.FeedForward.load(args.model, args.load_epoch, ctx=devs)
@@ -29,7 +30,7 @@ def topsort(nodes):
   deg = [0]*n
   g = [[] for _ in xrange(n)]
   for i,node in enumerate(nodes):
-    if node.has_key('inputs'):
+    if 'inputs' in node:
       for j in node['inputs']:
         deg[i] += 1
         g[j[0]].append(i)
@@ -45,7 +46,7 @@ def topsort(nodes):
         q.append(j)
   new_ids=dict([(node['name'],i) for i,node in enumerate(res)])
   for node in res:
-    if node.has_key('inputs'):
+    if 'inputs' in node:
       for j in node['inputs']:
         j[0]=new_ids[nodes[j[0]]['name']]
   return res
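
One caveat on the utils.py changes: the surrounding code still calls xrange,
which does not exist in Python 3, so the has_key fixes alone do not make the
module Python-3 compatible. A common shim, offered here only as an assumption
about how one might bridge the gap (not part of this PR):

    try:                 # Python 2: xrange is built in
        xrange
    except NameError:    # Python 3: fall back to range
        xrange = range
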
diff --git a/tools/coreml/converter/__init__.py b/tools/coreml/converter/__init__.py
index 245692337b..13a83393a9 100644
--- a/tools/coreml/converter/__init__.py
+++ b/tools/coreml/converter/__init__.py
@@ -14,4 +14,3 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
diff --git a/tools/coreml/converter/_layers.py b/tools/coreml/converter/_layers.py
index fe00232828..4c5ebc6fb0 100644
--- a/tools/coreml/converter/_layers.py
+++ b/tools/coreml/converter/_layers.py
@@ -38,6 +38,30 @@ def _get_node_name(net, node_id):
 def _get_node_shape(net, node_id):
     return net['nodes'][node_id]['shape']
 
+def _get_attrs(node):
+    """get attribute dict from node
+
+    This functions keeps backward compatibility
+    for both attr and attrs key in the json field.
+
+    Parameters
+    ----------
+    node : dict
+       The json graph Node
+
+    Returns
+    -------
+    attrs : dict
+       The attribute dict; returns an empty dict if
+       the field does not exist.
+    """
+    if 'attrs' in node:
+        return node['attrs']
+    elif 'attr' in node:
+        return node['attr']
+    else:
+        return {}
+
 
 # TODO These operators still need to be converted (listing in order of priority):
 # High priority:
@@ -108,7 +132,7 @@ def convert_transpose(net, node, module, builder):
     """
     input_name, output_name = _get_input_output_name(net, node)
     name = node['name']
-    param = node['attr']
+    param = _get_attrs(node)
 
     axes = literal_eval(param['axes'])
     builder.add_permute(name, axes, input_name, output_name)
@@ -180,7 +204,7 @@ def convert_activation(net, node, module, builder):
     """
     input_name, output_name = _get_input_output_name(net, node)
     name = node['name']
-    mx_non_linearity = node['attr']['act_type']
+    mx_non_linearity = _get_attrs(node)['act_type']
     #TODO add SCALED_TANH, SOFTPLUS, SOFTSIGN, SIGMOID_HARD, LEAKYRELU, PRELU, ELU, PARAMETRICSOFTPLUS, THRESHOLDEDRELU, LINEAR
     if mx_non_linearity == 'relu':
         non_linearity = 'RELU'
@@ -281,7 +305,7 @@ def convert_convolution(net, node, module, builder):
     """
     input_name, output_name = _get_input_output_name(net, node)
     name = node['name']
-    param = node['attr']
+    param = _get_attrs(node)
     inputs = node['inputs']
     args, _ = module.get_params()
 
@@ -361,7 +385,7 @@ def convert_pooling(net, node, module, builder):
     """
     input_name, output_name = _get_input_output_name(net, node)
     name = node['name']
-    param = node['attr']
+    param = _get_attrs(node)
 
     layer_type_mx = param['pool_type']
     if layer_type_mx == 'max':
@@ -445,9 +469,9 @@ def convert_batchnorm(net, node, module, builder):
 
     eps = 1e-3 # Default value of eps for MXNet.
     use_global_stats = False # Default value of use_global_stats for MXNet.
-    if 'attr' in node:
-        if 'eps' in node['attr']:
-            eps = literal_eval(node['attr']['eps'])
+    attrs = _get_attrs(node)
+    if 'eps' in attrs:
+        eps = literal_eval(attrs['eps'])
 
     args, aux = module.get_params()
     gamma = args[_get_node_name(net, inputs[1][0])].asnumpy()
@@ -511,7 +535,7 @@ def convert_deconvolution(net, node, module, builder):
     """
     input_name, output_name = _get_input_output_name(net, node)
     name = node['name']
-    param = node['attr']
+    param = _get_attrs(node)
     inputs = node['inputs']
     args, _ = module.get_params()
 
diff --git a/tools/im2rec.cc b/tools/im2rec.cc
index 8568140240..915b78029c 100644
--- a/tools/im2rec.cc
+++ b/tools/im2rec.cc
@@ -18,6 +18,7 @@
  */
 
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file im2rec.cc
  * \brief convert images into image recordio format
  *  Image Record Format: zeropad[64bit] imid[64bit] img-binary-content
diff --git a/tools/license_header.py b/tools/license_header.py
index db67000837..e26fd2beca 100644
--- a/tools/license_header.py
+++ b/tools/license_header.py
@@ -119,13 +119,6 @@ def process_file(fname, action, verbose=True):
     elif action == 'check':
         return False
     _, ext = os.path.splitext(fname)
-    # remove old license
-    if ext == '.h' or ext == '.cc' or ext == '.cu' or ext == '.cpp' \
-        or ext == '.hpp':
-        for i, l in enumerate(lines):
-            if _OLD_LICENSE.match(l.decode('utf-8')):
-                del lines[i]
-                break
     with open(fname, 'wb') as f:
         # shebang line
         if lines[0].startswith(b'#!'):


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services