You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by la...@apache.org on 2020/10/12 18:19:45 UTC
[incubator-mxnet] branch master updated: Fix python API doc and all rst warnings for sphinx website build (#19329)

This is an automated email from the ASF dual-hosted git repository.

lausen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 5ed72b1  Fix python API doc and all rst warnings for sphinx website build (#19329)
5ed72b1 is described below

commit 5ed72b16d297fa244c1217bbf692cd909bd06d46
Author: Sheng Zha <sz...@users.noreply.github.com>
AuthorDate: Mon Oct 12 14:18:30 2020 -0400

    Fix python API doc and all rst warnings for sphinx website build (#19329)
    
    Automated CI checks treating all warnings as errors will be introduced in a separate commit.
---
 docs/python_docs/_static/transformer.png           |  Bin 0 -> 296782 bytes
 docs/python_docs/python/Makefile                   |    1 +
 docs/python_docs/python/Makefile_sphinx            |    2 +-
 docs/python_docs/python/api/autograd/index.rst     |    2 +-
 .../python/api/contrib/autograd/index.rst          |   23 -
 docs/python_docs/python/api/contrib/index.rst      |   14 +-
 .../python/api/contrib/quantization/index.rst      |   23 -
 .../python_docs/python/api/gluon/contrib/index.rst |    1 +
 docs/python_docs/python/api/gluon/data/index.rst   |    7 +-
 .../python/api/gluon/data/vision/index.rst         |   15 +-
 docs/python_docs/python/api/gluon/hybrid_block.rst |    2 +-
 docs/python_docs/python/api/gluon/index.rst        |    3 +-
 docs/python_docs/python/api/gluon/nn/index.rst     |    2 +-
 .../python/api/gluon/parameter_dict.rst            |   25 -
 docs/python_docs/python/api/gluon/symbol_block.rst |    3 +-
 docs/python_docs/python/api/gluon/trainer.rst      |    3 +-
 docs/python_docs/python/api/index.rst              |   13 +-
 .../python/api/legacy/callback/index.rst           |    2 +-
 docs/python_docs/python/api/legacy/index.rst       |    6 -
 .../python/api/legacy/monitor/index.rst            |   23 -
 .../python/api/legacy/ndarray/contrib/index.rst    |    2 +-
 .../python/api/legacy/ndarray/image/index.rst      |    2 +-
 .../python/api/legacy/ndarray/linalg/index.rst     |    2 +-
 .../python/api/legacy/ndarray/ndarray.rst          |    2 +-
 .../python/api/legacy/ndarray/op/index.rst         |    2 +-
 .../python/api/legacy/ndarray/random/index.rst     |    2 +-
 .../python/api/legacy/ndarray/register/index.rst   |    2 +-
 .../python/api/legacy/ndarray/sparse/index.rst     |    2 +-
 .../python/api/legacy/ndarray/utils/index.rst      |    2 +-
 docs/python_docs/python/api/module/index.rst       |   24 -
 docs/python_docs/python/api/np/arrays.indexing.rst |   31 +-
 docs/python_docs/python/api/np/arrays.ndarray.rst  |  170 +--
 docs/python_docs/python/api/np/arrays.rst          |   32 +-
 docs/python_docs/python/api/np/index.rst           |   24 +-
 docs/python_docs/python/api/np/random/index.rst    |   63 +-
 .../python/api/np/routines.array-creation.rst      |   34 +-
 .../python/api/np/routines.array-manipulation.rst  |   67 +-
 docs/python_docs/python/api/np/routines.io.rst     |  109 +-
 docs/python_docs/python/api/np/routines.linalg.rst |   43 +-
 docs/python_docs/python/api/np/routines.math.rst   |   80 +-
 docs/python_docs/python/api/np/routines.rst        |   41 +-
 docs/python_docs/python/api/np/routines.sort.rst   |   26 +-
 .../python/api/np/routines.statistics.rst          |   30 +-
 docs/python_docs/python/api/npx/index.rst          |   18 +-
 docs/python_docs/python/api/runtime/index.rst      |   42 +-
 docs/python_docs/python/scripts/conf.py            |    5 +-
 .../getting-started/crash-course/index.rst         |   19 +-
 .../tutorials/getting-started/to-mxnet/index.rst   |    2 -
 docs/python_docs/python/tutorials/index.rst        |    3 +-
 .../tutorials/packages/gluon/text/transformer.rst  |    2 +-
 .../python/tutorials/packages/index.rst            |    1 +
 .../python/tutorials/packages/viz/index.rst        |    4 +-
 .../python/tutorials/performance/backend/index.rst |    9 +-
 .../performance/backend/tensorrt/index.rst         |   35 -
 .../backend/tensorrt/wavenet_optimized.svg         |    1 -
 .../backend/tensorrt/wavenet_unoptimized.svg       |   17 -
 python/mxnet/__init__.py                           |    2 +
 python/mxnet/gluon/data/dataloader.py              |   31 +-
 python/mxnet/gluon/metric.py                       |   16 +-
 python/mxnet/gluon/trainer.py                      |    2 +-
 python/mxnet/io/io.py                              |    4 +-
 python/mxnet/ndarray/ndarray.py                    |   90 +-
 python/mxnet/numpy/fallback.py                     |  122 +-
 python/mxnet/numpy/linalg.py                       |  169 ++-
 python/mxnet/numpy/multiarray.py                   | 1240 ++++++++++----------
 python/mxnet/optimizer/lars.py                     |   16 +-
 python/mxnet/symbol/symbol.py                      |   40 +-
 python/mxnet/util.py                               |   33 +-
 src/operator/contrib/bounding_box.cc               |   28 +-
 src/operator/contrib/dynamic_shape_ops.cc          |   26 +-
 src/operator/image/crop.cc                         |   58 +-
 src/operator/image/image_random.cc                 |  189 +--
 src/operator/image/resize.cc                       |   57 +-
 src/operator/numpy/np_elemwise_unary_op_basic.cc   |    3 +
 src/operator/tensor/matrix_op.cc                   |  124 +-
 .../python/unittest/test_numpy_interoperability.py |   52 +-
 tests/python/unittest/test_numpy_op.py             |  110 +-
 tools/license_header.py                            |    4 +-
 78 files changed, 1618 insertions(+), 1918 deletions(-)

diff --git a/docs/python_docs/_static/transformer.png b/docs/python_docs/_static/transformer.png
new file mode 100644
index 0000000..2561c01
Binary files /dev/null and b/docs/python_docs/_static/transformer.png differ
diff --git a/docs/python_docs/python/Makefile b/docs/python_docs/python/Makefile
index c2472b6..154bcf2 100644
--- a/docs/python_docs/python/Makefile
+++ b/docs/python_docs/python/Makefile
@@ -46,6 +46,7 @@ build/%: %
 html: $(OBJ)
 	mkdir -p build;
 	cp Makefile_sphinx build/Makefile;
+	cp -n -r ../_static build/ || true;
 	sphinx-autogen build/api/*.rst build/api/**/*.rst   -t build/_templates/;
 	# make -C build linkcheck doctest html
 	make -C build html;
diff --git a/docs/python_docs/python/Makefile_sphinx b/docs/python_docs/python/Makefile_sphinx
index 0e4558b..a90366f 100644
--- a/docs/python_docs/python/Makefile_sphinx
+++ b/docs/python_docs/python/Makefile_sphinx
@@ -37,7 +37,7 @@ endif # $(NUMJOBS)
 # End number of processors detection
 
 # You can set these variables from the command line.
-SPHINXOPTS    = -j$(NPROCS) -c ../scripts
+SPHINXOPTS    = -j$(NPROCS) -c ../scripts --keep-going
 SPHINXBUILD   = sphinx-build
 PAPER         =
 BUILDDIR      = _build
diff --git a/docs/python_docs/python/api/autograd/index.rst b/docs/python_docs/python/api/autograd/index.rst
index c7964b4..f997dc0 100644
--- a/docs/python_docs/python/api/autograd/index.rst
+++ b/docs/python_docs/python/api/autograd/index.rst
@@ -16,7 +16,7 @@
    under the License.
 
 mxnet.autograd
-===============
+==============
 
 .. automodule:: mxnet.autograd
     :members:
diff --git a/docs/python_docs/python/api/contrib/autograd/index.rst b/docs/python_docs/python/api/contrib/autograd/index.rst
deleted file mode 100644
index 76b0f1e..0000000
--- a/docs/python_docs/python/api/contrib/autograd/index.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-   or more contributor license agreements.  See the NOTICE file
-   distributed with this work for additional information
-   regarding copyright ownership.  The ASF licenses this file
-   to you under the Apache License, Version 2.0 (the
-   "License"); you may not use this file except in compliance
-   with the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing,
-   software distributed under the License is distributed on an
-   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-   KIND, either express or implied.  See the License for the
-   specific language governing permissions and limitations
-   under the License.
-
-contrib.autograd
-================
-
-.. automodule:: mxnet.contrib.autograd
-    :members:
-    :autosummary:
diff --git a/docs/python_docs/python/api/contrib/index.rst b/docs/python_docs/python/api/contrib/index.rst
index a05dad0..1319239 100644
--- a/docs/python_docs/python/api/contrib/index.rst
+++ b/docs/python_docs/python/api/contrib/index.rst
@@ -16,7 +16,7 @@
    under the License.
 
 mxnet.contrib
-===============
+=============
 
 .. automodule:: mxnet.contrib
 
@@ -26,12 +26,6 @@ Contributed modules
 .. container:: cards
 
    .. card::
-      :title: contrib.autograd
-      :link: autograd/index.html
-
-      Functions for Automatic differentiation.
-
-   .. card::
       :title: contrib.io
       :link: io/index.html
 
@@ -50,12 +44,6 @@ Contributed modules
       ONNX support.
 
    .. card::
-      :title: contrib.quantization
-      :link: quantization/index.html
-
-      Functions for quantization.
-
-   .. card::
       :title: contrib.symbol
       :link: symbol/index.html
 
diff --git a/docs/python_docs/python/api/contrib/quantization/index.rst b/docs/python_docs/python/api/contrib/quantization/index.rst
deleted file mode 100644
index a0f7ca5..0000000
--- a/docs/python_docs/python/api/contrib/quantization/index.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-   or more contributor license agreements.  See the NOTICE file
-   distributed with this work for additional information
-   regarding copyright ownership.  The ASF licenses this file
-   to you under the Apache License, Version 2.0 (the
-   "License"); you may not use this file except in compliance
-   with the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing,
-   software distributed under the License is distributed on an
-   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-   KIND, either express or implied.  See the License for the
-   specific language governing permissions and limitations
-   under the License.
-
-contrib.quantization
-====================
-
-.. automodule:: mxnet.contrib.quantization
-    :members:
-    :autosummary:
diff --git a/docs/python_docs/python/api/gluon/contrib/index.rst b/docs/python_docs/python/api/gluon/contrib/index.rst
index 90c782d..4f82407 100644
--- a/docs/python_docs/python/api/gluon/contrib/index.rst
+++ b/docs/python_docs/python/api/gluon/contrib/index.rst
@@ -20,6 +20,7 @@ gluon.contrib
 
 This document lists the contrib APIs in Gluon:
 
+.. currentmodule:: mxnet.gluon.contrib
 
 .. autosummary::
     :nosignatures:
diff --git a/docs/python_docs/python/api/gluon/data/index.rst b/docs/python_docs/python/api/gluon/data/index.rst
index 7b31e17..85ae4fe 100644
--- a/docs/python_docs/python/api/gluon/data/index.rst
+++ b/docs/python_docs/python/api/gluon/data/index.rst
@@ -18,7 +18,12 @@
 gluon.data
 ==========
 
-.. automodule:: mxnet.gluon.data
+.. currentmodule:: mxnet.gluon.data
+
+.. autosummary::
+    :nosignatures:
+
+    mxnet.gluon.data
 
 Datasets
 --------
diff --git a/docs/python_docs/python/api/gluon/data/vision/index.rst b/docs/python_docs/python/api/gluon/data/vision/index.rst
index 2731b5f..0e6a9e6 100644
--- a/docs/python_docs/python/api/gluon/data/vision/index.rst
+++ b/docs/python_docs/python/api/gluon/data/vision/index.rst
@@ -16,9 +16,14 @@
    under the License.
 
 data.vision
-============
+===========
 
-.. automodule:: mxnet.gluon.data.vision
+.. currentmodule:: mxnet.gluon.data.vision
+
+.. autosummary::
+    :nosignatures:
+
+    mxnet.gluon.data.vision
 
 Datasets
 ^^^^^^^^
@@ -26,7 +31,7 @@ Datasets
 .. autosummary::
     :nosignatures:
 
-    mxnet.gluon.data.vision.datasets
+    datasets
 
 
 Data transformations
@@ -36,7 +41,7 @@ Data transformations
 .. autosummary::
     :nosignatures:
 
-    mxnet.gluon.data.vision.transforms
+    transforms
 
 
 API Reference
@@ -50,4 +55,4 @@ API Reference
    :maxdepth: 2
    :glob:
 
-   */index
\ No newline at end of file
+   */index
diff --git a/docs/python_docs/python/api/gluon/hybrid_block.rst b/docs/python_docs/python/api/gluon/hybrid_block.rst
index fa2e156..d8aeb31 100644
--- a/docs/python_docs/python/api/gluon/hybrid_block.rst
+++ b/docs/python_docs/python/api/gluon/hybrid_block.rst
@@ -22,4 +22,4 @@ gluon.HybridBlock
 .. autoclass:: mxnet.gluon.HybridBlock
     :members:
     :inherited-members:
-    :autosummary:
\ No newline at end of file
+    :autosummary:
diff --git a/docs/python_docs/python/api/gluon/index.rst b/docs/python_docs/python/api/gluon/index.rst
index 8a8b425..32af7ae 100644
--- a/docs/python_docs/python/api/gluon/index.rst
+++ b/docs/python_docs/python/api/gluon/index.rst
@@ -16,7 +16,7 @@
    under the License.
 
 mxnet.gluon
-============
+===========
 
 The Gluon library in Apache MXNet provides a clear, concise, and simple API for deep learning.
 It makes it easy to prototype, build, and train deep learning models without sacrificing training speed.
@@ -159,6 +159,5 @@ Utilities
    symbol_block
    constant
    parameter
-   parameter_dict
    trainer
    */index
diff --git a/docs/python_docs/python/api/gluon/nn/index.rst b/docs/python_docs/python/api/gluon/nn/index.rst
index 1c7051e..0094647 100644
--- a/docs/python_docs/python/api/gluon/nn/index.rst
+++ b/docs/python_docs/python/api/gluon/nn/index.rst
@@ -14,7 +14,7 @@
    KIND, either express or implied.  See the License for the
    specific language governing permissions and limitations
    under the License.
-   
+
 gluon.nn
 ========
 
diff --git a/docs/python_docs/python/api/gluon/parameter_dict.rst b/docs/python_docs/python/api/gluon/parameter_dict.rst
deleted file mode 100644
index 80f7194..0000000
--- a/docs/python_docs/python/api/gluon/parameter_dict.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-   or more contributor license agreements.  See the NOTICE file
-   distributed with this work for additional information
-   regarding copyright ownership.  The ASF licenses this file
-   to you under the Apache License, Version 2.0 (the
-   "License"); you may not use this file except in compliance
-   with the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing,
-   software distributed under the License is distributed on an
-   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-   KIND, either express or implied.  See the License for the
-   specific language governing permissions and limitations
-   under the License.
-
-gluon.ParameterDict
-===================
-
-
-.. autoclass:: mxnet.gluon.ParameterDict
-    :members:
-    :inherited-members:
-    :autosummary:
diff --git a/docs/python_docs/python/api/gluon/symbol_block.rst b/docs/python_docs/python/api/gluon/symbol_block.rst
index 2d94f20..10a8841 100644
--- a/docs/python_docs/python/api/gluon/symbol_block.rst
+++ b/docs/python_docs/python/api/gluon/symbol_block.rst
@@ -16,8 +16,7 @@
    under the License.
 
 gluon.SymbolBlock
-===========
-
+=================
 
 .. autoclass:: mxnet.gluon.SymbolBlock
     :members:
diff --git a/docs/python_docs/python/api/gluon/trainer.rst b/docs/python_docs/python/api/gluon/trainer.rst
index fe25b3d..a666ef6 100644
--- a/docs/python_docs/python/api/gluon/trainer.rst
+++ b/docs/python_docs/python/api/gluon/trainer.rst
@@ -16,8 +16,7 @@
    under the License.
 
 gluon.Trainer
-===========
-
+=============
 
 .. autoclass:: mxnet.gluon.Trainer
     :members:
diff --git a/docs/python_docs/python/api/index.rst b/docs/python_docs/python/api/index.rst
index f39578e..9183743 100644
--- a/docs/python_docs/python/api/index.rst
+++ b/docs/python_docs/python/api/index.rst
@@ -27,7 +27,7 @@ following categories:
 
 
 Imperative API
----------------
+--------------
 .. container:: cards
 
    .. card::
@@ -187,12 +187,6 @@ Legacy
       Functions to track various statuses during an epoch.
 
    .. card::
-      :title: mxnet.monitor
-      :link: legacy/monitor/index.html
-
-      Outputs, weights, and gradients for debugging
-
-   .. card::
       :title: mxnet.image
       :link: legacy/image/index.html
 
@@ -229,12 +223,7 @@ Legacy
    initializer/index
    optimizer/index
    lr_scheduler/index
-   metric/index
    kvstore/index
-   module/index
    contrib/index
-   image/index
-   ndarray/index
-   symbol/index
    legacy/index
    */index*
diff --git a/docs/python_docs/python/api/legacy/callback/index.rst b/docs/python_docs/python/api/legacy/callback/index.rst
index 99cefaf..583b8a6 100644
--- a/docs/python_docs/python/api/legacy/callback/index.rst
+++ b/docs/python_docs/python/api/legacy/callback/index.rst
@@ -16,7 +16,7 @@
    under the License.
 
 mxnet.callback
-===============
+==============
 
 .. automodule:: mxnet.callback
     :members:
diff --git a/docs/python_docs/python/api/legacy/index.rst b/docs/python_docs/python/api/legacy/index.rst
index 3026f07..d5ee4d7 100644
--- a/docs/python_docs/python/api/legacy/index.rst
+++ b/docs/python_docs/python/api/legacy/index.rst
@@ -41,12 +41,6 @@ This document hosts API for legacy modules that are being deprecated in MXNet 2.
       Functions to track various statuses during an epoch.
 
    .. card::
-      :title: mxnet.monitor
-      :link: monitor/index.html
-
-      Outputs, weights, and gradients for debugging
-
-   .. card::
       :title: mxnet.io
       :link: io/index.html
 
diff --git a/docs/python_docs/python/api/legacy/monitor/index.rst b/docs/python_docs/python/api/legacy/monitor/index.rst
deleted file mode 100644
index dda8cb6..0000000
--- a/docs/python_docs/python/api/legacy/monitor/index.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-   or more contributor license agreements.  See the NOTICE file
-   distributed with this work for additional information
-   regarding copyright ownership.  The ASF licenses this file
-   to you under the Apache License, Version 2.0 (the
-   "License"); you may not use this file except in compliance
-   with the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing,
-   software distributed under the License is distributed on an
-   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-   KIND, either express or implied.  See the License for the
-   specific language governing permissions and limitations
-   under the License.
-
-mxnet.monitor
-=============
-
-.. automodule:: mxnet.monitor
-    :members:
-    :autosummary:
diff --git a/docs/python_docs/python/api/legacy/ndarray/contrib/index.rst b/docs/python_docs/python/api/legacy/ndarray/contrib/index.rst
index a26ad22..22817ea 100644
--- a/docs/python_docs/python/api/legacy/ndarray/contrib/index.rst
+++ b/docs/python_docs/python/api/legacy/ndarray/contrib/index.rst
@@ -20,4 +20,4 @@ ndarray.contrib
 
 .. automodule:: mxnet.ndarray.contrib
     :members:
-    :autosummary:
\ No newline at end of file
+    :autosummary:
diff --git a/docs/python_docs/python/api/legacy/ndarray/image/index.rst b/docs/python_docs/python/api/legacy/ndarray/image/index.rst
index aca1800..d154e34 100644
--- a/docs/python_docs/python/api/legacy/ndarray/image/index.rst
+++ b/docs/python_docs/python/api/legacy/ndarray/image/index.rst
@@ -20,4 +20,4 @@ ndarray.image
 
 .. automodule:: mxnet.ndarray.image
     :members:
-    :autosummary:
\ No newline at end of file
+    :autosummary:
diff --git a/docs/python_docs/python/api/legacy/ndarray/linalg/index.rst b/docs/python_docs/python/api/legacy/ndarray/linalg/index.rst
index ab360e4..8284fba 100644
--- a/docs/python_docs/python/api/legacy/ndarray/linalg/index.rst
+++ b/docs/python_docs/python/api/legacy/ndarray/linalg/index.rst
@@ -20,4 +20,4 @@ ndarray.linalg
 
 .. automodule:: mxnet.ndarray.linalg
     :members:
-    :autosummary:
\ No newline at end of file
+    :autosummary:
diff --git a/docs/python_docs/python/api/legacy/ndarray/ndarray.rst b/docs/python_docs/python/api/legacy/ndarray/ndarray.rst
index 8047b74..a20d7ae 100644
--- a/docs/python_docs/python/api/legacy/ndarray/ndarray.rst
+++ b/docs/python_docs/python/api/legacy/ndarray/ndarray.rst
@@ -23,4 +23,4 @@ ndarray
 .. automodule:: mxnet.ndarray
     :members:
     :imported-members:
-    :autosummary:
\ No newline at end of file
+    :autosummary:
diff --git a/docs/python_docs/python/api/legacy/ndarray/op/index.rst b/docs/python_docs/python/api/legacy/ndarray/op/index.rst
index 9894a6e..884ca51 100644
--- a/docs/python_docs/python/api/legacy/ndarray/op/index.rst
+++ b/docs/python_docs/python/api/legacy/ndarray/op/index.rst
@@ -20,4 +20,4 @@ ndarray.op
 
 .. automodule:: mxnet.ndarray.op
     :members:
-    :autosummary:
\ No newline at end of file
+    :autosummary:
diff --git a/docs/python_docs/python/api/legacy/ndarray/random/index.rst b/docs/python_docs/python/api/legacy/ndarray/random/index.rst
index ce928c5..bbf9150 100644
--- a/docs/python_docs/python/api/legacy/ndarray/random/index.rst
+++ b/docs/python_docs/python/api/legacy/ndarray/random/index.rst
@@ -20,4 +20,4 @@ ndarray.random
 
 .. automodule:: mxnet.ndarray.random
     :members:
-    :autosummary:
\ No newline at end of file
+    :autosummary:
diff --git a/docs/python_docs/python/api/legacy/ndarray/register/index.rst b/docs/python_docs/python/api/legacy/ndarray/register/index.rst
index a700af7..4e40769 100644
--- a/docs/python_docs/python/api/legacy/ndarray/register/index.rst
+++ b/docs/python_docs/python/api/legacy/ndarray/register/index.rst
@@ -20,4 +20,4 @@ ndarray.register
 
 .. automodule:: mxnet.ndarray.register
     :members:
-    :autosummary:
\ No newline at end of file
+    :autosummary:
diff --git a/docs/python_docs/python/api/legacy/ndarray/sparse/index.rst b/docs/python_docs/python/api/legacy/ndarray/sparse/index.rst
index 2d1156e..44a3b35 100644
--- a/docs/python_docs/python/api/legacy/ndarray/sparse/index.rst
+++ b/docs/python_docs/python/api/legacy/ndarray/sparse/index.rst
@@ -20,4 +20,4 @@ ndarray.sparse
 
 .. automodule:: mxnet.ndarray.sparse
     :members:
-    :autosummary:
\ No newline at end of file
+    :autosummary:
diff --git a/docs/python_docs/python/api/legacy/ndarray/utils/index.rst b/docs/python_docs/python/api/legacy/ndarray/utils/index.rst
index b0b7e04..5dfef70 100644
--- a/docs/python_docs/python/api/legacy/ndarray/utils/index.rst
+++ b/docs/python_docs/python/api/legacy/ndarray/utils/index.rst
@@ -20,4 +20,4 @@ ndarray.utils
 
 .. automodule:: mxnet.ndarray.utils
     :members:
-    :autosummary:
\ No newline at end of file
+    :autosummary:
diff --git a/docs/python_docs/python/api/module/index.rst b/docs/python_docs/python/api/module/index.rst
deleted file mode 100644
index 12040db..0000000
--- a/docs/python_docs/python/api/module/index.rst
+++ /dev/null
@@ -1,24 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-   or more contributor license agreements.  See the NOTICE file
-   distributed with this work for additional information
-   regarding copyright ownership.  The ASF licenses this file
-   to you under the Apache License, Version 2.0 (the
-   "License"); you may not use this file except in compliance
-   with the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing,
-   software distributed under the License is distributed on an
-   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-   KIND, either express or implied.  See the License for the
-   specific language governing permissions and limitations
-   under the License.
-
-mxnet.module
-============
-
-.. automodule:: mxnet.module
-    :members:
-    :imported-members:
-    :autosummary:
diff --git a/docs/python_docs/python/api/np/arrays.indexing.rst b/docs/python_docs/python/api/np/arrays.indexing.rst
index e073d21..7aabdc6 100644
--- a/docs/python_docs/python/api/np/arrays.indexing.rst
+++ b/docs/python_docs/python/api/np/arrays.indexing.rst
@@ -1,3 +1,20 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
 .. _arrays.indexing:
 
 Indexing
@@ -31,15 +48,15 @@ integer, or a tuple of slice objects and integers. :const:`Ellipsis`
 and :const:`newaxis` objects can be interspersed with these as
 well.
 
-The simplest case of indexing with *N* integers returns an :ref:`array
-scalar <arrays.scalars>` representing the corresponding item.  As in
+The simplest case of indexing with *N* integers returns an array
+scalar representing the corresponding item.  As in
 Python, all indices are zero-based: for the *i*-th index :math:`n_i`,
 the valid range is :math:`0 \le n_i < d_i` where :math:`d_i` is the
 *i*-th element of the shape of the array.  Negative indices are
 interpreted as counting from the end of the array (*i.e.*, if
 :math:`n_i < 0`, it means :math:`n_i + d_i`).
 
-All arrays generated by basic slicing are always :term:`views <view>`
+All arrays generated by basic slicing are always views
 of the original array if the fetched elements are contiguous in memory.
 
 The standard rules of sequence slicing apply to basic slicing on a
@@ -172,7 +189,7 @@ integer or bool). There are two types of advanced indexing: integer
 and Boolean.
 
 Advanced indexing always returns a *copy* of the data (contrast with
-some cases in basic slicing that returns a :term:`view`).
+some cases in basic slicing that returns a view).
 
 .. warning::
 
@@ -298,15 +315,15 @@ faster when ``obj.shape == x.shape``.
 
 If ``obj.ndim == x.ndim``, ``x[obj]`` returns a 1-dimensional array
 filled with the elements of *x* corresponding to the :const:`True`
-values of *obj*.  The search order will be :term:`row-major`,
+values of *obj*.  The search order will be row-major,
 C-style. If *obj* has :const:`True` values at entries that are outside
 of the bounds of *x*, then an index error will be raised. If *obj* is
 smaller than *x* it is identical to filling it with :const:`False`.
 
 .. note::
 
-Boolean indexing currently only supports a single boolean ndarray as a index.
-An composite index including a boolean array is not supported for now.
+    Boolean indexing currently only supports a single boolean ndarray as an index.
+    A composite index including a boolean array is not supported for now.
 
 If there is only one Boolean array and no integer indexing array present,
 this is straight forward. Care must only be taken to make sure that the
diff --git a/docs/python_docs/python/api/np/arrays.ndarray.rst b/docs/python_docs/python/api/np/arrays.ndarray.rst
index a0e9a87..e77d20b 100644
--- a/docs/python_docs/python/api/np/arrays.ndarray.rst
+++ b/docs/python_docs/python/api/np/arrays.ndarray.rst
@@ -1,3 +1,20 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
 .. _arrays.ndarray:
 
 ******************************************
@@ -11,7 +28,7 @@ container of items of the same type and size. The number of dimensions
 and items in an array is defined by its :attr:`shape <ndarray.shape>`,
 which is a :class:`tuple` of *N* non-negative integers that specify the
 sizes of each dimension. The type of items in the array is specified by
-a separate :ref:`data-type object (dtype) <arrays.dtypes>`, one of which
+a separate data-type object (dtype), one of which
 is associated with each ndarray.
 
 As with other container objects in Python, the contents of an
@@ -65,19 +82,15 @@ New arrays can be constructed using the routines detailed in
 :class:`ndarray` constructor:
 
 .. autosummary::
-   :toctree: generated/
 
    ndarray
 
-::
-
 
 Indexing arrays
 ===============
 
 Arrays can be indexed using an extended Python slicing syntax,
-``array[selection]``.  Similar syntax is also used for accessing
-fields in a :term:`structured data type`.
+``array[selection]``.
 
 .. seealso:: :ref:`Array Indexing <arrays.indexing>`.
 
@@ -92,8 +105,8 @@ some other object), combined with an indexing scheme that maps *N*
 integers into the location of an item in the block.  The ranges in
 which the indices can vary is specified by the :obj:`shape
 <ndarray.shape>` of the array. How many bytes each item takes and how
-the bytes are interpreted is defined by the :ref:`data-type object
-<arrays.dtypes>` associated with the array.
+the bytes are interpreted is defined by the data-type object
+associated with the array.
 
 .. index:: C-order, Fortran-order, row-major, column-major, stride,
   offset
@@ -116,9 +129,9 @@ corresponds to the offset (in bytes):
 
 from the beginning of the memory block associated with the
 array. Here, :math:`s_k` are integers which specify the :obj:`strides
-<ndarray.strides>` of the array. The :term:`column-major` order (used,
+<ndarray.strides>` of the array. The column-major order (used,
 for example, in the Fortran language and in *Matlab*) and
-:term:`row-major` order (used in C) schemes are just specific kinds of
+row-major order (used in C) schemes are just specific kinds of
 strided scheme, and correspond to memory that can be *addressed* by the strides:
 
 .. math::
@@ -130,7 +143,7 @@ strided scheme, and correspond to memory that can be *addressed* by the strides:
 
 where :math:`d_j` `= self.shape[j]`.
 
-Both the C and Fortran orders are :term:`contiguous`, *i.e.,*
+Both the C and Fortran orders are contiguous, *i.e.,*
 single-segment, memory layouts, in which every part of the
 memory block can be accessed by some combination of the indices.
 
@@ -175,9 +188,9 @@ base offset itself is a multiple of `self.itemsize`. Understanding
     for C-style contiguous arrays or ``self.strides[0] == self.itemsize`` for
     Fortran-style contiguous arrays is true.
 
-Data in new :class:`ndarrays <ndarray>` is in the :term:`row-major`
+Data in new :class:`ndarrays <ndarray>` is in the row-major
 (C) order, unless otherwise specified, but, for example, :ref:`basic
-array slicing <arrays.indexing>` often produces :term:`views <view>`
+array slicing <arrays.indexing>` often produces views
 in a different scheme.
 
 .. seealso: :ref:`Indexing <arrays.ndarray.indexing>`_
@@ -208,49 +221,21 @@ The following attributes contain information about the memory layout
 of the array:
 
 .. autosummary::
-   :toctree: generated/
 
    ndarray.shape
    ndarray.ndim
    ndarray.size
 
-::
-
-   ndarray.flags
-   ndarray.strides
-   ndarray.data
-   ndarray.itemsize
-   ndarray.nbytes
-   ndarray.base
-
 Data type
 ---------
 
-.. seealso:: :ref:`Data type objects <arrays.dtypes>`
-
 The data type object associated with the array can be found in the
 :attr:`dtype <ndarray.dtype>` attribute:
 
 .. autosummary::
-   :toctree: generated/
 
    ndarray.dtype
 
-Other attributes
-----------------
-
-.. autosummary::
-   :toctree: generated/
-
-   ndarray.T
-
-::
-
-   ndarray.real
-   ndarray.imag
-   ndarray.flat
-   ndarray.ctypes
-
 .. _array.ndarray.methods:
 
 Array methods
@@ -277,26 +262,12 @@ Array conversion
 ----------------
 
 .. autosummary::
-   :toctree: generated/
 
    ndarray.item
    ndarray.copy
    ndarray.tolist
    ndarray.astype
 
-::
-
-   ndarray.itemset
-   ndarray.tostring
-   ndarray.tobytes
-   ndarray.tofile
-   ndarray.dump
-   ndarray.dumps
-   ndarray.byteswap
-   ndarray.view
-   ndarray.getfield
-   ndarray.setflags
-   ndarray.fill
 
 Shape manipulation
 ------------------
@@ -305,7 +276,6 @@ For reshape, resize, and transpose, the single tuple argument may be
 replaced with ``n`` integers which will be interpreted as an n-tuple.
 
 .. autosummary::
-   :toctree: generated/
 
    ndarray.reshape
    ndarray.transpose
@@ -313,11 +283,6 @@ replaced with ``n`` integers which will be interpreted as an n-tuple.
    ndarray.flatten
    ndarray.squeeze
 
-::
-
-   ndarray.resize
-   ndarray.ravel
-
 Item selection and manipulation
 -------------------------------
 
@@ -327,24 +292,12 @@ array. Any other value for *axis* represents the dimension along which
 the operation should proceed.
 
 .. autosummary::
-   :toctree: generated/
 
    ndarray.nonzero
    ndarray.take
    ndarray.repeat
-
-
-::
-
    ndarray.argsort
    ndarray.sort
-   ndarray.put
-   ndarray.choose
-   ndarray.partition
-   ndarray.argpartition
-   ndarray.searchsorted
-   ndarray.compress
-   ndarray.diagonal
 
 Calculation
 -----------
@@ -407,7 +360,6 @@ elements. It can have a different data type in which case casting will
 be performed.
 
 .. autosummary::
-   :toctree: generated/
 
    ndarray.max
    ndarray.argmax
@@ -420,14 +372,7 @@ be performed.
    ndarray.cumsum
    ndarray.var
    ndarray.std
-
-::
-
    ndarray.round
-   ndarray.ptp
-   ndarray.conj
-   ndarray.trace
-   ndarray.cumprod
    ndarray.all
    ndarray.any
 
@@ -444,14 +389,11 @@ Each of the arithmetic operations (``+``, ``-``, ``*``, ``/``, ``//``,
 ``%``, ``divmod()``, ``**`` or ``pow()``, ``<<``, ``>>``, ``&``,
 ``^``, ``|``, ``~``) and the comparisons (``==``, ``<``, ``>``,
 ``<=``, ``>=``, ``!=``) is equivalent to the corresponding
-universal function (or :term:`ufunc` for short) in NumPy.  For
-more information, see the section on :ref:`Universal Functions
-<ufuncs>`.
+universal function (or ufunc for short) in NumPy.
 
 Comparison operators:
 
 .. autosummary::
-   :toctree: generated/
 
    ndarray.__lt__
    ndarray.__le__
@@ -463,7 +405,6 @@ Comparison operators:
 Truth value of an array (:func:`bool()`):
 
 .. autosummary::
-   :toctree: generated/
 
    ndarray.__bool__
 
@@ -478,20 +419,14 @@ Truth value of an array (:func:`bool()`):
 Unary operations:
 
 .. autosummary::
-   :toctree: generated/
 
    ndarray.__neg__
-
-::
-
-   ndarray.__pos__
    ndarray.__abs__
    ndarray.__invert__
 
 Arithmetic:
 
 .. autosummary::
-   :toctree: generated/
 
    ndarray.__add__
    ndarray.__sub__
@@ -499,13 +434,6 @@ Arithmetic:
    ndarray.__truediv__
    ndarray.__mod__
    ndarray.__pow__
-
-::
-
-   ndarray.__floordiv__
-   ndarray.__divmod__
-   ndarray.__lshift__
-   ndarray.__rshift__
    ndarray.__and__
    ndarray.__or__
    ndarray.__xor__
@@ -528,42 +456,34 @@ Arithmetic:
 Arithmetic, in-place:
 
 .. autosummary::
-   :toctree: generated/
 
    ndarray.__iadd__
    ndarray.__isub__
    ndarray.__imul__
    ndarray.__itruediv__
    ndarray.__imod__
-
-::
-
-   ndarray.__ifloordiv__
-   ndarray.__ipow__
-   ndarray.__ilshift__
-   ndarray.__irshift__
    ndarray.__iand__
    ndarray.__ior__
    ndarray.__ixor__
 
+
 .. warning::
 
    In place operations will perform the calculation using the
    precision decided by the data type of the two operands, but will
    silently downcast the result (if necessary) so it can fit back into
-   the array.  Therefore, for mixed precision calculations, ``A {op}=
-   B`` can be different than ``A = A {op} B``. For example, suppose
-   ``a = ones((3,3))``. Then, ``a += 3j`` is different than ``a = a +
-   3j``: while they both perform the same computation, ``a += 3``
+   the array.  Therefore, for mixed precision calculations,
+   ``A {op}= B`` can be different than ``A = A {op} B``. For example, suppose
+   ``a = ones((3,3))``. Then, ``a += 3j`` is different than ``a = a + 3j``:
+   while they both perform the same computation, ``a += 3j``
    casts the result to fit back in ``a``, whereas ``a = a + 3j``
    re-binds the name ``a`` to the result.
 
+
 Matrix Multiplication:
 
-.. autosummary::
-   :toctree: generated/
 
-::
+.. autosummary::
 
    ndarray.__matmul__
 
@@ -574,58 +494,36 @@ Special methods
 For standard library functions:
 
 .. autosummary::
-   :toctree: generated/
 
    ndarray.__reduce__
    ndarray.__setstate__
 
-::
-
-   ndarray.__copy__
-   ndarray.__deepcopy__
-
 Basic customization:
 
 .. autosummary::
-   :toctree: generated/
-
-::
 
-   ndarray.__array__
    ndarray.__new__
-   ndarray.__array_wrap__
 
 Container customization: (see :ref:`Indexing <arrays.indexing>`)
 
 .. autosummary::
-   :toctree: generated/
 
    ndarray.__len__
    ndarray.__getitem__
    ndarray.__setitem__
 
-::
-
-   ndarray.__contains__
-
 Conversion; the operations :func:`int()` and :func:`float()`.
 They work only on arrays that have one element in them
 and return the appropriate scalar.
 
 .. autosummary::
-   :toctree: generated/
 
    ndarray.__int__
    ndarray.__float__
 
-::
-
-   ndarray.__complex__
-
 String representations:
 
 .. autosummary::
-   :toctree: generated/
 
    ndarray.__str__
    ndarray.__repr__
diff --git a/docs/python_docs/python/api/np/arrays.rst b/docs/python_docs/python/api/np/arrays.rst
index 3f2a526..7eca1bd 100644
--- a/docs/python_docs/python/api/np/arrays.rst
+++ b/docs/python_docs/python/api/np/arrays.rst
@@ -1,3 +1,20 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
 .. _arrays:
 
 *************
@@ -11,16 +28,16 @@ Array objects
 type. The items can be :ref:`indexed <arrays.indexing>` using for
 example N integers.
 
-All ndarrays are :term:`homogenous`: every item takes up the same size
+All ndarrays are homogeneous: every item takes up the same size
 block of memory, and all blocks are interpreted in exactly the same
 way. How each item in the array is to be interpreted is specified by a
-separate :ref:`data-type object <arrays.dtypes>`, one of which is associated
+separate data-type object, one of which is associated
 with every array. In addition to basic types (integers, floats,
 *etc.*), the data type objects can also represent data structures.
 
 An item extracted from an array, *e.g.*, by indexing, is represented
-by a Python object whose type is one of the :ref:`array scalar types
-<arrays.scalars>` built in NumPy. The array scalars allow easy manipulation
+by a Python object whose type is one of the array scalar types
+built in NumPy. The array scalars allow easy manipulation
 of also more complicated arrangements of data.
 
 .. note::
@@ -32,11 +49,4 @@ of also more complicated arrangements of data.
    :maxdepth: 2
 
    arrays.ndarray
-   arrays.scalars
-   arrays.dtypes
    arrays.indexing
-   arrays.nditer
-   arrays.classes
-   maskedarray
-   arrays.interface
-   arrays.datetime
diff --git a/docs/python_docs/python/api/np/index.rst b/docs/python_docs/python/api/np/index.rst
index 8b275a9..282af54 100644
--- a/docs/python_docs/python/api/np/index.rst
+++ b/docs/python_docs/python/api/np/index.rst
@@ -1,3 +1,20 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
 .. _reference:
 
 mxnet.np
@@ -14,14 +31,7 @@ included in `mxnet.np`. Use the links here to learn more.
    :maxdepth: 2
 
    arrays
-   constants
-   ufuncs
    routines
-   distutils
-   distutils_guide
-   c-api
-   internals
-   swig
 
 
 **Acknowledgements**
diff --git a/docs/python_docs/python/api/np/random/index.rst b/docs/python_docs/python/api/np/random/index.rst
index 0142b19..f4e2a20 100644
--- a/docs/python_docs/python/api/np/random/index.rst
+++ b/docs/python_docs/python/api/np/random/index.rst
@@ -1,22 +1,27 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
 .. _numpyrandom:
 
 .. currentmodule:: mxnet.np.random
 
 np.random
-============
-
-..
-  remove a large part about generator here, this page contains a part of generator.rst
-
-
-Accessing the BitGenerator
---------------------------
-.. autosummary::
-   :toctree: generated/
-
-::
+=========
 
-   bit_generator
 
 Simple random data
 ------------------
@@ -25,12 +30,6 @@ Simple random data
 
    choice
 
-::
-
-   random
-   integers
-   bytes
-
 Permutations
 ------------
 .. autosummary::
@@ -38,53 +37,27 @@ Permutations
 
    shuffle
 
-::
-
-   permutation
-
 Distributions
 -------------
 .. autosummary::
    :toctree: generated/
 
-
    normal
    uniform
    rand
    randint
-
-::
-
    beta
-   binomial
    chisquare
-   dirichlet
    exponential
    f
    gamma
-   geometric
    gumbel
-   hypergeometric
    laplace
    logistic
    lognormal
-   logseries
    multinomial
    multivariate_normal
-   negative_binomial
-   noncentral_chisquare
-   noncentral_f
    pareto
-   poisson
    power
    rayleigh
-   standard_cauchy
-   standard_exponential
-   standard_gamma
-   standard_normal
-   standard_t
-   triangular
-   vonmises
-   wald
    weibull
-   zipf
diff --git a/docs/python_docs/python/api/np/routines.array-creation.rst b/docs/python_docs/python/api/np/routines.array-creation.rst
index 2033923..4f897e9 100644
--- a/docs/python_docs/python/api/np/routines.array-creation.rst
+++ b/docs/python_docs/python/api/np/routines.array-creation.rst
@@ -1,10 +1,25 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
 .. _routines.array-creation:
 
 Array creation routines
 =======================
 
-.. seealso:: :ref:`Array creation <arrays.creation>`
-
 .. currentmodule:: mxnet.np
 
 Ones and zeros
@@ -36,12 +51,7 @@ From existing data
 
 .. code::
 
-   asarray
-   asanyarray
-   ascontiguousarray
-   asmatrix
    frombuffer
-   fromfile
    fromfunction
    fromiter
    fromstring
@@ -112,13 +122,3 @@ Building matrices
    tri
    triu
    vander
-
-The Matrix class
-----------------
-.. autosummary::
-   :toctree: generated/
-
-::
-
-   mat
-   bmat
diff --git a/docs/python_docs/python/api/np/routines.array-manipulation.rst b/docs/python_docs/python/api/np/routines.array-manipulation.rst
index b43d8f1..116e2fe 100644
--- a/docs/python_docs/python/api/np/routines.array-manipulation.rst
+++ b/docs/python_docs/python/api/np/routines.array-manipulation.rst
@@ -1,17 +1,25 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
 Array manipulation routines
 ***************************
 
 .. currentmodule:: mxnet.np
 
-Basic operations
-================
-.. autosummary::
-   :toctree: generated/
-
-::
-
-    copyto
-
 Changing array shape
 ====================
 .. autosummary::
@@ -22,10 +30,6 @@ Changing array shape
    ravel
    ndarray.flatten
 
-::
-
-   ndarray.flat
-
 Transpose-like operations
 =========================
 .. autosummary::
@@ -35,9 +39,6 @@ Transpose-like operations
    ndarray.T
    transpose
    moveaxis
-
-::
-
    rollaxis
 
 Changing number of dimensions
@@ -49,30 +50,9 @@ Changing number of dimensions
    squeeze
    broadcast_to
    broadcast_arrays
-
-::
-
    atleast_1d
    atleast_2d
    atleast_3d
-   broadcast
-
-Changing kind of array
-======================
-.. autosummary::
-   :toctree: generated/
-
-::
-
-   asarray
-   asanyarray
-   asmatrix
-   asfarray
-   asfortranarray
-   ascontiguousarray
-   asarray_chkfinite
-   asscalar
-   require
 
 Joining arrays
 ==============
@@ -83,12 +63,8 @@ Joining arrays
    stack
    dstack
    vstack
-
-::
-
    column_stack
    hstack
-   block
 
 Splitting arrays
 ================
@@ -98,9 +74,6 @@ Splitting arrays
    split
    hsplit
    vsplit
-
-::
-
    array_split
    dsplit
 
@@ -118,9 +91,6 @@ Adding and removing elements
    :toctree: generated/
 
    unique
-
-::
-
    delete
    insert
    append
@@ -136,8 +106,5 @@ Rearranging elements
    flip
    roll
    rot90
-
-::
-
    fliplr
    flipud
diff --git a/docs/python_docs/python/api/np/routines.io.rst b/docs/python_docs/python/api/np/routines.io.rst
index 07e487c..0857532 100644
--- a/docs/python_docs/python/api/np/routines.io.rst
+++ b/docs/python_docs/python/api/np/routines.io.rst
@@ -1,19 +1,25 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
 Input and output
 ****************
 
 .. currentmodule:: mxnet.np
 
-NumPy binary files (NPY, NPZ)
------------------------------
-.. autosummary::
-   :toctree: generated/
-
-::
-   load
-   save
-   savez
-   savez_compressed
-
 The format of these binary file types is documented in
 :py:mod:`numpy.lib.format`
 
@@ -23,92 +29,11 @@ Text files
    :toctree: generated/
 
    genfromtxt
-
-::
-
-   loadtxt
-   savetxt
-   fromregex
-   fromstring
-   ndarray.tofile
    ndarray.tolist
 
-Raw binary files
-----------------
-
-.. autosummary::
-
-
-::
-
-   fromfile
-   ndarray.tofile
-
-String formatting
------------------
-.. autosummary::
-   :toctree: generated/
-
-
-::
-
-   array2string
-   array_repr
-   array_str
-   format_float_positional
-   format_float_scientific
-
-Memory mapping files
---------------------
-.. autosummary::
-   :toctree: generated/
-
-
-::
-
-   memmap
-
 Text formatting options
 -----------------------
 .. autosummary::
    :toctree: generated/
 
-
-::
-
    set_printoptions
-   get_printoptions
-   set_string_function
-   printoptions
-
-Base-n representations
-----------------------
-.. autosummary::
-   :toctree: generated/
-
-
-::
-
-   binary_repr
-   base_repr
-
-Data sources
-------------
-.. autosummary::
-   :toctree: generated/
-
-
-::
-
-   DataSource
-
-Binary Format Description
--------------------------
-.. autosummary::
-   :template: autosummary/minimal_module.rst
-   :toctree: generated/
-
-
-::
-
-    lib.format
diff --git a/docs/python_docs/python/api/np/routines.linalg.rst b/docs/python_docs/python/api/np/routines.linalg.rst
index 073dd0b..5f80185 100644
--- a/docs/python_docs/python/api/np/routines.linalg.rst
+++ b/docs/python_docs/python/api/np/routines.linalg.rst
@@ -1,3 +1,20 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
 .. _routines.linalg:
 
 .. module:: mxnet.np.linalg
@@ -31,12 +48,8 @@ Matrix and vector products
    outer
    tensordot
    einsum
-
-::
-
    linalg.multi_dot
    matmul
-   einsum_path
    linalg.matrix_power
    kron
 
@@ -46,9 +59,6 @@ Decompositions
    :toctree: generated/
 
    linalg.svd
-
-::
-
    linalg.cholesky
    linalg.qr
 
@@ -57,9 +67,6 @@ Matrix eigenvalues
 .. autosummary::
    :toctree: generated/
 
-
-::
-
    linalg.eig
    linalg.eigh
    linalg.eigvals
@@ -72,9 +79,6 @@ Norms and other numbers
 
    linalg.norm
    trace
-
-::
-
    linalg.cond
    linalg.det
    linalg.matrix_rank
@@ -85,22 +89,9 @@ Solving equations and inverting matrices
 .. autosummary::
    :toctree: generated/
 
-
-::
-
    linalg.solve
    linalg.tensorsolve
    linalg.lstsq
    linalg.inv
    linalg.pinv
    linalg.tensorinv
-
-Exceptions
-----------
-.. autosummary::
-   :toctree: generated/
-
-
-::
-
-   linalg.LinAlgError
diff --git a/docs/python_docs/python/api/np/routines.math.rst b/docs/python_docs/python/api/np/routines.math.rst
index 6dd85cd..bb1301b 100644
--- a/docs/python_docs/python/api/np/routines.math.rst
+++ b/docs/python_docs/python/api/np/routines.math.rst
@@ -1,3 +1,20 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
 Mathematical functions
 **********************
 
@@ -28,11 +45,9 @@ Trigonometric functions
    arctan2
    deg2rad
    rad2deg
-
-::
-
    unwrap
 
+
 Hyperbolic functions
 --------------------
 .. autosummary::
@@ -45,6 +60,7 @@ Hyperbolic functions
    arccosh
    arctanh
 
+
 Rounding
 --------
 .. autosummary::
@@ -56,9 +72,6 @@ Rounding
    ceil
    trunc
    around
-
-::
-
    round_
 
 
@@ -70,9 +83,6 @@ Sums, products, differences
    sum
    prod
    cumsum
-
-::
-
    nanprod
    nansum
    cumprod
@@ -80,10 +90,10 @@ Sums, products, differences
    nancumsum
    diff
    ediff1d
-   gradient
    cross
    trapz
 
+
 Exponents and logarithms
 ------------------------
 .. autosummary::
@@ -96,22 +106,14 @@ Exponents and logarithms
    log2
    log1p
 
-::
-
-   exp2
-   logaddexp
-   logaddexp2
 
 Other special functions
 -----------------------
 .. autosummary::
    :toctree: generated/
 
-
-::
-
    i0
-   sinc
+
 
 Floating point routines
 -----------------------
@@ -119,15 +121,12 @@ Floating point routines
    :toctree: generated/
 
    ldexp
-
-::
-
    signbit
    copysign
    frexp
-   nextafter
    spacing
 
+
 Rational routines
 -----------------
 .. autosummary::
@@ -135,9 +134,6 @@ Rational routines
 
    lcm
 
-::
-
-   gcd
 
 Arithmetic operations
 ---------------------
@@ -154,31 +150,12 @@ Arithmetic operations
    multiply
    true_divide
    remainder
-
-::
-
    positive
-   floor_divide
    float_power
-
    fmod
    modf
    divmod
 
-Handling complex numbers
-------------------------
-.. autosummary::
-   :toctree: generated/
-
-
-::
-
-   angle
-   real
-   imag
-   conj
-   conjugate
-
 
 Miscellaneous
 -------------
@@ -186,28 +163,17 @@ Miscellaneous
    :toctree: generated/
 
    clip
-
    sqrt
    cbrt
    square
-
    absolute
    sign
    maximum
    minimum
-
-::
-
-   convolve
-
    fabs
-
    heaviside
-
    fmax
    fmin
-
    nan_to_num
-   real_if_close
-
    interp
+
diff --git a/docs/python_docs/python/api/np/routines.rst b/docs/python_docs/python/api/np/routines.rst
index e35d621..cc94613 100644
--- a/docs/python_docs/python/api/np/routines.rst
+++ b/docs/python_docs/python/api/np/routines.rst
@@ -1,5 +1,22 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
 Routines
-============
+========
 
 In this chapter routine docstrings are presented, grouped by functionality.
 Many docstrings contain example code, which demonstrates basic usage
@@ -17,31 +34,9 @@ indentation.
 
    routines.array-creation
    routines.array-manipulation
-   routines.bitwise
-   routines.char
-   routines.ctypeslib
-   routines.datetime
-   routines.dtype
-   routines.dual
-   routines.emath
-   routines.err
-   routines.fft
-   routines.financial
-   routines.functional
-   routines.help
-   routines.indexing
    routines.io
    routines.linalg
-   routines.logic
-   routines.ma
    routines.math
-   routines.matlib
-   routines.other
-   routines.padding
-   routines.polynomials
    random/index
-   routines.set
    routines.sort
    routines.statistics
-   routines.testing
-   routines.window
diff --git a/docs/python_docs/python/api/np/routines.sort.rst b/docs/python_docs/python/api/np/routines.sort.rst
index 0ae1e92..31e9d0a 100644
--- a/docs/python_docs/python/api/np/routines.sort.rst
+++ b/docs/python_docs/python/api/np/routines.sort.rst
@@ -1,3 +1,20 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
 Sorting, searching, and counting
 ================================
 
@@ -8,14 +25,11 @@ Sorting
 .. autosummary::
    :toctree: generated/
 
-::
-
    ndarray.sort
    sort
    lexsort
    argsort
    msort
-   sort_complex
    partition
    argpartition
 
@@ -26,9 +40,6 @@ Searching
 
    argmax
    argmin
-
-::
-
    nanargmax
    nanargmin
    argwhere
@@ -43,7 +54,4 @@ Counting
 .. autosummary::
    :toctree: generated/
 
-
-::
-
    count_nonzero
diff --git a/docs/python_docs/python/api/np/routines.statistics.rst b/docs/python_docs/python/api/np/routines.statistics.rst
index e9caf40..30af7d6 100644
--- a/docs/python_docs/python/api/np/routines.statistics.rst
+++ b/docs/python_docs/python/api/np/routines.statistics.rst
@@ -1,3 +1,20 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
 Statistics
 ==========
 
@@ -12,9 +29,6 @@ Order statistics
 
    min
    max
-
-::
-
    amin
    amax
    nanmin
@@ -34,13 +48,9 @@ Averages and variances
    mean
    std
    var
-
-::
-
    median
    average
    nanmedian
-   nanmean
    nanstd
    nanvar
 
@@ -50,9 +60,6 @@ Correlating
 .. autosummary::
    :toctree: generated/
 
-
-::
-
    corrcoef
    correlate
    cov
@@ -64,9 +71,6 @@ Histograms
    :toctree: generated/
 
    histogram
-
-::
-
    histogram2d
    histogramdd
    bincount
diff --git a/docs/python_docs/python/api/npx/index.rst b/docs/python_docs/python/api/npx/index.rst
index a073eee..4cc2684 100644
--- a/docs/python_docs/python/api/npx/index.rst
+++ b/docs/python_docs/python/api/npx/index.rst
@@ -1,3 +1,20 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
 NPX: NumPy Neural Network Extension
 ===================================
 
@@ -67,7 +84,6 @@ More operators
    sigmoid
    smooth_l1
    softmax
-   threading
    topk
    waitall
    load
diff --git a/docs/python_docs/python/api/runtime/index.rst b/docs/python_docs/python/api/runtime/index.rst
index 4231e86..5b293d4 100644
--- a/docs/python_docs/python/api/runtime/index.rst
+++ b/docs/python_docs/python/api/runtime/index.rst
@@ -1,23 +1,29 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-   or more contributor license agreements.  See the NOTICE file
-   distributed with this work for additional information
-   regarding copyright ownership.  The ASF licenses this file
-   to you under the Apache License, Version 2.0 (the
-   "License"); you may not use this file except in compliance
-   with the License.  You may obtain a copy of the License at
+..
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
 
-     http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing,
-   software distributed under the License is distributed on an
-   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-   KIND, either express or implied.  See the License for the
-   specific language governing permissions and limitations
-   under the License.
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
 
 mxnet.runtime
-===============
+=============
 
-.. automodule:: mxnet.runtime
-    :members:
-    :autosummary:
+.. currentmodule:: mxnet.runtime
+
+.. autosummary::
+   :toctree: generated/
+
+   Feature
+   Features
+   feature_list
diff --git a/docs/python_docs/python/scripts/conf.py b/docs/python_docs/python/scripts/conf.py
index d54ecf2..5096490 100644
--- a/docs/python_docs/python/scripts/conf.py
+++ b/docs/python_docs/python/scripts/conf.py
@@ -66,6 +66,9 @@ import mxnet as mx
 from mxnet import np, npx
 '''
 
+autosummary_generate = True
+numpydoc_show_class_members = False
+
 autodoc_member_order = 'alphabetical'
 
 autodoc_default_flags = ['members', 'show-inheritance']
@@ -118,7 +121,7 @@ exclude_patterns = ['templates',
 
 # If true, the current module name will be prepended to all description
 # unit titles (such as .. function::).
-#add_module_names = True
+add_module_names = False
 
 # If true, sectionauthor and moduleauthor directives will be shown in the
 # output. They are ignored by default.
diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/index.rst b/docs/python_docs/python/tutorials/getting-started/crash-course/index.rst
index b9a86e0..a69dda2 100644
--- a/docs/python_docs/python/tutorials/getting-started/crash-course/index.rst
+++ b/docs/python_docs/python/tutorials/getting-started/crash-course/index.rst
@@ -1,5 +1,22 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
 Getting started with NP on MXNet
-============
+================================
 
 This crash course shows how to get started with NP on MXNet. The topics here provide a quick overview of the core concepts for both NP on MXNet, which helps you manipulate multiple dimensional arrays, and Gluon, which helps you create and train neural
 networks. This is a good place to start if you are already familiar with machine learning or other deep learning frameworks.
diff --git a/docs/python_docs/python/tutorials/getting-started/to-mxnet/index.rst b/docs/python_docs/python/tutorials/getting-started/to-mxnet/index.rst
index 523665d..4c66450 100644
--- a/docs/python_docs/python/tutorials/getting-started/to-mxnet/index.rst
+++ b/docs/python_docs/python/tutorials/getting-started/to-mxnet/index.rst
@@ -34,5 +34,3 @@ Comparison Guides
    :maxdepth: 1
 
    pytorch
-
-.. disqus::
diff --git a/docs/python_docs/python/tutorials/index.rst b/docs/python_docs/python/tutorials/index.rst
index 754344c..2e0de42 100644
--- a/docs/python_docs/python/tutorials/index.rst
+++ b/docs/python_docs/python/tutorials/index.rst
@@ -16,7 +16,7 @@
    under the License.
 
 Python Tutorials
-=====
+================
 
 Getting started
 ---------------
@@ -138,7 +138,6 @@ Deployment
 
 Customization
 -------------
-.. container:: cards
 
 Coming Soon (CustomOps and Custom Operators)
 
diff --git a/docs/python_docs/python/tutorials/packages/gluon/text/transformer.rst b/docs/python_docs/python/tutorials/packages/gluon/text/transformer.rst
index ab1db2f..cb243e3 100644
--- a/docs/python_docs/python/tutorials/packages/gluon/text/transformer.rst
+++ b/docs/python_docs/python/tutorials/packages/gluon/text/transformer.rst
@@ -603,5 +603,5 @@ Neural Information Processing Systems. 2017.
 stochastic approximation by averaging." SIAM Journal on Control and
 Optimization. 1992.
 
-.. |transformer| image:: transformer.png
+.. |transformer| image:: /_static/transformer.png
 
diff --git a/docs/python_docs/python/tutorials/packages/index.rst b/docs/python_docs/python/tutorials/packages/index.rst
index f1ee73f..832c222 100644
--- a/docs/python_docs/python/tutorials/packages/index.rst
+++ b/docs/python_docs/python/tutorials/packages/index.rst
@@ -52,6 +52,7 @@ Shared APIs
    .. card::
       :title: Learning Rate
       :link: gluon/training/learning_rates/learning_rate_schedules.html
+
       How to use the Learning Rate Scheduler.
 
    .. card::
diff --git a/docs/python_docs/python/tutorials/packages/viz/index.rst b/docs/python_docs/python/tutorials/packages/viz/index.rst
index 367c8ec..19fdfa6 100644
--- a/docs/python_docs/python/tutorials/packages/viz/index.rst
+++ b/docs/python_docs/python/tutorials/packages/viz/index.rst
@@ -33,7 +33,5 @@ References
 
 .. toctree::
    :hidden:
-   :glob:
 
-   *
-   Visualize networks <https://mxnet.apache.org/api/faq/visualize_graph>
\ No newline at end of file
+   Visualize networks <https://mxnet.apache.org/api/faq/visualize_graph>
diff --git a/docs/python_docs/python/tutorials/performance/backend/index.rst b/docs/python_docs/python/tutorials/performance/backend/index.rst
index c4cfaf6..942f399 100644
--- a/docs/python_docs/python/tutorials/performance/backend/index.rst
+++ b/docs/python_docs/python/tutorials/performance/backend/index.rst
@@ -21,12 +21,6 @@ The following tutorials will help you learn how to use backend tools to boost pe
 
 .. container:: cards
 
-   .. card::
-      :title: TensorRT
-      :link: tensorrt/index.html
-
-      How to use NVIDIA's TensorRT to boost inference performance.
-
   .. card::
      :title: MKL-DNN
      :link: mkldnn/index.html
@@ -57,7 +51,6 @@ The following tutorials will help you learn how to use backend tools to boost pe
    :maxdepth: 1
 
    mkldnn/index
-   tensorrt/index
    tvm
    profiler
-   amp
\ No newline at end of file
+   amp
diff --git a/docs/python_docs/python/tutorials/performance/backend/tensorrt/index.rst b/docs/python_docs/python/tutorials/performance/backend/tensorrt/index.rst
deleted file mode 100644
index 62be33a..0000000
--- a/docs/python_docs/python/tutorials/performance/backend/tensorrt/index.rst
+++ /dev/null
@@ -1,35 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-   or more contributor license agreements.  See the NOTICE file
-   distributed with this work for additional information
-   regarding copyright ownership.  The ASF licenses this file
-   to you under the Apache License, Version 2.0 (the
-   "License"); you may not use this file except in compliance
-   with the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing,
-   software distributed under the License is distributed on an
-   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-   KIND, either express or implied.  See the License for the
-   specific language governing permissions and limitations
-   under the License.
-
-TensorRT
-========
-
-.. container:: cards
-
-   .. card::
-      :title: Get started with TensorRT
-      :link: tensorrt
-
-      A guide on using TensorRT with MXNet.
-
-
-.. toctree::
-   :hidden:
-   :maxdepth: 1
-   :glob:
-
-   *
\ No newline at end of file
diff --git a/docs/python_docs/python/tutorials/performance/backend/tensorrt/wavenet_optimized.svg b/docs/python_docs/python/tutorials/performance/backend/tensorrt/wavenet_optimized.svg
deleted file mode 100644
index 22bdbcd..0000000
--- a/docs/python_docs/python/tutorials/performance/backend/tensorrt/wavenet_optimized.svg
+++ /dev/null
@@ -1 +0,0 @@
-<svg xmlns="http://www.w3.org/2000/svg" id="export" class="graph" preserveAspectRatio="xMidYMid meet" style="" width="1460" height="3825"><rect id="background" fill="#fff" pointer-events="all" width="1460" height="3825"/><g id="origin" transform="translate(66.35292968750001, 66.35292968750001) scale(1)"><g id="clusters" class="clusters"/><g id="edge-paths" class="edge-paths"><defs><marker id="arrowhead-vee" viewBox="0 0 10 10" refX="9" refY="5" markerUnits="strokeWidth" markerWidth="8" m [...]
\ No newline at end of file
diff --git a/docs/python_docs/python/tutorials/performance/backend/tensorrt/wavenet_unoptimized.svg b/docs/python_docs/python/tutorials/performance/backend/tensorrt/wavenet_unoptimized.svg
deleted file mode 100644
index bb39e9e..0000000
--- a/docs/python_docs/python/tutorials/performance/backend/tensorrt/wavenet_unoptimized.svg
+++ /dev/null
@@ -1,17 +0,0 @@
-<svg xmlns="http://www.w3.org/2000/svg" id="export" class="graph" preserveAspectRatio="xMidYMid meet" style="" width="1849" height="7587"><rect id="background" fill="#fff" pointer-events="all" width="1849" height="7587"/><g id="origin" transform="translate(84.00605468750001, 84.00605468750001) scale(1)"><g id="clusters" class="clusters"/><g id="edge-paths" class="edge-paths"><defs><marker id="arrowhead-vee" viewBox="0 0 10 10" refX="9" refY="5" markerUnits="strokeWidth" markerWidth="8" m [...]
-</title></g><line class="node" x1="82" y1="0" x2="82" y2="22" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/><path class="node border" d="M5,0h116.5a5,5 0 0 1 5,5v12a5,5 0 0 1 -5,5h-116.5a5,5 0 0 1 -5,-5v-12a5,5 0 0 1 5,-5z" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/></g></g></g><g id="node-wavenet1_causaldilatedresidue1_conv0_fwd" class="node" transform="translate(271.96484375,988.0000057220459)" style="opacity: 1;"><g transform="translate(-96.218 [...]
-</title></g><line class="node" x1="82" y1="0" x2="82" y2="22" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/><path class="node border" d="M5,0h116.5a5,5 0 0 1 5,5v12a5,5 0 0 1 -5,5h-116.5a5,5 0 0 1 -5,-5v-12a5,5 0 0 1 5,-5z" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/></g></g></g><g id="node-wavenet1_causaldilatedresidue1_conv3_fwd" class="node" transform="translate(546.4140625,1382.500005722046)" style="opacity: 1;"><g transform="translate(-96.2187 [...]
-</title></g><line class="node" x1="82" y1="0" x2="82" y2="22" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/><path class="node border" d="M5,0h116.5a5,5 0 0 1 5,5v12a5,5 0 0 1 -5,5h-116.5a5,5 0 0 1 -5,-5v-12a5,5 0 0 1 5,-5z" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/></g></g></g><g id="node-wavenet1_causaldilatedresidue2_conv0_fwd" class="node" transform="translate(529.9296894073486,1662.000005722046)" style="opacity: 1;"><g transform="translate(-9 [...]
-</title></g><line class="node" x1="82" y1="0" x2="82" y2="22" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/><path class="node border" d="M5,0h116.5a5,5 0 0 1 5,5v12a5,5 0 0 1 -5,5h-116.5a5,5 0 0 1 -5,-5v-12a5,5 0 0 1 5,-5z" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/></g></g></g><g id="node-wavenet1_causaldilatedresidue2_conv3_fwd" class="node" transform="translate(772.3671894073486,2015.500005722046)" style="opacity: 1;"><g transform="translate(-9 [...]
-</title></g><line class="node" x1="82" y1="0" x2="82" y2="22" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/><path class="node border" d="M5,0h116.5a5,5 0 0 1 5,5v12a5,5 0 0 1 -5,5h-116.5a5,5 0 0 1 -5,-5v-12a5,5 0 0 1 5,-5z" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/></g></g></g><g id="node-wavenet1_causaldilatedresidue3_conv0_fwd" class="node" transform="translate(668.3437519073486,2295.000005722046)" style="opacity: 1;"><g transform="translate(-9 [...]
-</title></g><line class="node" x1="82" y1="0" x2="82" y2="22" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/><path class="node border" d="M5,0h116.5a5,5 0 0 1 5,5v12a5,5 0 0 1 -5,5h-116.5a5,5 0 0 1 -5,-5v-12a5,5 0 0 1 5,-5z" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/></g></g></g><g id="node-wavenet1_causaldilatedresidue3_conv3_fwd" class="node" transform="translate(958.7421894073486,2648.500005722046)" style="opacity: 1;"><g transform="translate(-9 [...]
-</title></g><line class="node" x1="82" y1="0" x2="82" y2="22" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/><path class="node border" d="M5,0h116.5a5,5 0 0 1 5,5v12a5,5 0 0 1 -5,5h-116.5a5,5 0 0 1 -5,-5v-12a5,5 0 0 1 5,-5z" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/></g></g></g><g id="node-wavenet1_causaldilatedresidue4_conv0_fwd" class="node" transform="translate(781.9140644073486,2928.000005722046)" style="opacity: 1;"><g transform="translate(-9 [...]
-</title></g><line class="node" x1="82" y1="0" x2="82" y2="22" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/><path class="node border" d="M5,0h116.5a5,5 0 0 1 5,5v12a5,5 0 0 1 -5,5h-116.5a5,5 0 0 1 -5,-5v-12a5,5 0 0 1 5,-5z" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/></g></g></g><g id="node-wavenet1_causaldilatedresidue4_conv3_fwd" class="node" transform="translate(1072.3125019073486,3281.500005722046)" style="opacity: 1;"><g transform="translate(- [...]
-</title></g><line class="node" x1="82" y1="0" x2="82" y2="22" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/><path class="node border" d="M5,0h116.5a5,5 0 0 1 5,5v12a5,5 0 0 1 -5,5h-116.5a5,5 0 0 1 -5,-5v-12a5,5 0 0 1 5,-5z" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/></g></g></g><g id="node-wavenet1_causaldilatedresidue5_conv0_fwd" class="node" transform="translate(895.4843769073486,3561.000005722046)" style="opacity: 1;"><g transform="translate(-9 [...]
-</title></g><line class="node" x1="82" y1="0" x2="82" y2="22" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/><path class="node border" d="M5,0h116.5a5,5 0 0 1 5,5v12a5,5 0 0 1 -5,5h-116.5a5,5 0 0 1 -5,-5v-12a5,5 0 0 1 5,-5z" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/></g></g></g><g id="node-wavenet1_causaldilatedresidue5_conv3_fwd" class="node" transform="translate(1185.8828144073486,3914.500005722046)" style="opacity: 1;"><g transform="translate(- [...]
-</title></g><line class="node" x1="82" y1="0" x2="82" y2="22" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/><path class="node border" d="M5,0h116.5a5,5 0 0 1 5,5v12a5,5 0 0 1 -5,5h-116.5a5,5 0 0 1 -5,-5v-12a5,5 0 0 1 5,-5z" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/></g></g></g><g id="node-wavenet1_causaldilatedresidue6_conv0_fwd" class="node" transform="translate(1009.0546894073486,4194.000005722046)" style="opacity: 1;"><g transform="translate(- [...]
-</title></g><line class="node" x1="82" y1="0" x2="82" y2="22" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/><path class="node border" d="M5,0h116.5a5,5 0 0 1 5,5v12a5,5 0 0 1 -5,5h-116.5a5,5 0 0 1 -5,-5v-12a5,5 0 0 1 5,-5z" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/></g></g></g><g id="node-wavenet1_causaldilatedresidue6_conv3_fwd" class="node" transform="translate(1299.4531269073486,4547.500005722046)" style="opacity: 1;"><g transform="translate(- [...]
-</title></g><line class="node" x1="82" y1="0" x2="82" y2="22" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/><path class="node border" d="M5,0h116.5a5,5 0 0 1 5,5v12a5,5 0 0 1 -5,5h-116.5a5,5 0 0 1 -5,-5v-12a5,5 0 0 1 5,-5z" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/></g></g></g><g id="node-wavenet1_causaldilatedresidue7_conv0_fwd" class="node" transform="translate(1122.6250019073486,4827.000005722046)" style="opacity: 1;"><g transform="translate(- [...]
-</title></g><line class="node" x1="82" y1="0" x2="82" y2="22" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/><path class="node border" d="M5,0h116.5a5,5 0 0 1 5,5v12a5,5 0 0 1 -5,5h-116.5a5,5 0 0 1 -5,-5v-12a5,5 0 0 1 5,-5z" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/></g></g></g><g id="node-wavenet1_causaldilatedresidue7_conv3_fwd" class="node" transform="translate(1365.0625019073486,5180.500005722046)" style="opacity: 1;"><g transform="translate(- [...]
-</title></g><line class="node" x1="82" y1="0" x2="82" y2="22" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/><path class="node border" d="M5,0h116.5a5,5 0 0 1 5,5v12a5,5 0 0 1 -5,5h-116.5a5,5 0 0 1 -5,-5v-12a5,5 0 0 1 5,-5z" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/></g></g></g><g id="node-wavenet1_causaldilatedresidue8_conv0_fwd" class="node" transform="translate(1309.4531269073486,5460.000005722046)" style="opacity: 1;"><g transform="translate(- [...]
-</title></g><line class="node" x1="82" y1="0" x2="82" y2="22" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/><path class="node border" d="M5,0h116.5a5,5 0 0 1 5,5v12a5,5 0 0 1 -5,5h-116.5a5,5 0 0 1 -5,-5v-12a5,5 0 0 1 5,-5z" style="stroke: rgb(51, 51, 51); fill: none; stroke-width: 1px;"/></g></g></g><g id="node-wavenet1_elu0__greater_scalar0" class="node" transform="translate(775.7343769073486,6383.5000076293945)" style="opacity: 1;"><g transform="translate(-63.1015625 [...]
\ No newline at end of file
diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py
index 501ec3b..77c0a4b 100644
--- a/python/mxnet/__init__.py
+++ b/python/mxnet/__init__.py
@@ -91,7 +91,9 @@ from . import _deferred_compute
 # checks the __version__ attr of MXNet, which is not set on kvstore server due to the
 # fact that kvstore-server module is imported before the __version__ attr is set.
 # use mx.kv as short for mx.kvstore
+from . import kvstore
 from . import kvstore as kv
+from .kvstore import kvstore_server
 
 # Dynamic library module should be done after ndarray and symbol are initialized
 from . import library
diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py
index c519816..3d515c9 100644
--- a/python/mxnet/gluon/data/dataloader.py
+++ b/python/mxnet/gluon/data/dataloader.py
@@ -320,17 +320,18 @@ class DataLoaderV1(object):
         The sampler to use. Either specify sampler or shuffle, not both.
     last_batch : {'keep', 'discard', 'rollover'}
         How to handle the last batch if batch_size does not evenly divide
-        `len(dataset)`.
-
-        keep - A batch with less samples than previous batches is returned.
-        discard - The last batch is discarded if its incomplete.
-        rollover - The remaining samples are rolled over to the next epoch.
+        ``len(dataset)``:
+        - ``keep`` - A batch with fewer samples than previous batches is returned.
+        - ``discard`` - The last batch is discarded if it is incomplete.
+        - ``rollover`` - The remaining samples are rolled over to the next epoch.
     batch_sampler : Sampler
         A sampler that returns mini-batches. Do not specify batch_size,
         shuffle, sampler, and last_batch if batch_sampler is specified.
     batchify_fn : callable
         Callback function to allow users to specify how to merge samples
-        into a batch. Defaults to `default_batchify_fn`::
+        into a batch. Defaults to ``default_batchify_fn``.
+
+        .. code-block:: python
 
             def default_batchify_fn(data):
                 if isinstance(data[0], nd.NDArray):
@@ -526,7 +527,7 @@ class DataLoader(object):
         The sampler to use. Either specify sampler or shuffle, not both.
     last_batch : {'keep', 'discard', 'rollover'}
         How to handle the last batch if batch_size does not evenly divide
-        `len(dataset)`.
+        ``len(dataset)``.
 
         keep - A batch with less samples than previous batches is returned.
         discard - The last batch is discarded if its incomplete.
@@ -536,7 +537,21 @@ class DataLoader(object):
         shuffle, sampler, and last_batch if batch_sampler is specified.
     batchify_fn : callable
         Callback function to allow users to specify how to merge samples
-        into a batch. Defaults to `gluon.data.batchify.Stack()`::
+        into a batch. Defaults to ``gluon.data.batchify.Stack()``.
+
+        .. code-block:: python
+
+            def default_batchify_fn(data):
+                if isinstance(data[0], nd.NDArray):
+                    return nd.stack(*data)
+                elif isinstance(data[0], np.ndarray):
+                    return np.stack(*data)
+                elif isinstance(data[0], tuple):
+                    data = zip(*data)
+                    return [default_batchify_fn(i) for i in data]
+                else:
+                    data = np.asarray(data)
+                    return np.ndarray(data, dtype=data.dtype)
 
     num_workers : int, default 0
         The number of multiprocessing workers to use for data preprocessing.
diff --git a/python/mxnet/gluon/metric.py b/python/mxnet/gluon/metric.py
index 766a95a..de0beed 100644
--- a/python/mxnet/gluon/metric.py
+++ b/python/mxnet/gluon/metric.py
@@ -373,6 +373,7 @@ class Accuracy(EvalMetric):
     The accuracy score is defined as
 
     .. math::
+
         \\text{accuracy}(y, \\hat{y}) = \\frac{1}{n} \\sum_{i=0}^{n-1}
         \\text{1}(\\hat{y_i} == y_i)
 
@@ -947,6 +948,7 @@ class MCC(EvalMetric):
     MCC of 0 is uncorrelated, 1 is completely correlated, and -1 is negatively correlated.
 
     .. math::
+
         \\text{MCC} = \\frac{ TP \\times TN - FP \\times FN }
         {\\sqrt{ (TP + FP) ( TP + FN ) ( TN + FP ) ( TN + FN ) } }
 
@@ -1039,6 +1041,7 @@ class MAE(EvalMetric):
     The mean absolute error is given by
 
     .. math::
+
         \\frac{\\sum_i^n |y_i - \\hat{y}_i|}{n}
 
     Parameters
@@ -1099,6 +1102,7 @@ class MSE(EvalMetric):
     The mean squared error is given by
 
     .. math::
+
         \\frac{\\sum_i^n (y_i - \\hat{y}_i)^2}{n}
 
     Parameters
@@ -1158,6 +1162,7 @@ class RMSE(MSE):
     The root mean squared error is given by
 
     .. math::
+
         \\sqrt{\\frac{\\sum_i^n (y_i - \\hat{y}_i)^2}{n}}
 
     Parameters
@@ -1200,6 +1205,7 @@ class MeanPairwiseDistance(EvalMetric):
     The mean pairwise distance is given by
 
     .. math::
+
         \\sqrt{\\frac{(\\sum_i^n (y_i - \\hat{y}_i)^p)^\\frac{1}{p}}{n}}
 
     Parameters
@@ -1261,13 +1267,15 @@ class MeanPairwiseDistance(EvalMetric):
 @register
 @use_np
 class MeanCosineSimilarity(EvalMetric):
-    """Computes Mean Cosine Similarity.
+    r"""Computes Mean Cosine Similarity.
 
     The mean cosine similarity is given by
 
     .. math::
+
         cos_sim(label, pred) = \frac{{label}.{pred}}{max(||label||.||pred||, eps)}
-    (calculating on the last dimension of label and pred.)
+
+    Calculation happens on the last dimension of label and pred.
 
     Parameters
     ----------
@@ -1338,6 +1346,7 @@ class CrossEntropy(EvalMetric):
     The cross entropy over a batch of sample size :math:`N` is given by
 
     .. math::
+
        -\\sum_{n=1}^{N}\\sum_{k=1}^{K}t_{nk}\\log (y_{nk}),
 
     where :math:`t_{nk}=1` if and only if sample :math:`n` belongs to class :math:`k`.
@@ -1431,6 +1440,7 @@ class Perplexity(CrossEntropy):
     The perplexity of a model q is defined as
 
     .. math::
+
         b^{\\big(-\\frac{1}{N} \\sum_{i=1}^N \\log_b q(x_i) \\big)}
         = \\exp \\big(-\\frac{1}{N} \\sum_{i=1}^N \\log q(x_i)\\big)
 
@@ -1497,6 +1507,7 @@ class PearsonCorrelation(EvalMetric):
     The pearson correlation is given by
 
     .. math::
+
         \\frac{cov(y, \\hat{y})}{\\sigma{y}\\sigma{\\hat{y}}}
 
     Parameters
@@ -1588,6 +1599,7 @@ class PCC(EvalMetric):
     from a discrete solution to the Pearson correlation coefficient.
 
     .. math::
+
         \\text{PCC} = \\frac {\\sum _{k}\\sum _{l}\\sum _{m}C_{kk}C_{lm}-C_{kl}C_{mk}}
         {{\\sqrt {\\sum _{k}(\\sum _{l}C_{kl})(\\sum _{k'|k'\\neq k}\\sum _{l'}C_{k'l'})}}
          {\\sqrt {\\sum _{k}(\\sum _{l}C_{lk})(\\sum _{k'|k'\\neq k}\\sum _{l'}C_{l'k'})}}}
diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py
index 017d97e..93fe9ec 100644
--- a/python/mxnet/gluon/trainer.py
+++ b/python/mxnet/gluon/trainer.py
@@ -56,7 +56,7 @@ class Trainer(object):
         constructor for a list of additional supported arguments.
     kvstore : str or KVStore
         kvstore type for multi-gpu and distributed training. See help on
-        :any:`mxnet.kvstore.create` for more information.
+        :func:`mxnet.kvstore.create` for more information.
     compression_params : dict
         Specifies type of gradient compression and additional arguments depending
         on the type of compression being used. For example, 2bit compression requires a threshold.
diff --git a/python/mxnet/io/io.py b/python/mxnet/io/io.py
index 8e57c19..4d78cd9 100644
--- a/python/mxnet/io/io.py
+++ b/python/mxnet/io/io.py
@@ -103,8 +103,8 @@ class DataDesc(namedtuple('DataDesc', ['name', 'shape'])):
 
         Parameters
         ----------
-        shapes : a tuple of (name_, shape_)
-        types : a tuple of  (name_, np.dtype)
+        shapes : a tuple of (name, shape)
+        types : a tuple of  (name, np.dtype)
         """
         if types is not None:
             type_dict = dict(types)
diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py
index a6eae05..39c9388 100644
--- a/python/mxnet/ndarray/ndarray.py
+++ b/python/mxnet/ndarray/ndarray.py
@@ -5009,29 +5009,29 @@ from_numpy.__doc__ = from_numpy_doc
 
 to_dlpack_for_read = ndarray_to_dlpack_for_read()
 to_dlpack_for_read_doc = """Returns a reference view of NDArray that represents as DLManagedTensor until
-       all previous write operations on the current array are finished.
-
-    Parameters
-    ----------
-    data: NDArray
-        input data.
-
-    Returns
-    -------
-    PyCapsule (the pointer of DLManagedTensor)
-        a reference view of NDArray that represents as DLManagedTensor.
-
-    Examples
-    --------
-    >>> x = mx.nd.ones((2,3))
-    >>> y = mx.nd.to_dlpack_for_read(x)
-    >>> type(y)
-    <class 'PyCapsule'>
-    >>> z = mx.nd.from_dlpack(y)
-    >>> z
-    [[1. 1. 1.]
-     [1. 1. 1.]]
-    <NDArray 2x3 @cpu(0)>
+all previous write operations on the current array are finished.
+
+Parameters
+----------
+data : NDArray
+    input data.
+
+Returns
+-------
+PyCapsule : the pointer of DLManagedTensor
+    a reference view of NDArray that represents as DLManagedTensor.
+
+Examples
+--------
+>>> x = mx.nd.ones((2,3))
+>>> y = mx.nd.to_dlpack_for_read(x)
+>>> type(y)
+<class 'PyCapsule'>
+>>> z = mx.nd.from_dlpack(y)
+>>> z
+[[1. 1. 1.]
+ [1. 1. 1.]]
+<NDArray 2x3 @cpu(0)>
 """
 to_dlpack_for_read.__doc__ = to_dlpack_for_read_doc
 
@@ -5039,27 +5039,27 @@ to_dlpack_for_write = ndarray_to_dlpack_for_write()
 to_dlpack_for_write_doc = """Returns a reference view of NDArray that represents as
 DLManagedTensor until all previous read/write operations on the current array are finished.
 
-    Parameters
-    ----------
-    data: NDArray
-        input data.
-
-    Returns
-    -------
-    PyCapsule (the pointer of DLManagedTensor)
-        a reference view of NDArray that represents as DLManagedTensor.
-
-    Examples
-    --------
-    >>> x = mx.nd.ones((2,3))
-    >>> w = mx.nd.to_dlpack_for_write(x)
-    >>> type(w)
-    <class 'PyCapsule'>
-    >>> u = mx.nd.from_dlpack(w)
-    >>> u += 1
-    >>> x
-    [[2. 2. 2.]
-     [2. 2. 2.]]
-    <NDArray 2x3 @cpu(0)>
+Parameters
+----------
+data : NDArray
+    input data.
+
+Returns
+-------
+PyCapsule : the pointer of DLManagedTensor
+    a reference view of NDArray that represents as DLManagedTensor.
+
+Examples
+--------
+>>> x = mx.nd.ones((2,3))
+>>> w = mx.nd.to_dlpack_for_write(x)
+>>> type(w)
+<class 'PyCapsule'>
+>>> u = mx.nd.from_dlpack(w)
+>>> u += 1
+>>> x
+[[2. 2. 2.]
+ [2. 2. 2.]]
+<NDArray 2x3 @cpu(0)>
 """
 to_dlpack_for_write.__doc__ = to_dlpack_for_write_doc
diff --git a/python/mxnet/numpy/fallback.py b/python/mxnet/numpy/fallback.py
index 7ecfb15..c7687ff 100644
--- a/python/mxnet/numpy/fallback.py
+++ b/python/mxnet/numpy/fallback.py
@@ -15,13 +15,13 @@
 # specific language governing permissions and limitations
 # under the License.
 
+# pylint: disable=undefined-all-variable, not-callable, cell-var-from-loop
 """Operators that fallback to official NumPy implementation."""
 
-
+import sys
 import numpy as onp
 
-
-__all__ = [
+fallbacks = [
     '__version__',
     '_NoValue',
     'allclose',
@@ -38,6 +38,7 @@ __all__ = [
     'correlate',
     'count_nonzero',
     'cov',
+    'cumprod',
     'digitize',
     'divmod',
     'dtype',
@@ -69,11 +70,15 @@ __all__ = [
     'nanpercentile',
     'nanprod',
     'nanquantile',
+    'nanstd',
+    'nansum',
+    'nanvar',
     'ndim',
     'npv',
+    'packbits',
     'partition',
     'piecewise',
-    'packbits',
+    'pmt',
     'poly',
     'polyadd',
     'polydiv',
@@ -107,90 +112,25 @@ __all__ = [
     'vander',
 ]
 
-__version__ = onp.__version__
-_NoValue = onp._NoValue
-allclose = onp.allclose
-alltrue = onp.alltrue
-apply_along_axis = onp.apply_along_axis
-apply_over_axes = onp.apply_over_axes
-argpartition = onp.argpartition
-argwhere = onp.argwhere
-array_equal = onp.array_equal
-array_equiv = onp.array_equiv
-choose = onp.choose
-compress = onp.compress
-corrcoef = onp.corrcoef
-correlate = onp.correlate
-count_nonzero = onp.count_nonzero
-cov = onp.cov
-digitize = onp.digitize
-divmod = onp.divmod
-dtype = onp.dtype
-extract = onp.extract
-float_power = onp.float_power
-frexp = onp.frexp
-heaviside = onp.heaviside
-histogram2d = onp.histogram2d
-histogram_bin_edges = onp.histogram_bin_edges
-histogramdd = onp.histogramdd
-i0 = onp.i0
-in1d = onp.in1d
-intersect1d = onp.intersect1d
-isclose = onp.isclose
-isin = onp.isin
-ix_ = onp.ix_
-lexsort = onp.lexsort
-min_scalar_type = onp.min_scalar_type
-mirr = onp.mirr
-modf = onp.modf
-msort = onp.msort
-nanargmax = onp.nanargmax
-nanargmin = onp.nanargmin
-nancumprod = onp.nancumprod
-nancumsum = onp.nancumsum
-nanmax = onp.nanmax
-nanmedian = onp.nanmedian
-nanmin = onp.nanmin
-nanpercentile = onp.nanpercentile
-nanprod = onp.nanprod
-nanquantile = onp.nanquantile
-nanstd = onp.nanstd
-nansum = onp.nansum
-nanvar = onp.nanvar
-ndim = onp.ndim
-npv = onp.npv
-partition = onp.partition
-packbits = onp.packbits
-piecewise = onp.piecewise
-pmt = onp.pmt
-poly = onp.poly
-polyadd = onp.polyadd
-polydiv = onp.polydiv
-polyfit = onp.polyfit
-polyint = onp.polyint
-polymul = onp.polymul
-polysub = onp.polysub
-positive = onp.positive
-ppmt = onp.ppmt
-promote_types = onp.promote_types
-ptp = onp.ptp
-pv = onp.pv
-rate = onp.rate
-real = onp.real
-result_type = onp.result_type
-roots = onp.roots
-searchsorted = onp.searchsorted
-select = onp.select
-setdiff1d = onp.setdiff1d
-setxor1d = onp.setxor1d
-signbit = onp.signbit
-size = onp.size
-spacing = onp.spacing
-take_along_axis = onp.take_along_axis
-trapz = onp.trapz
-tril_indices_from = onp.tril_indices_from
-trim_zeros = onp.trim_zeros
-union1d = onp.union1d
-unpackbits = onp.unpackbits
-unwrap = onp.unwrap
-vander = onp.vander
+fallback_mod = sys.modules[__name__]
+
+for obj_name in fallbacks:
+    onp_obj = getattr(onp, obj_name)
+    if callable(onp_obj):
+        def fn(*args, _onp_obj=onp_obj, **kwargs):  # default arg binds this iteration's object
+            return _onp_obj(*args, **kwargs)  # a bare `onp_obj` would resolve to the last loop value
+        new_fn_doc = onp_obj.__doc__
+        if obj_name in {'divmod', 'float_power', 'frexp', 'heaviside', 'modf', 'signbit', 'spacing'}:
+            # strip the ufunc **kwargs entry: its :ref: target does not exist in these docs
+            new_fn_doc = new_fn_doc.replace("**kwargs\n    For other keyword-only arguments, see the"
+                                            + "\n    :ref:`ufunc docs <ufuncs.kwargs>`.", '')
+        elif obj_name == 'trapz':
+            # strip the footnote that nothing in the docstring cites
+            new_fn_doc = new_fn_doc.replace(
+                '.. [1] Wikipedia page: https://en.wikipedia.org/wiki/Trapezoidal_rule', '')
+        fn.__doc__ = new_fn_doc  # the wrapper carries the edited doc; NumPy's own object is untouched
+        setattr(fallback_mod, obj_name, fn)
+    else:
+        setattr(fallback_mod, obj_name, onp_obj)  # non-callables (e.g. __version__) are re-exported as-is
+
+__all__ = fallbacks
diff --git a/python/mxnet/numpy/linalg.py b/python/mxnet/numpy/linalg.py
index 975b889..5713d7d 100644
--- a/python/mxnet/numpy/linalg.py
+++ b/python/mxnet/numpy/linalg.py
@@ -27,13 +27,13 @@ __all__ += fallback_linalg.__all__
 
 
 def matrix_rank(M, tol=None, hermitian=False):
-    """
-    Return matrix rank of array using SVD method
+    r"""Return matrix rank of array using SVD method
 
     Rank of the array is the number of singular values of the array that are
     greater than `tol`.
 
     Parameters
+    ----------
     M : {(M,), (..., M, N)} ndarray
         Input vector or stack of matrices.
     tol : (...) ndarray, float, optional
@@ -271,8 +271,7 @@ def norm(x, ord=None, axis=None, keepdims=False):
 
 
 def svd(a):
-    r"""
-    Singular Value Decomposition.
+    r"""Singular Value Decomposition.
 
     When `a` is a 2D array, it is factorized as ``ut @ np.diag(s) @ v``,
     where `ut` and `v` are 2D orthonormal arrays and `s` is a 1D
@@ -297,36 +296,34 @@ def svd(a):
         Orthonormal array(s). The first ``a.ndim - 2`` dimensions have the same
         size as those of the input `a`.
 
-    Notes
-    -----
-
-    The decomposition is performed using LAPACK routine ``_gesvd``.
-
-    SVD is usually described for the factorization of a 2D matrix :math:`A`.
-    The higher-dimensional case will be discussed below. In the 2D case, SVD is
-    written as :math:`A = U^T S V`, where :math:`A = a`, :math:`U^T = ut`,
-    :math:`S= \mathtt{np.diag}(s)` and :math:`V = v`. The 1D array `s`
-    contains the singular values of `a` and `ut` and `v` are orthonormal. The rows
-    of `v` are the eigenvectors of :math:`A^T A` and the columns of `ut` are
-    the eigenvectors of :math:`A A^T`. In both cases the corresponding
-    (possibly non-zero) eigenvalues are given by ``s**2``.
-
-    The sign of rows of `u` and `v` are determined as described in
-    `Auto-Differentiating Linear Algebra <https://arxiv.org/pdf/1710.08717.pdf>`_.
-
-    If `a` has more than two dimensions, then broadcasting rules apply.
-    This means that SVD is working in "stacked" mode: it iterates over
-    all indices of the first ``a.ndim - 2`` dimensions and for each
-    combination SVD is applied to the last two indices. The matrix `a`
-    can be reconstructed from the decomposition with either
-    ``(ut * s[..., None, :]) @ v`` or
-    ``ut @ (s[..., None] * v)``. (The ``@`` operator denotes batch matrix multiplication)
-
-    This function differs from the original `numpy.linalg.svd
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.svd.html>`_ in
-    the following way(s):
-     - The sign of rows of `u` and `v` may differ.
-     - Does not support complex input.
+    .. note::
+       The decomposition is performed using LAPACK routine ``_gesvd``.
+
+       SVD is usually described for the factorization of a 2D matrix :math:`A`.
+       The higher-dimensional case will be discussed below. In the 2D case, SVD is
+       written as :math:`A = U^T S V`, where :math:`A = a`, :math:`U^T = ut`,
+       :math:`S= \mathtt{np.diag}(s)` and :math:`V = v`. The 1D array `s`
+       contains the singular values of `a` and `ut` and `v` are orthonormal. The rows
+       of `v` are the eigenvectors of :math:`A^T A` and the columns of `ut` are
+       the eigenvectors of :math:`A A^T`. In both cases the corresponding
+       (possibly non-zero) eigenvalues are given by ``s**2``.
+
+       The sign of rows of `u` and `v` are determined as described in
+       `Auto-Differentiating Linear Algebra <https://arxiv.org/pdf/1710.08717.pdf>`_.
+
+       If `a` has more than two dimensions, then broadcasting rules apply.
+       This means that SVD is working in "stacked" mode: it iterates over
+       all indices of the first ``a.ndim - 2`` dimensions and for each
+       combination SVD is applied to the last two indices. The matrix `a`
+       can be reconstructed from the decomposition with either
+       ``(ut * s[..., None, :]) @ v`` or
+       ``ut @ (s[..., None] * v)``. (The ``@`` operator denotes batch matrix multiplication)
+
+       This function differs from the original `numpy.linalg.svd
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.svd.html>`_ in
+       the following way(s):
+       * The sign of rows of `u` and `v` may differ.
+       * Does not support complex input.
 
     Examples
     --------
@@ -775,8 +772,7 @@ def tensorsolve(a, b, axes=None):
 
 
 def eigvals(a):
-    r"""
-    Compute the eigenvalues of a general matrix.
+    r"""Compute the eigenvalues of a general matrix.
 
     Main difference between `eigvals` and `eig`: the eigenvectors aren't
     returned.
@@ -803,18 +799,17 @@ def eigvals(a):
     eigh : eigenvalues and eigenvectors of a real symmetric array.
     eigvalsh : eigenvalues of a real symmetric.
 
-    Notes
-    -----
-    Broadcasting rules apply, see the `numpy.linalg` documentation for
-    details.
+    .. note::
+       Broadcasting rules apply, see the `numpy.linalg` documentation for
+       details.
 
-    This is implemented using the ``_geev`` LAPACK routines which compute
-    the eigenvalues and eigenvectors of general square arrays.
+       This is implemented using the ``_geev`` LAPACK routines which compute
+       the eigenvalues and eigenvectors of general square arrays.
 
-    This function differs from the original `numpy.linalg.eigvals
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eigvals.html>`_ in
-    the following way(s):
-     - Does not support complex input and output.
+       This function differs from the original `numpy.linalg.eigvals
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eigvals.html>`_ in
+       the following way(s):
+       * Does not support complex input and output.
 
     Examples
     --------
@@ -824,6 +819,7 @@ def eigvals(a):
     of `Q`), preserves the eigenvalues of the "middle" matrix.  In other words,
     if `Q` is orthogonal, then ``Q * A * Q.T`` has the same eigenvalues as
     ``A``:
+
     >>> from numpy import linalg as LA
     >>> x = np.random.random()
     >>> Q = np.array([[np.cos(x), -np.sin(x)], [np.sin(x), np.cos(x)]])
@@ -831,6 +827,7 @@ def eigvals(a):
     (1.0, 1.0, 0.0)
 
     Now multiply a diagonal matrix by ``Q`` on one side and by ``Q.T`` on the other:
+
     >>> D = np.diag((-1,1))
     >>> LA.eigvals(D)
     array([-1.,  1.])
@@ -843,8 +840,7 @@ def eigvals(a):
 
 
 def eigvalsh(a, UPLO='L'):
-    r"""
-    Compute the eigenvalues real symmetric matrix.
+    r"""Compute the eigenvalues of a real symmetric matrix.
 
     Main difference from eigh: the eigenvectors are not computed.
 
@@ -877,24 +873,23 @@ def eigvalsh(a, UPLO='L'):
     eigvals : eigenvalues of a non-symmetric array.
     eigh : eigenvalues and eigenvectors of a real symmetric array.
 
-    Notes
-    -----
-    Broadcasting rules apply, see the `numpy.linalg` documentation for
-    details.
+    .. note::
+       Broadcasting rules apply, see the `numpy.linalg` documentation for
+       details.
 
-    The eigenvalues are computed using LAPACK routines ``_syevd``.
+       The eigenvalues are computed using LAPACK routines ``_syevd``.
 
-    This function differs from the original `numpy.linalg.eigvalsh
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eigvalsh.html>`_ in
-    the following way(s):
-     - Does not support complex input and output.
+       This function differs from the original `numpy.linalg.eigvalsh
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eigvalsh.html>`_ in
+       the following way(s):
+       * Does not support complex input and output.
 
     Examples
     --------
     >>> from numpy import linalg as LA
     >>> a = np.array([[ 5.4119368 ,  8.996273  , -5.086096  ],
-                      [ 0.8866155 ,  1.7490431 , -4.6107802 ],
-                      [-0.08034172,  4.4172044 ,  1.4528792 ]])
+    ...               [ 0.8866155 ,  1.7490431 , -4.6107802 ],
+    ...               [-0.08034172,  4.4172044 ,  1.4528792 ]])
     >>> LA.eigvalsh(a, UPLO='L')
     array([-2.87381886,  5.10144682,  6.38623114]) # in ascending order
     """
@@ -932,32 +927,31 @@ def eig(a):
     eigh : eigenvalues and eigenvectors of a real symmetric array.
     eigvalsh : eigenvalues of a real symmetric.
 
-    Notes
-    -----
-    This is implemented using the ``_geev`` LAPACK routines which compute
-    the eigenvalues and eigenvectors of general square arrays.
+    .. note::
+       This is implemented using the ``_geev`` LAPACK routines which compute
+       the eigenvalues and eigenvectors of general square arrays.
 
-    The number `w` is an eigenvalue of `a` if there exists a vector
-    `v` such that ``dot(a,v) = w * v``. Thus, the arrays `a`, `w`, and
-    `v` satisfy the equations ``dot(a[:,:], v[:,i]) = w[i] * v[:,i]``
-    for :math:`i \\in \\{0,...,M-1\\}`.
+       The number `w` is an eigenvalue of `a` if there exists a vector
+       `v` such that ``dot(a,v) = w * v``. Thus, the arrays `a`, `w`, and
+       `v` satisfy the equations ``dot(a[:,:], v[:,i]) = w[i] * v[:,i]``
+       for :math:`i \\in \\{0,...,M-1\\}`.
 
-    The array `v` of eigenvectors may not be of maximum rank, that is, some
-    of the columns may be linearly dependent, although round-off error may
-    obscure that fact. If the eigenvalues are all different, then theoretically
-    the eigenvectors are linearly independent.
+       The array `v` of eigenvectors may not be of maximum rank, that is, some
+       of the columns may be linearly dependent, although round-off error may
+       obscure that fact. If the eigenvalues are all different, then theoretically
+       the eigenvectors are linearly independent.
 
-    This function differs from the original `numpy.linalg.eig
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eig.html>`_ in
-    the following way(s):
-     - Does not support complex input and output.
+       This function differs from the original `numpy.linalg.eig
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eig.html>`_ in
+       the following way(s):
+       * Does not support complex input and output.
 
     Examples
     --------
     >>> from numpy import linalg as LA
     >>> a = np.array([[-1.9147992 ,  6.054115  , 18.046988  ],
-                      [ 0.77563655, -4.860152  ,  2.1012988 ],
-                      [ 2.6083658 ,  2.3705218 ,  0.3192524 ]])
+    ...               [ 0.77563655, -4.860152  ,  2.1012988 ],
+    ...               [ 2.6083658 ,  2.3705218 ,  0.3192524 ]])
     >>> w, v = LA.eig(a)
     >>> w
     array([ 6.9683027, -7.768063 , -5.655937 ])
@@ -970,8 +964,7 @@ def eig(a):
 
 
 def eigh(a, UPLO='L'):
-    r"""
-    Return the eigenvalues and eigenvectors real symmetric matrix.
+    r"""Return the eigenvalues and eigenvectors of a real symmetric matrix.
 
     Returns two objects, a 1-D array containing the eigenvalues of `a`, and
     a 2-D square array or matrix (depending on the input type) of the
@@ -1010,21 +1003,21 @@ def eigh(a, UPLO='L'):
     eigvals : eigenvalues of a non-symmetric array.
     eigvalsh : eigenvalues of a real symmetric.
 
-    Notes
-    -----
-    The eigenvalues/eigenvectors are computed using LAPACK routines ``_syevd``.
+    .. note::
+
+       The eigenvalues/eigenvectors are computed using LAPACK routines ``_syevd``.
 
-    This function differs from the original `numpy.linalg.eigh
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eigh.html>`_ in
-    the following way(s):
-     - Does not support complex input and output.
+       This function differs from the original `numpy.linalg.eigh
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eigh.html>`_ in
+       the following way(s):
+       * Does not support complex input and output.
 
     Examples
     --------
     >>> from numpy import linalg as LA
     >>> a = np.array([[ 6.8189726 , -3.926585  ,  4.3990498 ],
-                      [-0.59656644, -1.9166266 ,  9.54532   ],
-                      [ 2.1093285 ,  0.19688708, -1.1634291 ]])
+    ...               [-0.59656644, -1.9166266 ,  9.54532   ],
+    ...               [ 2.1093285 ,  0.19688708, -1.1634291 ]])
     >>> w, v = LA.eigh(a, UPLO='L')
     >>> w
     array([-2.175445 , -1.4581827,  7.3725457])
diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py
index 13f97dc..13fb94b 100644
--- a/python/mxnet/numpy/multiarray.py
+++ b/python/mxnet/numpy/multiarray.py
@@ -1036,7 +1036,7 @@ class ndarray(NDArray):
 
     @wrap_mxnp_np_ufunc
     def __ior__(self, other):
-        """x.__ior__(y) <=> x |= y"""
+        r"""x.__ior__(y) <=> x \|= y"""
         return bitwise_or(self, other, out=self)
 
     @wrap_mxnp_np_ufunc
@@ -1091,7 +1091,7 @@ class ndarray(NDArray):
 
     @wrap_mxnp_np_ufunc
     def __imul__(self, other):
-        """x.__imul__(y) <=> x *= y"""
+        r"""x.__imul__(y) <=> x \*= y"""
         if not self.writable:
             raise ValueError('trying to add to a readonly ndarray')
         return multiply(self, other, out=self)
@@ -1389,9 +1389,9 @@ class ndarray(NDArray):
         ----------
         grad_req : {'write', 'add', 'null'}
             How gradient will be accumulated.
-            - 'write': gradient will be overwritten on every backward.
-            - 'add': gradient will be added to existing value on every backward.
-            - 'null': do not compute gradient for this NDArray.
+            * 'write': gradient will be overwritten on every backward.
+            * 'add': gradient will be added to existing value on every backward.
+            * 'null': do not compute gradient for this NDArray.
         """
         grad = _mx_nd_np.zeros_like(self)  # pylint: disable=undefined-variable
         grad_req = _GRAD_REQ_MAP[grad_req]
@@ -1460,9 +1460,9 @@ class ndarray(NDArray):
         -----
         This function differs from the official `ndarray`'s ``astype`` function in the following
         aspects:
-            - `order` only supports 'C' and 'K'.
-            - `casting` only supports 'unsafe'.
-            - `subok` only supports ``True``.
+            * `order` only supports 'C' and 'K'.
+            * `casting` only supports 'unsafe'.
+            * `subok` only supports ``True``.
         """
         if order is not None and order != 'K' and order != 'C':
             raise ValueError('order must be either \'K\' or \'C\'')
@@ -2492,8 +2492,10 @@ def array(object, dtype=None, ctx=None):
         The desired data-type for the array.
         The default dtype is ``object.dtype`` if `object` is an `ndarray`, `float32` otherwise.
         Default dtype can be set to be consistent with offical numpy by `npx.set_np(dtype=True)`.
-        - When npx.is_np_default_dtype() returns False, default dtype is float32;
-        - When npx.is_np_default_dtype() returns True, default dtype is float64.
+
+        * When npx.is_np_default_dtype() returns False, default dtype is float32;
+        * When npx.is_np_default_dtype() returns True, default dtype is float64.
+
     ctx : device context, optional
         Device context on which the memory is allocated. Default is
         `mxnet.context.current_context()`.
@@ -2709,8 +2711,7 @@ def broadcast_to(array, shape):  # pylint: disable=redefined-outer-name
 # pylint: disable=too-many-arguments, redefined-outer-name
 @set_module('mxnet.numpy')
 def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None):
-    """
-    Return a new array of given shape and type, filled with `fill_value`.
+    r"""Return a new array of given shape and type, filled with `fill_value`.
 
     Parameters
     ----------
@@ -2724,7 +2725,8 @@ def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None):
     order : {'C'}, optional
         Whether to store multidimensional data in C- or Fortran-contiguous
         (row- or column-wise) order in memory. Currently only supports C order.
-    ctx: to specify the device, e.g. the i-th GPU.
+    ctx : mxnet.context.Context
+        The device, e.g. the i-th GPU.
     out : ndarray or None, optional
         A location into which the result is stored.
         If provided, it must have the same shape and dtype as input ndarray.
@@ -2737,15 +2739,12 @@ def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None):
         If `fill_value` is an ndarray, out will have the same context as `fill_value`
         regardless of the provided `ctx`.
 
-    Notes
-    -----
-    This function differs from the original `numpy.full
-    https://docs.scipy.org/doc/numpy/reference/generated/numpy.full.html`_ in
-    the following way(s):
+    .. note::
+       This function differs from the original numpy.full in the following way(s):
 
-    - Has an additional `ctx` argument to specify the device
-    - Has an additional `out` argument
-    - Currently does not support `order` selection
+       * Has an additional `ctx` argument to specify the device
+       * Has an additional `out` argument
+       * Currently does not support `order` selection
 
     See Also
     --------
@@ -3019,14 +3018,13 @@ def take(a, indices, axis=None, mode='raise', out=None):
     out : ndarray
         The returned array has the same type as `a`.
 
-    Notes
-    -----
+    .. note::
 
-    This function differs from the original `numpy.take
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.take.html>`_ in
-    the following way(s):
+       This function differs from the original `numpy.take
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.take.html>`_ in
+       the following way(s):
 
-    - Only ndarray or scalar ndarray is accepted as valid input.
+       * Only ndarray or scalar ndarray is accepted as valid input.
 
     Examples
     --------
@@ -3096,23 +3094,23 @@ def unique(ar, return_index=False, return_inverse=False, return_counts=False, ax
         The number of times each of the unique values comes up in the
         original array. Only provided if `return_counts` is True.
 
-    Notes
-    -----
-    When an axis is specified the subarrays indexed by the axis are sorted.
-    This is done by making the specified axis the first dimension of the array
-    and then flattening the subarrays in C order. The flattened subarrays are
-    then viewed as a structured type with each element given a label, with the
-    effect that we end up with a 1-D array of structured types that can be
-    treated in the same way as any other 1-D array. The result is that the
-    flattened subarrays are sorted in lexicographic order starting with the
-    first element.
-
-    This function differs from the original `numpy.unique
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.unique.html>`_ in
-    the following aspects:
+    .. note::
+
+       When an axis is specified the subarrays indexed by the axis are sorted.
+       This is done by making the specified axis the first dimension of the array
+       and then flattening the subarrays in C order. The flattened subarrays are
+       then viewed as a structured type with each element given a label, with the
+       effect that we end up with a 1-D array of structured types that can be
+       treated in the same way as any other 1-D array. The result is that the
+       flattened subarrays are sorted in lexicographic order starting with the
+       first element.
 
-    - Only support ndarray as input.
-    - Object arrays or structured arrays are not supported.
+       This function differs from the original `numpy.unique
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.unique.html>`_ in
+       the following aspects:
+
+       * Only support ndarray as input.
+       * Object arrays or structured arrays are not supported.
 
     Examples
     --------
@@ -3173,16 +3171,15 @@ def add(x1, x2, out=None, **kwargs):
 
     Returns
     -------
-    add : ndarray or scalar
-        The sum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars.
+    The sum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars.
 
-    Notes
-    -----
-    This operator now supports automatic type promotion. The resulting type will be determined
-    according to the following rules:
-        * If both inputs are of floating number types, the output is the more precise type.
-        * If only one of the inputs is floating number type, the result is that type.
-        * If both inputs are of integer types (including boolean), not supported yet.
+    .. note::
+
+       This operator now supports automatic type promotion. The resulting type will be determined
+       according to the following rules:
+       * If both inputs are of floating number types, the output is the more precise type.
+       * If only one of the inputs is floating number type, the result is that type.
+       * If both inputs are of integer types (including boolean), not supported yet.
 
     Examples
     --------
@@ -3202,8 +3199,7 @@ def add(x1, x2, out=None, **kwargs):
 @set_module('mxnet.numpy')
 @wrap_np_binary_func
 def subtract(x1, x2, out=None, **kwargs):
-    """
-    Subtract arguments element-wise.
+    r"""Subtract arguments element-wise.
 
     Parameters
     ----------
@@ -3211,7 +3207,6 @@ def subtract(x1, x2, out=None, **kwargs):
         The arrays to be subtracted from each other. If x1.shape != x2.shape,
         they must be broadcastable to a common shape (which may be the shape
         of one or the other).
-
     out : ndarray
         A location into which the result is stored. If provided, it must have a shape
         that the inputs broadcast to. If not provided or None, a freshly-allocated array
@@ -3222,13 +3217,12 @@ def subtract(x1, x2, out=None, **kwargs):
     subtract : ndarray or scalar
         The difference of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars.
 
-    Notes
-    -----
-    This operator now supports automatic type promotion. The resulting type will be determined
-    according to the following rules:
-        * If both inputs are of floating number types, the output is the more precise type.
-        * If only one of the inputs is floating number type, the result is that type.
-        * If both inputs are of integer types (including boolean), not supported yet.
+    .. note::
+       This operator now supports automatic type promotion. The resulting type will be determined
+       according to the following rules:
+       * If both inputs are of floating number types, the output is the more precise type.
+       * If only one of the inputs is floating number type, the result is that type.
+       * If both inputs are of integer types (including boolean), not supported yet.
 
     Examples
     --------
@@ -3266,13 +3260,13 @@ def multiply(x1, x2, out=None, **kwargs):
     out : ndarray or scalar
         The difference of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars.
 
-    Notes
-    -----
-    This operator now supports automatic type promotion. The resulting type will be determined
-    according to the following rules:
-        * If both inputs are of floating number types, the output is the more precise type.
-        * If only one of the inputs is floating number type, the result is that type.
-        * If both inputs are of integer types (including boolean), not supported yet.
+    .. note::
+       This operator now supports automatic type promotion. The resulting type will be determined
+       according to the following rules:
+
+       * If both inputs are of floating number types, the output is the more precise type.
+       * If only one of the inputs is floating number type, the result is that type.
+       * If both inputs are of integer types (including boolean), not supported yet.
 
     Examples
     --------
@@ -3291,17 +3285,26 @@ def multiply(x1, x2, out=None, **kwargs):
 @set_module('mxnet.numpy')
 @wrap_np_binary_func
 def divide(x1, x2, out=None, **kwargs):
-    """
-    Returns a true division of the inputs, element-wise.
+    """Returns a true division of the inputs, element-wise.
+
+    .. note::
+       This operator now supports automatic type promotion. The resulting type will be determined
+       according to the following rules:
+
+       * If both inputs are of floating number types, the output is the more precise type.
+       * If only one of the inputs is floating number type, the result is that type.
+       * If both inputs are of integer types including boolean, the output is of float32 or
+         float64 type, which depends on your current default dtype:
+
+         * When ``npx.is_np_default_dtype()`` returns False, default dtype is float32.
+         * When ``npx.is_np_default_dtype()`` returns True, default dtype is float64.
 
     Parameters
     ----------
     x1 : ndarray or scalar
         Dividend array.
-
     x2 : ndarray or scalar
         Divisor array.
-
     out : ndarray
         A location into which the result is stored. If provided, it must have a shape
         that the inputs broadcast to. If not provided or None, a freshly-allocated array
@@ -3312,17 +3315,6 @@ def divide(x1, x2, out=None, **kwargs):
     out : ndarray or scalar
         This is a scalar if both x1 and x2 are scalars.
 
-    Notes
-    -----
-    This operator now supports automatic type promotion. The resulting type will be determined
-    according to the following rules:
-        * If both inputs are of floating number types, the output is the more precise type.
-        * If only one of the inputs is floating number type, the result is that type.
-        * If both inputs are of integer types (including boolean), the output is of float32 or
-          float64 type, which depends on your current default dtype.
-          When npx.is_np_default_dtype() returns False, default dtype is float32;
-          When npx.is_np_default_dtype() returns True, default dtype is float64.
-
     Examples
     --------
     >>> np.true_divide(x, 4)
@@ -3343,10 +3335,8 @@ def true_divide(x1, x2, out=None):
     ----------
     x1 : ndarray or scalar
         Dividend array.
-
     x2 : ndarray or scalar
         Divisor array.
-
     out : ndarray
         A location into which the result is stored. If provided, it must have a shape
         that the inputs broadcast to. If not provided or None, a freshly-allocated array
@@ -3357,16 +3347,17 @@ def true_divide(x1, x2, out=None):
     out : ndarray or scalar
         This is a scalar if both x1 and x2 are scalars.
 
-    Notes
-    -----
-    This operator now supports automatic type promotion. The resulting type will be determined
-    according to the following rules:
-        * If both inputs are of floating number types, the output is the more precise type.
-        * If only one of the inputs is floating number type, the result is that type.
-        * If both inputs are of integer types (including boolean), the output is of float32 or
-          float64 type, which depends on your current default dtype.
-          When npx.is_np_default_dtype() returns False, default dtype is float32;
-          When npx.is_np_default_dtype() returns True, default dtype is float64.
+    .. note::
+
+       This operator now supports automatic type promotion. The resulting type will be determined
+       according to the following rules:
+
+       * If both inputs are of floating number types, the output is the more precise type.
+       * If only one of the inputs is floating number type, the result is that type.
+       * If both inputs are of integer types (including boolean), the output is of float32 or
+         float64 type, which depends on your current default dtype.
+         When npx.is_np_default_dtype() returns False, default dtype is float32;
+         When npx.is_np_default_dtype() returns True, default dtype is float64.
 
     Examples
     --------
@@ -3444,8 +3435,7 @@ def fmod(x1, x2, out=None, **kwargs):
 @set_module('mxnet.numpy')
 @wrap_np_binary_func
 def matmul(a, b, out=None, **kwargs):
-    """
-    Matrix product of two arrays.
+    r"""Matrix product of two arrays.
 
     Parameters
     ----------
@@ -3470,41 +3460,40 @@ def matmul(a, b, out=None, **kwargs):
 
     See Also
     --------
-    tensordot :
-        Sum products over arbitrary axes.
-    dot :
-        alternative matrix product with different broadcasting rules.
-    einsum :
-        Einstein summation convention.
+    tensordot : Sum products over arbitrary axes.
+    dot : alternative matrix product with different broadcasting rules.
+    einsum : Einstein summation convention.
 
-    Notes
-    -----
-    The behavior depends on the arguments in the following way.
+    .. note::
 
-    - If both arguments are 2-D they are multiplied like conventional matrices.
-    - If either argument is N-D, N > 2, it is treated as a stack of matrices
-      residing in the last two indexes and broadcast accordingly.
-    - If the first argument is 1-D, it is promoted to a matrix by prepending
-      a 1 to its dimensions. After matrix multiplication the prepended 1 is removed.
-    - If the second argument is 1-D, it is promoted to a matrix by appending a 1
-      to its dimensions. After matrix multiplication the appended 1 is removed.
+       The behavior depends on the arguments in the following way.
 
-    matmul differs from dot in two important ways:
+       * If both arguments are ``2-D`` they are multiplied like conventional matrices.
+       * If either argument is ``N-D``, ``N > 2``, it is treated as a stack of matrices
+         residing in the last two indexes and broadcast accordingly.
+       * If the first argument is ``1-D``, it is promoted to a matrix by prepending
+         a 1 to its dimensions. After matrix multiplication the prepended 1 is removed.
+       * If the second argument is ``1-D``, it is promoted to a matrix by appending a 1
+         to its dimensions. After matrix multiplication the appended 1 is removed.
 
-    - Multiplication by scalars is not allowed, use multiply instead.
-    - Stacks of matrices are broadcast together as if the matrices were elements,
-    respecting the signature (n,k),(k,m)->(n,m):
-    >>> a = np.ones([9, 5, 7, 4])
-    >>> c = np.ones([9, 5, 4, 3])
-    >>> np.dot(a, c).shape
-    (9, 5, 7, 9, 5, 3)
-    >>> np.matmul(a, c).shape
-    (9, 5, 7, 3)
-    >>> # n is 7, k is 4, m is 3
+       matmul differs from dot in two important ways:
+
+       * Multiplication by scalars is not allowed, use multiply instead.
+       * Stacks of matrices are broadcast together as if the matrices were elements,
+         respecting the signature ``(n,k),(k,m)->(n,m)``:
+
+       >>> a = np.ones([9, 5, 7, 4])
+       >>> c = np.ones([9, 5, 4, 3])
+       >>> np.dot(a, c).shape
+       (9, 5, 7, 9, 5, 3)
+       >>> np.matmul(a, c).shape
+       (9, 5, 7, 3)
+       >>> # n is 7, k is 4, m is 3
 
     Examples
     --------
     For 2-D arrays it is the matrix product:
+
     >>> a = np.array([[1, 0],
     ...               [0, 1]])
     >>> b = np.array([[4, 1],
@@ -3514,6 +3503,7 @@ def matmul(a, b, out=None, **kwargs):
            [2., 2.]])
 
     For 2-D mixed with 1-D, the result is the usual.
+
     >>> a = np.array([[1, 0],
     ...               [0, 1]])
     >>> b = np.array([1, 2])
@@ -3523,6 +3513,7 @@ def matmul(a, b, out=None, **kwargs):
     array([1., 2.])
 
     Broadcasting is conventional for stacks of arrays
+
     >>> a = np.arange(2 * 2 * 4).reshape((2, 2, 4))
     >>> b = np.arange(2 * 2 * 4).reshape((2, 4, 2))
     >>> np.matmul(a, b).shape
@@ -3533,10 +3524,12 @@ def matmul(a, b, out=None, **kwargs):
     array(98.)
 
     Scalar multiplication raises an error.
+
     >>> np.matmul([1, 2], 3)
     Traceback (most recent call last):
     ...
     mxnet.base.MXNetError: ... : Multiplication by scalars is not allowed.
+
     """
     return _mx_nd_np.matmul(a, b, out=out)
 
@@ -3830,13 +3823,14 @@ def tanh(x, out=None, **kwargs):
     y : ndarray or scalar
        The corresponding hyperbolic tangent values.
 
-    Notes
-    -----
-    If `out` is provided, the function writes the result into it,
-    and returns a reference to `out`.  (See Examples)
-    - input x does not support complex computation (like imaginary number)
-    >>> np.tanh(np.pi*1j)
-    TypeError: type <type 'complex'> not supported
+    .. note::
+       If `out` is provided, the function writes the result into it,
+       and returns a reference to `out`.  (See Examples)
+
+       * input x does not support complex computation (like imaginary number)
+
+       >>> np.tanh(np.pi*1j)
+       TypeError: type <type 'complex'> not supported
 
     Examples
     --------
@@ -4142,22 +4136,22 @@ def arcsin(x, out=None, **kwargs):
     >>> np.arcsin(0)
     0.0
 
-    Notes
-    -----
-    `arcsin` is a multivalued function: for each `x` there are infinitely
-    many numbers `z` such that :math:`sin(z) = x`.  The convention is to
-    return the angle `z` whose real part lies in [-pi/2, pi/2].
-    For real-valued input data types, *arcsin* always returns real output.
-    For each value that cannot be expressed as a real number or infinity,
-    it yields ``nan`` and sets the `invalid` floating point error flag.
-    The inverse sine is also known as `asin` or sin^{-1}.
-    The output `ndarray` has the same `ctx` as the input `ndarray`.
-    This function differs from the original `numpy.arcsin
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.arcsin.html>`_ in
-    the following aspects:
-    - Only support ndarray or scalar now.
-    - `where` argument is not supported.
-    - Complex input is not supported.
+    .. note::
+       `arcsin` is a multivalued function: for each `x` there are infinitely
+       many numbers `z` such that :math:`sin(z) = x`.  The convention is to
+       return the angle `z` whose real part lies in [-pi/2, pi/2].
+       For real-valued input data types, *arcsin* always returns real output.
+       For each value that cannot be expressed as a real number or infinity,
+       it yields ``nan`` and sets the `invalid` floating point error flag.
+       The inverse sine is also known as `asin` or sin^{-1}.
+       The output `ndarray` has the same `ctx` as the input `ndarray`.
+       This function differs from the original `numpy.arcsin
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.arcsin.html>`_ in
+       the following aspects:
+
+       * Only support ndarray or scalar now.
+       * `where` argument is not supported.
+       * Complex input is not supported.
 
     References
     ----------
@@ -4274,13 +4268,14 @@ def sign(x, out=None, **kwargs):
         The sign of `x`.
         This is a scalar if `x` is a scalar.
 
-    Note
-    -------
-    - Only supports real number as input elements.
-    - Input type does not support Python native iterables(list, tuple, ...).
-    - ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be the same as the expected output.
-    - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output.
-    - ``out`` param does not support scalar input case.
+    .. note::
+       * Only supports real number as input elements.
+       * Input type does not support Python native iterables(list, tuple, ...).
+       * ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be
+         the same as the expected output.
+       * ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the
+         same as the expected output.
+       * ``out`` param does not support scalar input case.
 
     Examples
     --------
@@ -4326,18 +4321,20 @@ def log(x, out=None, **kwargs):
         The natural logarithm of `x`, element-wise.
         This is a scalar if `x` is a scalar.
 
-    Notes
-    -----
-    Currently only supports data of real values and ``inf`` as input. Returns data of real value, ``inf``, ``-inf`` and
-    ``nan`` according to the input.
-    This function differs from the original `numpy.log
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.log.html>`_ in
-    the following aspects:
-    - Does not support complex number for now
-    - Input type does not support Python native iterables(list, tuple, ...).
-    - ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be the same as the expected output.
-    - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output.
-    - ``out`` param does not support scalar input case.
+    .. note::
+       Currently only supports data of real values and ``inf`` as input. Returns data of
+       real value, ``inf``, ``-inf`` and ``nan`` according to the input.
+       This function differs from the original `numpy.log
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.log.html>`_ in
+       the following aspects:
+
+       * Does not support complex number for now
+       * Input type does not support Python native iterables(list, tuple, ...).
+       * ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be
+         the same as the expected output.
+       * ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the
+         same as the expected output.
+       * ``out`` param does not support scalar input case.
 
     Examples
     --------
@@ -4374,14 +4371,14 @@ def rint(x, out=None, **kwargs):
     out : ndarray or scalar
         Output array is same shape and type as x. This is a scalar if x is a scalar.
 
-    Notes
-    -----
-    This function differs from the original `numpy.rint
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.rint.html>`_ in
-    the following way(s):
-    - only ndarray or scalar is accpted as valid input, tuple of ndarray is not supported
-    - broadcasting to `out` of different shape is currently not supported
-    - when input is plain python numerics, the result will not be stored in the `out` param
+    .. note::
+       This function differs from the original `numpy.rint
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.rint.html>`_ in
+       the following way(s):
+
+       * only ndarray or scalar is accepted as valid input, tuple of ndarray is not supported
+       * broadcasting to `out` of different shape is currently not supported
+       * when input is plain python numerics, the result will not be stored in the `out` param
 
     Examples
     --------
@@ -4413,14 +4410,14 @@ def log2(x, out=None, **kwargs):
         The logarithm base two of `x`, element-wise.
         This is a scalar if `x` is a scalar.
 
-    Notes
-    -----
-    This function differs from the original `numpy.log2
-    <https://www.google.com/search?q=numpy+log2>`_ in
-    the following way(s):
-    - only ndarray or scalar is accpted as valid input, tuple of ndarray is not supported
-    - broadcasting to `out` of different shape is currently not supported
-    - when input is plain python numerics, the result will not be stored in the `out` param
+    .. note::
+       This function differs from the original `numpy.log2
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.log2.html>`_ in
+       the following way(s):
+
+       * only ndarray or scalar is accepted as valid input, tuple of ndarray is not supported
+       * broadcasting to `out` of different shape is currently not supported
+       * when input is plain python numerics, the result will not be stored in the `out` param
 
     Examples
     --------
@@ -4499,15 +4496,18 @@ def degrees(x, out=None, **kwargs):
         reference to it.
         This is a scalar if `x` is a scalar.
 
-    Notes
-    -------
-    This function differs from the original `numpy.degrees
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.degrees.html>`_ in
-    the following aspects:
-    - Input type does not support Python native iterables(list, tuple, ...). Only ndarray is supported.
-    - ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be the same as the expected output.
-    - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output.
-    - ``out`` param does not support scalar input case.
+    .. note::
+       This function differs from the original `numpy.degrees
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.degrees.html>`_ in
+       the following aspects:
+
+       * Input type does not support Python native iterables(list, tuple, ...).
+         Only ndarray is supported.
+       * ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be
+         the same as the expected output.
+       * ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the
+         same as the expected output.
+       * ``out`` param does not support scalar input case.
 
     Examples
     --------
@@ -4527,8 +4527,8 @@ def degrees(x, out=None, **kwargs):
 @set_module('mxnet.numpy')
 @wrap_np_unary_func
 def rad2deg(x, out=None, **kwargs):
-    r"""
-    Convert angles from radians to degrees.
+    r"""Convert angles from radians to degrees.
+
     Parameters
     ----------
     x : ndarray or scalar
@@ -4543,13 +4543,14 @@ def rad2deg(x, out=None, **kwargs):
         The corresponding angle in radians.
         This is a scalar if `x` is a scalar.
 
-    Notes
-    -----
-    "rad2deg(x)" is "x * 180 / pi".
+    .. note::
+
+       "rad2deg(x)" is "x * 180 / pi".
+
+       This function differs from the original numpy.rad2deg in the following aspects:
 
-    This function differs from the original numpy.arange in the following aspects:
-        - Only support float32 and float64.
-        - `out` must be in the same size of input.
+       * Only support float32 and float64.
+       * `out` must be in the same size of input.
 
     Examples
     --------
@@ -4579,14 +4580,14 @@ def radians(x, out=None, **kwargs):
     y : ndarray
         The corresponding radian values. This is a scalar if x is a scalar.
 
-    Notes
-    -----
-    This function differs from the original `numpy.radians
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.radians.html>`_ in
-    the following way(s):
-    - only ndarray or scalar is accpted as valid input, tuple of ndarray is not supported
-    - broadcasting to `out` of different shape is currently not supported
-    - when input is plain python numerics, the result will not be stored in the `out` param
+    .. note::
+       This function differs from the original `numpy.radians
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.radians.html>`_ in
+       the following way(s):
+
+       * only ndarray or scalar is accepted as valid input, tuple of ndarray is not supported
+       * broadcasting to `out` of different shape is currently not supported
+       * when input is plain python numerics, the result will not be stored in the `out` param
 
     Examples
     --------
@@ -4619,13 +4620,13 @@ def deg2rad(x, out=None, **kwargs):
         The corresponding angle in radians.
         This is a scalar if `x` is a scalar.
 
-    Notes
-    -----
-    "deg2rad(x)" is "x * pi / 180".
+    .. note::
+       "deg2rad(x)" is "x * pi / 180".
+
+       This function differs from the original numpy.deg2rad in the following aspects:
 
-    This function differs from the original numpy.arange in the following aspects:
-        - Only support float32 and float64.
-        - `out` must be in the same size of input.
+       * Only support float32 and float64.
+       * `out` must be in the same size of input.
 
     Examples
     --------
@@ -4638,8 +4639,7 @@ def deg2rad(x, out=None, **kwargs):
 @set_module('mxnet.numpy')
 @wrap_np_unary_func
 def reciprocal(x, out=None, **kwargs):
-    r"""
-    Return the reciprocal of the argument, element-wise.
+    r"""Return the reciprocal of the argument, element-wise.
     Calculates ``1/x``.
 
     Parameters
@@ -4664,19 +4664,20 @@ def reciprocal(x, out=None, **kwargs):
     >>> np.reciprocal(x)
     array([1.       , 0.5      , 0.3003003])
 
-    Notes
-    -----
     .. note::
-        This function is not designed to work with integers.
-    For integer arguments with absolute value larger than 1 the result is
-    always zero because of the way Python handles integer division.  For
-    integer zero the result is an overflow.
-    The output `ndarray` has the same `ctx` as the input `ndarray`.
-    This function differs from the original `numpy.reciprocal
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.reciprocal.html>`_ in
-    the following aspects:
-    - Only support ndarray and scalar now.
-    - `where` argument is not supported.
+
+       This function is not designed to work with integers.
+       For integer arguments with absolute value larger than 1 the result is
+       always zero because of the way Python handles integer division.  For
+       integer zero the result is an overflow.
+       The output `ndarray` has the same `ctx` as the input `ndarray`.
+       This function differs from the original `numpy.reciprocal
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.reciprocal.html>`_ in
+       the following aspects:
+
+       * Only support ndarray and scalar now.
+       * `where` argument is not supported.
+
     """
     return _mx_nd_np.reciprocal(x, out=out, **kwargs)
 
@@ -4709,15 +4710,16 @@ def square(x, out=None, **kwargs):
     >>> np.square(x)
     array([1., 4., 1.])
 
-    Notes
-    -----
-    The output `ndarray` has the same `ctx` as the input `ndarray`.
-    This function differs from the original `numpy.square
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.square.html>`_ in
-    the following aspects:
-    - Only support ndarray and scalar now.
-    - `where` argument is not supported.
-    - Complex input is not supported.
+    .. note::
+       The output `ndarray` has the same `ctx` as the input `ndarray`.
+       This function differs from the original `numpy.square
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.square.html>`_ in
+       the following aspects:
+
+       * Only support ndarray and scalar now.
+       * `where` argument is not supported.
+       * Complex input is not supported.
+
     """
     return _mx_nd_np.square(x, out=out, **kwargs)
 
@@ -4728,8 +4730,8 @@ def negative(x, out=None, **kwargs):
     r"""
     Numerical negative, element-wise.
 
-    Parameters:
-    ------------
+    Parameters
+    ----------
     x : ndarray or scalar
         Input array.
     out : ndarray, None, or tuple of ndarray and None, optional
@@ -4739,7 +4741,7 @@ def negative(x, out=None, **kwargs):
           A tuple (possible only as a keyword argument) must have length
           equal to the number of outputs.
 
-    Returns:
+    Returns
     -------
     y : ndarray or scalar
         Returned array or scalar: y = -x. This is a scalar if x is a scalar.
@@ -4759,14 +4761,14 @@ def fix(x, out=None, **kwargs):
     Round an array of floats element-wise to nearest integer towards zero.
     The rounded values are returned as floats.
 
-    Parameters:
+    Parameters
     ----------
     x : ndarray
         An array of floats to be rounded
     out : ndarray, optional
         Output array
 
-    Returns:
+    Returns
     -------
     y : ndarray or scalar
     Returned array or scalar: y = -x. This is a scalar if x is a scalar.ndarray of floats
@@ -4786,7 +4788,7 @@ def tan(x, out=None, **kwargs):
     Compute tangent element-wise.
     Equivalent to np.sin(x)/np.cos(x) element-wise.
 
-    Parameters:
+    Parameters
     ----------
     x : ndarray
         Input array.
@@ -4796,7 +4798,7 @@ def tan(x, out=None, **kwargs):
           a freshly-allocated array is returned. A tuple (possible only as a keyword argument)
           must have length equal to the number of outputs.
 
-    Returns:
+    Returns
     -------
     y : ndarray
     The corresponding tangent values. This is a scalar if x is a scalar.
@@ -5013,13 +5015,14 @@ def trunc(x, out=None, **kwargs):
     y : ndarray or scalar
         The truncated value of each element in `x`.
         This is a scalar if `x` is a scalar.
-    Notes
-    -----
-    This function differs from the original numpy.trunc in the following aspects:
-        - Do not support `where`, a parameter in numpy which indicates where to calculate.
-        - Cannot cast type automatically. Dtype of `out` must be same as the expected one.
-        - Cannot broadcast automatically. Shape of `out` must be same as the expected one.
-        - If `x` is plain python numeric, the result won't be stored in out.
+
+    .. note::
+       This function differs from the original numpy.trunc in the following aspects:
+
+       * Do not support `where`, a parameter in numpy which indicates where to calculate.
+       * Cannot cast type automatically. Dtype of `out` must be same as the expected one.
+       * Cannot broadcast automatically. Shape of `out` must be same as the expected one.
+       * If `x` is plain python numeric, the result won't be stored in out.
 
     Examples
     --------
@@ -5050,13 +5053,13 @@ def logical_not(x, out=None, **kwargs):
         on elements of `x`.
         This is a scalar if `x` is a scalar.
 
-    Notes
-    -----
-    This function differs from the original numpy.logical_not in the following aspects:
-        - Do not support `where`, a parameter in numpy which indicates where to calculate.
-        - Cannot cast type automatically. Dtype of `out` must be same as the expected one.
-        - Cannot broadcast automatically. Shape of `out` must be same as the expected one.
-        - If `x` is plain python numeric, the result won't be stored in out.
+    .. note::
+       This function differs from the original numpy.logical_not in the following aspects:
+
+       * Do not support `where`, a parameter in numpy which indicates where to calculate.
+       * Cannot cast type automatically. Dtype of `out` must be same as the expected one.
+       * Cannot broadcast automatically. Shape of `out` must be same as the expected one.
+       * If `x` is plain python numeric, the result won't be stored in out.
 
     Examples
     --------
@@ -5090,21 +5092,21 @@ def arcsinh(x, out=None, **kwargs):
         Array of the same shape as `x`.
         This is a scalar if `x` is a scalar.
 
-    Notes
-    -----
-    `arcsinh` is a multivalued function: for each `x` there are infinitely
-    many numbers `z` such that `sinh(z) = x`.
+    .. note::
+       `arcsinh` is a multivalued function: for each `x` there are infinitely
+       many numbers `z` such that `sinh(z) = x`.
+
+       For real-valued input data types, `arcsinh` always returns real output.
+       For each value that cannot be expressed as a real number or infinity, it
+       yields ``nan`` and sets the `invalid` floating point error flag.
 
-    For real-valued input data types, `arcsinh` always returns real output.
-    For each value that cannot be expressed as a real number or infinity, it
-    yields ``nan`` and sets the `invalid` floating point error flag.
+       This function differs from the original numpy.arcsinh in the following aspects:
 
-    This function differs from the original numpy.arcsinh in the following aspects:
-        - Do not support `where`, a parameter in numpy which indicates where to calculate.
-        - Do not support complex-valued input.
-        - Cannot cast type automatically. DType of `out` must be same as the expected one.
-        - Cannot broadcast automatically. Shape of `out` must be same as the expected one.
-        - If `x` is plain python numeric, the result won't be stored in out.
+       * Do not support `where`, a parameter in numpy which indicates where to calculate.
+       * Do not support complex-valued input.
+       * Cannot cast type automatically. DType of `out` must be same as the expected one.
+       * Cannot broadcast automatically. Shape of `out` must be same as the expected one.
+       * If `x` is plain python numeric, the result won't be stored in out.
 
     Examples
     --------
@@ -5137,21 +5139,21 @@ def arccosh(x, out=None, **kwargs):
         Array of the same shape as `x`.
         This is a scalar if `x` is a scalar.
 
-    Notes
-    -----
-    `arccosh` is a multivalued function: for each `x` there are infinitely
-    many numbers `z` such that `cosh(z) = x`.
+    .. note::
+       `arccosh` is a multivalued function: for each `x` there are infinitely
+       many numbers `z` such that `cosh(z) = x`.
+
+       For real-valued input data types, `arccosh` always returns real output.
+       For each value that cannot be expressed as a real number or infinity, it
+       yields ``nan`` and sets the `invalid` floating point error flag.
 
-    For real-valued input data types, `arccosh` always returns real output.
-    For each value that cannot be expressed as a real number or infinity, it
-    yields ``nan`` and sets the `invalid` floating point error flag.
+       This function differs from the original numpy.arccosh in the following aspects:
 
-    This function differs from the original numpy.arccosh in the following aspects:
-        - Do not support `where`, a parameter in numpy which indicates where to calculate.
-        - Do not support complex-valued input.
-        - Cannot cast type automatically. Dtype of `out` must be same as the expected one.
-        - Cannot broadcast automatically. Shape of `out` must be same as the expected one.
-        - If `x` is plain python numeric, the result won't be stored in out.
+       * Do not support `where`, a parameter in numpy which indicates where to calculate.
+       * Do not support complex-valued input.
+       * Cannot cast type automatically. Dtype of `out` must be same as the expected one.
+       * Cannot broadcast automatically. Shape of `out` must be same as the expected one.
+       * If `x` is plain python numeric, the result won't be stored in out.
 
     Examples
     --------
@@ -5184,21 +5186,21 @@ def arctanh(x, out=None, **kwargs):
         Array of the same shape as `x`.
         This is a scalar if `x` is a scalar.
 
-    Notes
-    -----
-    `arctanh` is a multivalued function: for each `x` there are infinitely
-    many numbers `z` such that `tanh(z) = x`.
+    .. note::
+       `arctanh` is a multivalued function: for each `x` there are infinitely
+       many numbers `z` such that `tanh(z) = x`.
 
-    For real-valued input data types, `arctanh` always returns real output.
-    For each value that cannot be expressed as a real number or infinity, it
-    yields ``nan`` and sets the `invalid` floating point error flag.
+       For real-valued input data types, `arctanh` always returns real output.
+       For each value that cannot be expressed as a real number or infinity, it
+       yields ``nan`` and sets the `invalid` floating point error flag.
 
-    This function differs from the original numpy.arctanh in the following aspects:
-        - Do not support `where`, a parameter in numpy which indicates where to calculate.
-        - Do not support complex-valued input.
-        - Cannot cast type automatically. Dtype of `out` must be same as the expected one.
-        - Cannot broadcast automatically. Shape of `out` must be same as the expected one.
-        - If `x` is plain python numeric, the result won't be stored in out.
+       This function differs from the original numpy.arctanh in the following aspects:
+
+       * Do not support `where`, a parameter in numpy which indicates where to calculate.
+       * Do not support complex-valued input.
+       * Cannot cast type automatically. Dtype of `out` must be same as the expected one.
+       * Cannot broadcast automatically. Shape of `out` must be same as the expected one.
+       * If `x` is plain python numeric, the result won't be stored in out.
 
     Examples
     --------
@@ -5329,16 +5331,14 @@ def sort(a, axis=-1, kind=None, order=None):
 
 @set_module('mxnet.numpy')
 def tensordot(a, b, axes=2):
-    r"""
-    tensordot(a, b, axes=2)
-    Compute tensor dot product along specified axes for arrays >= 1-D.
+    r"""Compute tensor dot product along specified axes for arrays >= 1-D.
     Given two tensors (arrays of dimension greater than or equal to one),
-    `a` and `b`, and an ndarray object containing two ndarray
-    objects, ``(a_axes, b_axes)``, sum the products of `a`'s and `b`'s
+    ``a`` and ``b``, and an ndarray object containing two ndarray
+    objects, ``(a_axes, b_axes)``, sum the products of ``a``'s and ``b``'s
     elements (components) over the axes specified by ``a_axes`` and
     ``b_axes``. The third argument can be a single non-negative
     integer_like scalar, ``N``; if it is such, then the last ``N``
-    dimensions of `a` and the first ``N`` dimensions of `b` are summed
+    dimensions of ``a`` and the first ``N`` dimensions of ``b`` are summed
     over.
 
     Parameters
@@ -5346,30 +5346,32 @@ def tensordot(a, b, axes=2):
     a, b : ndarray, len(shape) >= 1
         Tensors to "dot".
     axes : int or (2,) ndarray
+
         * integer_like
-        If an int N, sum over the last N axes of `a` and the first N axes
-        of `b` in order. The sizes of the corresponding axes must match.
+          If an int N, sum over the last N axes of `a` and the first N axes
+          of `b` in order. The sizes of the corresponding axes must match.
         * (2,) ndarray
-        Or, a list of axes to be summed over, first sequence applying to `a`,
-        second to `b`. Both elements ndarray must be of the same length.
+          Or, a list of axes to be summed over, first sequence applying to `a`,
+          second to `b`. Both elements ndarray must be of the same length.
 
     See Also
     --------
     dot, einsum
 
-    Notes
-    -----
-    Three common use cases are:
-        * ``axes = 0`` : tensor product :math:`a\otimes b`
-        * ``axes = 1`` : tensor dot product :math:`a\cdot b`
-        * ``axes = 2`` : (default) tensor double contraction :math:`a:b`
-    When `axes` is integer_like, the sequence for evaluation will be: first
-    the -Nth axis in `a` and 0th axis in `b`, and the -1th axis in `a` and
-    Nth axis in `b` last.
-    When there is more than one axis to sum over - and they are not the last
-    (first) axes of `a` (`b`) - the argument `axes` should consist of
-    two sequences of the same length, with the first axis to sum over given
-    first in both sequences, the second axis second, and so forth.
+    .. note::
+       Three common use cases are:
+
+       * ``axes = 0`` : tensor product :math:`a\otimes b`
+       * ``axes = 1`` : tensor dot product :math:`a\cdot b`
+       * ``axes = 2`` : (default) tensor double contraction :math:`a:b`
+
+       When `axes` is integer_like, the sequence for evaluation will be: first
+       the -Nth axis in `a` and 0th axis in `b`, and the -1th axis in `a` and
+       Nth axis in `b` last.
+       When there is more than one axis to sum over - and they are not the last
+       (first) axes of `a` (`b`) - the argument `axes` should consist of
+       two sequences of the same length, with the first axis to sum over given
+       first in both sequences, the second axis second, and so forth.
 
     Examples
     --------
@@ -5538,17 +5540,16 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis
     (-0.5, 1)
     >>> plt.show()
 
-    Notes
-    -----
+    .. note::
 
-    This function differs from the original `numpy.linspace
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.linspace.html>`_ in
-    the following aspects:
+       This function differs from the original `numpy.linspace
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.linspace.html>`_ in
+       the following aspects:
 
-    - `start` and `stop` do not support list, numpy ndarray and mxnet ndarray
-    - axis could only be 0
-    - There could be an additional `ctx` argument to specify the device, e.g. the i-th
-      GPU.
+       * `start` and `stop` do not support list, numpy ndarray and mxnet ndarray
+       * axis could only be 0
+       * There could be an additional `ctx` argument to specify the device, e.g. the i-th
+         GPU.
     """
     return _mx_nd_np.linspace(start, stop, num, endpoint, retstep, dtype, axis, ctx)
 # pylint: enable=redefined-outer-name
@@ -5818,14 +5819,14 @@ def transpose(a, axes=None):
     p : ndarray
         a with its axes permuted.
 
-    Notes
-    -----
-    This function differs from the original `numpy.transpose
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.transpose.html>`_ in
-    the following way(s):
+    .. note::
+
+       This function differs from the original `numpy.transpose
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.transpose.html>`_ in
+       the following way(s):
 
-    - only ndarray is accepted as valid input, python iterables are not supported
-    - the operator always returns an `ndarray` that does not share the memory with the input
+       * only ndarray is accepted as valid input, python iterables are not supported
+       * the operator always returns an `ndarray` that does not share the memory with the input
 
     Examples
     --------
@@ -6173,8 +6174,8 @@ def arange(start, stop=None, step=1, dtype=None, ctx=None):
     dtype : dtype
         The type of the output array.
         Default dtype can be set to be consistent with offical numpy by `npx.set_np(dtype=True)`.
-        - When npx.is_np_default_dtype() returns False, default dtype is float32;
-        - When npx.is_np_default_dtype() returns True, default dtype is int64.
+        * When npx.is_np_default_dtype() returns False, default dtype is float32;
+        * When npx.is_np_default_dtype() returns True, default dtype is int64.
 
     Returns
     -------
@@ -6225,9 +6226,11 @@ def split(ary, indices_or_sections, axis=0):
         If `indices_or_sections` is a 1-D array of sorted integers, the entries
         indicate where along `axis` the array is split.  For example,
         ``[2, 3]`` would, for ``axis=0``, result in
-          - ary[:2]
-          - ary[2:3]
-          - ary[3:]
+
+        * ary[:2]
+        * ary[2:3]
+        * ary[3:]
+
         If an index exceeds the dimension of the array along `axis`,
         an empty sub-array is returned correspondingly.
     axis : int, optional
@@ -6277,11 +6280,12 @@ def array_split(ary, indices_or_sections, axis=0):
     l % n sub-arrays of size l//n + 1 and the rest of size l//n.
 
     If `indices_or_sections` is a 1-D array of sorted integers, the entries
-        indicate where along `axis` the array is split.  For example,
-        ``[2, 3]`` would, for ``axis=0``, result in
-          - ary[:2]
-          - ary[2:3]
-          - ary[3:]
+    indicate where along `axis` the array is split.  For example, ``[2, 3]`` would, for ``axis=0``, result in
+
+    * ary[:2]
+    * ary[2:3]
+    * ary[3:]
+
     If an index exceeds the dimension of the array along `axis`,
     an empty sub-array is returned correspondingly.
 
@@ -6321,10 +6325,7 @@ def array_split(ary, indices_or_sections, axis=0):
 
 @set_module('mxnet.numpy')
 def vsplit(ary, indices_or_sections):
-    r"""
-    vsplit(ary, indices_or_sections)
-
-    Split an array into multiple sub-arrays vertically (row-wise).
+    r"""Split an array into multiple sub-arrays vertically (row-wise).
 
     ``vsplit`` is equivalent to ``split`` with `axis=0` (default): the array is always split
     along the first axis regardless of the array dimension.
@@ -6340,9 +6341,9 @@ def vsplit(ary, indices_or_sections):
         If `indices_or_sections` is a 1-D array of sorted integers, the entries indicate where
         along axis 0 the array is split.  For example, ``[2, 3]`` would result in
 
-          - ary[:2]
-          - ary[2:3]
-          - ary[3:]
+        * ary[:2]
+        * ary[2:3]
+        * ary[3:]
 
         If an index exceeds the dimension of the array along axis 0, an error will be thrown.
 
@@ -6355,16 +6356,16 @@ def vsplit(ary, indices_or_sections):
     --------
     split : Split an array into multiple sub-arrays of equal size.
 
-    Notes
-    -------
-    This function differs from the original `numpy.vsplit
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.vsplit.html>`_ in
-    the following aspects:
+    .. note::
+       This function differs from the original `numpy.vsplit
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.vsplit.html>`_ in
+       the following aspects:
+
+       * Currently parameter ``indices_or_sections`` does not support ndarray, but supports scalar,
+         tuple and list.
+       * In ``indices_or_sections``, if an index exceeds the dimension of the array along axis 0,
+         an error will be thrown.
 
-    - Currently parameter ``indices_or_sections`` does not support ndarray, but supports scalar,
-    tuple and list.
-    - In ``indices_or_sections``, if an index exceeds the dimension of the array along axis 0,
-    an error will be thrown.
 
     Examples
     --------
@@ -6414,9 +6415,9 @@ def dsplit(ary, indices_or_sections):
         If `indices_or_sections` is a 1-D array of sorted integers, the entries indicate where
         along axis 2 the array is split.  For example, ``[2, 3]`` would result in
 
-          - ary[:, :, :2]
-          - ary[:, :, 2:3]
-          - ary[:, :, 3:]
+        * ary[:, :, :2]
+        * ary[:, :, 2:3]
+        * ary[:, :, 3:]
 
         If an index exceeds the dimension of the array along axis 2, an error will be thrown.
 
@@ -6429,16 +6430,14 @@ def dsplit(ary, indices_or_sections):
     --------
     split : Split an array into multiple sub-arrays of equal size.
 
-    Notes
-    -------
-    This function differs from the original `numpy.dsplit
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.dsplit.html>`_ in
-    the following aspects:
-
-    - Currently parameter ``indices_or_sections`` does not support ndarray, but supports scalar,
-    tuple and list.
-    - In ``indices_or_sections``, if an index exceeds the dimension of the array along axis 2,
-    an error will be thrown.
+    .. note::
+       This function differs from the original `numpy.dsplit
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.dsplit.html>`_ in
+       the following aspects:
+
+       * Currently parameter ``indices_or_sections`` does not support ndarray, but supports scalar,
+         tuple and list.
+       * In ``indices_or_sections``, if an index exceeds the dimension of the array along axis 2,
+         an error will be thrown.
 
     Examples
     --------
@@ -7183,19 +7182,20 @@ def argmax(a, axis=None, out=None):
         Array of indices into the array. It has the same shape as `a.shape`
         with the dimension along `axis` removed.
 
-    Notes
-    -----
-    In case of multiple occurrences of the maximum values, the indices
-    corresponding to the first occurrence are returned.
+    .. note::
+       In case of multiple occurrences of the maximum values, the indices
+       corresponding to the first occurrence are returned.
 
-    This function differs from the original `numpy.argmax
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html>`_ in
-    the following aspects:
+       This function differs from the original `numpy.argmax
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html>`_ in
+       the following aspects:
 
-    - Input type does not support Python native iterables(list, tuple, ...).
-    - ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be the same as the expected output.
-    - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output.
-    - ``out`` param does not support scalar input case.
+       * Input type does not support Python native iterables(list, tuple, ...).
+       * ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be
+         the same as the expected output.
+       * ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the
+         same as the expected output.
+       * ``out`` param does not support scalar input case.
 
     Examples
     --------
@@ -7251,19 +7251,20 @@ def argmin(a, axis=None, out=None):
         Array of indices into the array. It has the same shape as `a.shape`
         with the dimension along `axis` removed.
 
-    Notes
-    -----
-    In case of multiple occurrences of the minimum values, the indices
-    corresponding to the first occurrence are returned.
+    .. note::
+       In case of multiple occurrences of the minimum values, the indices
+       corresponding to the first occurrence are returned.
 
-    This function differs from the original `numpy.argmin
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.argmin.html>`_ in
-    the following aspects:
+       This function differs from the original `numpy.argmin
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.argmin.html>`_ in
+       the following aspects:
 
-    - Input type does not support Python native iterables(list, tuple, ...).
-    - ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be the same as the expected output.
-    - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output.
-    - ``out`` param does not support scalar input case.
+       * Input type does not support Python native iterables(list, tuple, ...).
+       * ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be
+         the same as the expected output.
+       * ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the
+         same as the expected output.
+       * ``out`` param does not support scalar input case.
 
     Examples
     --------
@@ -7473,27 +7474,27 @@ def average(a, axis=None, weights=None, returned=False, out=None):
     Raises
     --------
         MXNetError
-        - When all weights along axis sum to zero.
-        - When the length of 1D weights is not the same as the shape of a along axis.
-        - When given 1D weights, the axis is not specified or is not int.
-        - When the shape of weights and a differ, but weights are not 1D.
+        * When all weights along axis sum to zero.
+        * When the length of 1D weights is not the same as the shape of a along axis.
+        * When given 1D weights, the axis is not specified or is not int.
+        * When the shape of weights and a differ, but weights are not 1D.
 
     See also
     --------
         mean
 
-    Notes
-    --------
-    This function differs from the original `numpy.average`
-    <https://numpy.org/devdocs/reference/generated/numpy.average.html>`_ in
-    the following way(s):
+    .. note::
+       This function differs from the original `numpy.average
+       <https://numpy.org/devdocs/reference/generated/numpy.average.html>`_ in
+       the following way(s):
+
+       * Does not guarantee the same behavior with numpy when given float16 dtype and overflow happens
+       * Does not support complex dtype
+       * The dtypes of a and weights must be the same
+       * Integral a results in float32 or float64 returned dtype:
 
-    - Does not guarantee the same behavior with numpy when given float16 dtype and overflow happens
-    - Does not support complex dtype
-    - The dtypes of a and weights must be the same
-    - Integral a results in float32 or float64 returned dtype:
-      When npx.is_np_default_dtype() returns False, default dtype is float32,
-      When npx.is_np_default_dtype() returns True, default dtype is float64;
+         * When npx.is_np_default_dtype() returns False, default dtype is float32,
+         * When npx.is_np_default_dtype() returns True, default dtype is float64;
 
     Examples
     --------
@@ -7556,13 +7557,14 @@ def mean(a, axis=None, dtype=None, out=None, keepdims=False):  # pylint: disable
         If out=None, returns a new array containing the mean values,
         otherwise a reference to the output array is returned.
 
-    Notes
-    -----
-    This function differs from the original `numpy.mean
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.mean.html>`_ in
-    the following way(s):
-    - only ndarray is accepted as valid input, python iterables or scalar is not supported
-    - default data type for integer input is float32 or float64, which depends on your current default dtype
+    .. note::
+
+       This function differs from the original `numpy.mean
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.mean.html>`_ in
+       the following way(s):
+
+       * only ndarray is accepted as valid input, python iterables or scalar is not supported
+       * default data type for integer input is float32 or float64, which depends on your current default dtype
 
     Examples
     --------
@@ -7860,13 +7862,12 @@ def copysign(x1, x2, out=None, **kwargs):
         The values of `x1` with the sign of `x2`.
         This is a scalar if both `x1` and `x2` are scalars.
 
-    Notes
-    -------
-    This function differs from the original `numpy.copysign
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.copysign.html>`_ in
-    the following aspects:
+    .. note::
+       This function differs from the original `numpy.copysign
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.copysign.html>`_ in
+       the following aspects:
 
-    - ``where`` param is not supported.
+       * ``where`` param is not supported.
 
     Examples
     --------
@@ -7910,10 +7911,10 @@ def ravel(x, order='C'):
         Note that matrices are special cased for backward compatibility, if `x`
         is a matrix, then y is a 1-D ndarray.
 
-    Notes
-    -----
-    This function differs from the original numpy.arange in the following aspects:
-        - Only support row-major, C-style order.
+    .. note::
+       This function differs from the original numpy.ravel in the following aspects:
+
+       * Only support row-major, C-style order.
 
     Examples
     --------
@@ -7937,8 +7938,8 @@ def unravel_index(indices, shape, order='C'): # pylint: disable=redefined-outer-
     """
     Converts a flat index or array of flat indices into a tuple of coordinate arrays.
 
-    Parameters:
-    -------------
+    Parameters
+    ----------
     indices : array_like
             An integer array whose elements are indices into the flattened version of an array of dimensions shape.
             Before version 1.6.0, this function accepted just one index value.
@@ -7946,8 +7947,8 @@ def unravel_index(indices, shape, order='C'): # pylint: disable=redefined-outer-
             The shape of the array to use for unraveling indices.
     order : Only row-major is supported currently.
 
-    Returns:
-    -------------
+    Returns
+    -------
     unraveled_coords : ndarray
             Each row in the ndarray has the same shape as the indices array.
             Each column in the ndarray represents the unravelled index
@@ -8011,8 +8012,8 @@ def diag_indices_from(arr):
     the usual diagonal, for a.ndim > 2 this is the set of indices to access
     a[i, i, ..., i] for i = [0..n-1].
 
-    Parameters:
-    -------------
+    Parameters
+    ----------
     arr : ndarray
         Input array for accessing the main diagonal. All dimensions
         should have equal length.
@@ -8469,16 +8470,15 @@ def around(x, decimals=0, out=None, **kwargs):
         An array of the same type as `x`, containing the rounded values.
         A reference to the result is returned.
 
-    Notes
-    -----
-    For values exactly halfway between rounded decimal values, NumPy
-    rounds to the nearest even value. Thus 1.5 and 2.5 round to 2.0,
-    -0.5 and 0.5 round to 0.0, etc.
+    .. note::
+       For values exactly halfway between rounded decimal values, NumPy
+       rounds to the nearest even value. Thus 1.5 and 2.5 round to 2.0,
+       -0.5 and 0.5 round to 0.0, etc.
 
-    This function differs from the original numpy.prod in the following aspects:
+       This function differs from the original numpy.around in the following aspects:
 
-        - Cannot cast type automatically. Dtype of `out` must be same as the expected one.
-        - Cannot support complex-valued number.
+       * Cannot cast type automatically. Dtype of `out` must be same as the expected one.
+       * Cannot support complex-valued number.
 
     Examples
     --------
@@ -8558,28 +8558,33 @@ def arctan2(x1, x2, out=None, **kwargs):
         Array of angles in radians, in the range ``[-pi, pi]``. This is a scalar if
         `x1` and `x2` are scalars.
 
-    Notes
-    -----
-    *arctan2* is identical to the `atan2` function of the underlying
-    C library.  The following special values are defined in the C
-    standard: [1]_
-
-    ====== ====== ================
-    `x1`   `x2`   `arctan2(x1,x2)`
-    ====== ====== ================
-    +/- 0  +0     +/- 0
-    +/- 0  -0     +/- pi
-        > 0   +/-inf +0 / +pi
-        < 0   +/-inf -0 / -pi
-    +/-inf +inf   +/- (pi/4)
-    +/-inf -inf   +/- (3*pi/4)
-    ====== ====== ================
-
-    Note that +0 and -0 are distinct floating point numbers, as are +inf
-    and -inf.
-
-    This function differs from the original numpy.arange in the following aspects:
-        - Only support float16, float32 and float64.
+    .. note::
+       *arctan2* is identical to the ``atan2`` function of the underlying
+       C library.  The following special values are defined in the C
+       standard: [1]_
+
+       +--------+--------+------------------+
+       | `x1`   | `x2`   | `arctan2(x1,x2)` |
+       +========+========+==================+
+       | +/- 0  | +0     | +/- 0            |
+       +--------+--------+------------------+
+       | +/- 0  | -0     | +/- pi           |
+       +--------+--------+------------------+
+       | > 0    | +/-inf | +0 / +pi         |
+       +--------+--------+------------------+
+       | < 0    | +/-inf | -0 / -pi         |
+       +--------+--------+------------------+
+       | +/-inf | +inf   | +/- (pi/4)       |
+       +--------+--------+------------------+
+       | +/-inf | -inf   | +/- (3*pi/4)     |
+       +--------+--------+------------------+
+
+       Note that +0 and -0 are distinct floating point numbers, as are +inf
+       and -inf.
+
+       This function differs from the original numpy.arctan2 in the following aspects:
+
+       * Only support float16, float32 and float64.
 
     References
     ----------
@@ -8632,10 +8637,10 @@ def hypot(x1, x2, out=None, **kwargs):
         The hypotenuse of the triangle(s).
         This is a scalar if both `x1` and `x2` are scalars.
 
-    Notes
-    -----
-    This function differs from the original numpy.arange in the following aspects:
-        - Only support float16, float32 and float64.
+    .. note::
+       This function differs from the original numpy.hypot in the following aspects:
+
+       * Only support float16, float32 and float64.
 
     Examples
     --------
@@ -8877,17 +8882,24 @@ def inner(a, b):
     dot : Generalised matrix product, using second last dimension of `b`.
     einsum : Einstein summation convention.
 
-    Notes
-    -----
-    For vectors (1-D arrays) it computes the ordinary inner-product::
-        np.inner(a, b) = sum(a[:]*b[:])
-    More generally, if `ndim(a) = r > 0` and `ndim(b) = s > 0`::
-        np.inner(a, b) = np.tensordot(a, b, axes=(-1,-1))
-    or explicitly::
-        np.inner(a, b)[i0,...,ir-1,j0,...,js-1]
-            = sum(a[i0,...,ir-1,:]*b[j0,...,js-1,:])
-    In addition `a` or `b` may be scalars, in which case::
-    np.inner(a,b) = a*b
+    .. note::
+
+       For vectors (1-D arrays) it computes the ordinary inner-product::
+
+           np.inner(a, b) = sum(a[:]*b[:])
+
+       More generally, if `ndim(a) = r > 0` and `ndim(b) = s > 0`::
+
+           np.inner(a, b) = np.tensordot(a, b, axes=(-1,-1))
+
+       or explicitly::
+
+           np.inner(a, b)[i0,...,ir-1,j0,...,js-1]
+               = sum(a[i0,...,ir-1,:]*b[j0,...,js-1,:])
+
+       In addition `a` or `b` may be scalars, in which case::
+
+           np.inner(a,b) = a*b
 
     Examples
     --------
@@ -9072,34 +9084,44 @@ def cross(a, b, axisa=-1, axisb=-1, axisc=-1, axis=None): # pylint: disable=too-
 
 @set_module('mxnet.numpy')
 def kron(a, b):
-    r"""
-    Kronecker product of two arrays.
+    r"""Kronecker product of two arrays.
+
     Computes the Kronecker product, a composite array made of blocks of the
     second array scaled by the first.
+
     Parameters
     ----------
     a, b : ndarray
+
     Returns
     -------
     out : ndarray
+
     See Also
     --------
     outer : The outer product
-    Notes
-    -----
-    The function assumes that the number of dimensions of `a` and `b`
-    are the same, if necessary prepending the smallest with ones.
-    If `a.shape = (r0,r1,..,rN)` and `b.shape = (s0,s1,...,sN)`,
-    the Kronecker product has shape `(r0*s0, r1*s1, ..., rN*SN)`.
-    The elements are products of elements from `a` and `b`, organized
-    explicitly by::
-        kron(a,b)[k0,k1,...,kN] = a[i0,i1,...,iN] * b[j0,j1,...,jN]
-    where::
-        kt = it * st + jt,  t = 0,...,N
-    In the common 2-D case (N=1), the block structure can be visualized::
-        [[ a[0,0]*b,   a[0,1]*b,  ... , a[0,-1]*b  ],
-        [  ...                              ...   ],
-        [ a[-1,0]*b,  a[-1,1]*b, ... , a[-1,-1]*b ]]
+
+    .. note::
+       The function assumes that the number of dimensions of `a` and `b`
+       are the same, if necessary prepending the smallest with ones.
+       If `a.shape = (r0,r1,..,rN)` and `b.shape = (s0,s1,...,sN)`,
+       the Kronecker product has shape `(r0*s0, r1*s1, ..., rN*SN)`.
+       The elements are products of elements from `a` and `b`, organized
+       explicitly by::
+
+           kron(a,b)[k0,k1,...,kN] = a[i0,i1,...,iN] * b[j0,j1,...,jN]
+
+       where::
+
+           kt = it * st + jt,  t = 0,...,N
+
+       In the common 2-D case (N=1), the block structure can be visualized::
+
+           [[ a[0,0]*b,   a[0,1]*b,  ... , a[0,-1]*b  ],
+           [  ...                              ...   ],
+           [ a[-1,0]*b,  a[-1,1]*b, ... , a[-1,-1]*b ]]
+
+
     Examples
     --------
     >>> np.kron([1,10,100], [5,6,7])
@@ -9547,12 +9569,11 @@ def hsplit(ary, indices_or_sections):
     sub-arrays : list of ndarrays
         A list of sub-arrays.
 
-    Notes
-    ------
-    - If `indices_or_sections` is given as an integer, but a split
-      does not result in equal division.It will raises ValueErrors.
-    - If indices_or_sections is an integer, and the number is 1, it will
-      raises an error. Because single output from split is not supported yet...
+    .. note::
+       * If `indices_or_sections` is given as an integer, but a split
+         does not result in equal division, it will raise a ValueError.
+       * If indices_or_sections is an integer, and the number is 1, it will
+         raise an error, because single output from split is not supported yet.
 
     See Also
     --------
@@ -9718,14 +9739,15 @@ def einsum(*operands, **kwargs):
     returns the optimal path in the majority of cases. 'optimal' is not supported
     for now.
 
-    This function differs from the original `numpy.einsum
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.einsum.html>`_ in
-    the following way(s):
+    .. note::
+       This function differs from the original `numpy.einsum
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.einsum.html>`_ in
+       the following way(s):
 
-    - Does not support 'optimal' strategy
-    - Does not support the alternative subscript like
-        `einsum(op0, sublist0, op1, sublist1, ..., [sublistout])`
-    - Does not produce view in any cases
+       * Does not support 'optimal' strategy
+       * Does not support the alternative subscript like
+         `einsum(op0, sublist0, op1, sublist1, ..., [sublistout])`
+       * Does not produce view in any cases
 
     Examples
     --------
@@ -9844,8 +9866,7 @@ def einsum(*operands, **kwargs):
 
 @set_module('mxnet.numpy')
 def insert(arr, obj, values, axis=None):
-    """
-    Insert values along the given axis before the given indices.
+    r"""Insert values along the given axis before the given indices.
 
     Parameters
     ----------
@@ -9871,12 +9892,11 @@ def insert(arr, obj, values, axis=None):
         does not occur in-place: a new array is returned. If
         `axis` is None, `out` is a flattened array.
 
-    Notes
-    -----
-    - Note that for higher dimensional inserts `obj=0` behaves very different
-    from `obj=[0]` just like `arr[:,0,:] = values` is different from
-    `arr[:,[0],:] = values`.
-    - If obj is a ndarray, it's dtype only supports int64
+    .. note::
+       * Note that for higher dimensional inserts `obj=0` behaves very different
+         from `obj=[0]` just like `arr[:,0,:] = values` is different from
+         `arr[:,[0],:] = values`.
+       * If obj is an ndarray, its dtype only supports int64
 
     Examples
     --------
@@ -10076,9 +10096,9 @@ def percentile(a, q, axis=None, out=None, overwrite_input=None, interpolation='l
 
 @set_module('mxnet.numpy')
 def median(a, axis=None, out=None, overwrite_input=None, keepdims=False):
-    r"""
-    Compute the median along the specified axis.
+    r"""Compute the median along the specified axis.
     Returns the median of the array elements.
+
     Parameters
     ----------
     a : array_like
@@ -10095,6 +10115,7 @@ def median(a, axis=None, out=None, overwrite_input=None, keepdims=False):
         If this is set to True, the axes which are reduced are left
         in the result as dimensions with size one. With this option,
         the result will broadcast correctly against the original `arr`.
+
     Returns
     -------
     median : ndarray
@@ -10103,9 +10124,11 @@ def median(a, axis=None, out=None, overwrite_input=None, keepdims=False):
         ``np.float32``.  Otherwise, the data-type of the output is the
         same as that of the input. If `out` is specified, that array is
         returned instead.
+
     See Also
     --------
     mean, percentile
+
     Examples
     --------
     >>> a = np.array([[10, 7, 4], [3, 2, 1]])
@@ -10125,9 +10148,9 @@ def median(a, axis=None, out=None, overwrite_input=None, keepdims=False):
 
 @set_module('mxnet.numpy')
 def quantile(a, q, axis=None, out=None, overwrite_input=None, interpolation='linear', keepdims=False): # pylint: disable=too-many-arguments
-    """
-    Compute the q-th quantile of the data along the specified axis.
+    """Compute the q-th quantile of the data along the specified axis.
     New in version 1.15.0.
+
     Parameters
     ----------
     a : ndarray
@@ -10144,14 +10167,17 @@ def quantile(a, q, axis=None, out=None, overwrite_input=None, interpolation='lin
     interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
         This optional parameter specifies the interpolation method to use
         when the desired quantile lies between two data points i < j:
-            linear: i + (j - i) * fraction, where fraction is the fractional part of the index surrounded by i and j.
-            lower: i.
-            higher: j.
-            nearest: i or j, whichever is nearest.
-            midpoint: (i + j) / 2.
+
+        * linear: i + (j - i) * fraction, where fraction is the fractional part of the index surrounded by i and j.
+        * lower: i.
+        * higher: j.
+        * nearest: i or j, whichever is nearest.
+        * midpoint: (i + j) / 2.
+
     keepdims : bool, optional
         If this is set to True, the axes which are reduced are left in the result as dimensions with size one.
         With this option, the result will broadcast correctly against the original array a.
+
     Returns
     -------
     quantile : ndarray
@@ -10159,21 +10185,24 @@ def quantile(a, q, axis=None, out=None, overwrite_input=None, interpolation='lin
         If multiple quantiles are given, first axis of the result corresponds to the quantiles.
         The other axes are the axes that remain after the reduction of a.
         If out is specified, that array is returned instead.
+
     See also
     --------
     mean
-    Notes
-    -----
-    Given a vector V of length N, the q-th quantile of V is the value q of the way from the minimum
-    to the maximum in a sorted copy of V. The values and distances of the two nearest neighbors
-    as well as the interpolation parameter will determine the quantile if the normalized ranking
-    does not match the location of q exactly. This function is the same as the median if q=0.5,
-    the same as the minimum if q=0.0 and the same as the maximum if q=1.0.
-    This function differs from the original `numpy.quantile
-    <https://numpy.org/devdocs/reference/generated/numpy.quantile.html>`_ in
-    the following aspects:
-    - q must be ndarray type even if it is a scalar
-    - do not support overwrite_input
+
+    .. note::
+       Given a vector V of length N, the q-th quantile of V is the value q of the way from the minimum
+       to the maximum in a sorted copy of V. The values and distances of the two nearest neighbors
+       as well as the interpolation parameter will determine the quantile if the normalized ranking
+       does not match the location of q exactly. This function is the same as the median if q=0.5,
+       the same as the minimum if q=0.0 and the same as the maximum if q=1.0.
+       This function differs from the original `numpy.quantile
+       <https://numpy.org/devdocs/reference/generated/numpy.quantile.html>`_ in
+       the following aspects:
+
+       * q must be ndarray type even if it is a scalar
+       * do not support overwrite_input
+
     Examples
     --------
     >>> a = np.array([[10, 7, 4], [3, 2, 1]])
@@ -10226,12 +10255,13 @@ def shares_memory(a, b, max_work=None):
     >>> np.may_share_memory(np.array([1,2]), np.array([5,8,9]))
     False
 
-    This function differs from the original `numpy.shares_memory
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.shares_memory.html>`_ in
-    the following way(s):
+    .. note::
+       This function differs from the original `numpy.shares_memory
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.shares_memory.html>`_ in
+       the following way(s):
 
-    - Does not support `max_work`, it is a dummy argument
-    - Actually it is same as `may_share_memory` in MXNet np
+       * Does not support `max_work`, it is a dummy argument
+       * Actually it is same as `may_share_memory` in MXNet np
     """
     return _mx_nd_np.shares_memory(a, b, max_work)
 
@@ -10267,12 +10297,13 @@ def may_share_memory(a, b, max_work=None):
     >>> np.may_share_memory(x[:,0], x[:,1])
     True
 
-    This function differs from the original `numpy.may_share_memory
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.may_share_memory.html>`_ in
-    the following way(s):
+    .. note::
+       This function differs from the original `numpy.may_share_memory
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.may_share_memory.html>`_ in
+       the following way(s):
 
-    - Does not support `max_work`, it is a dummy argument
-    - Actually it is same as `shares_memory` in MXNet np
+       * Does not support `max_work`, it is a dummy argument
+       * Actually it is same as `shares_memory` in MXNet np
     """
     return _mx_nd_np.may_share_memory(a, b, max_work)
 
@@ -10419,8 +10450,8 @@ def resize(a, new_shape):
 
 @set_module('mxnet.numpy')
 def interp(x, xp, fp, left=None, right=None, period=None):  # pylint: disable=too-many-arguments
-    """
-    One-dimensional linear interpolation.
+    r"""One-dimensional linear interpolation.
+
     Returns the one-dimensional piecewise linear interpolant to a function
     with given values at discrete data-points.
 
@@ -10442,12 +10473,12 @@ def interp(x, xp, fp, left=None, right=None, period=None):  # pylint: disable=to
         A period for the x-coordinates. This parameter allows the proper
         interpolation of angular x-coordinates. Parameters `left` and `right`
         are ignored if `period` is specified.
-        .. versionadded:: 1.10.0
 
     Returns
     -------
     y : float (corresponding to fp) or ndarray
         The interpolated values, same shape as `x`.
+
     Raises
     ------
     ValueError
@@ -10455,12 +10486,13 @@ def interp(x, xp, fp, left=None, right=None, period=None):  # pylint: disable=to
         If `xp` or `fp` are not 1-D sequences
         If `period == 0`
 
-    Notes
-    -----
-    Does not check that the x-coordinate sequence `xp` is increasing.
-    If `xp` is not increasing, the results are nonsense.
-    A simple check for increasing is::
-        np.all(np.diff(xp) > 0)
+    .. note::
+       Does not check that the x-coordinate sequence `xp` is increasing.
+       If `xp` is not increasing, the results are nonsense.
+       A simple check for increasing is::
+
+           np.all(np.diff(xp) > 0)
+
 
     Examples
     --------
@@ -10848,15 +10880,13 @@ def nan_to_num(x, copy=True, nan=0.0, posinf=None, neginf=None, **kwargs):
 
 @set_module('mxnet.numpy')
 def squeeze(x, axis=None):
-    """
-    Remove single-dimensional entries from the shape of an array.
+    r"""Remove single-dimensional entries from the shape of an array.
 
     Parameters
     ----------
     a : array_like
         Input data.
     axis : None or int or tuple of ints, optional
-        .. versionadded:: 1.7.0
         Selects a subset of the single-dimensional entries in the
         shape. If an axis is selected with shape entry greater than
         one, an error is raised.
@@ -10922,14 +10952,19 @@ def isnan(x, out=None, **kwargs):
     -----
     NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic (IEEE 754).
 
-    This function differs from the original `numpy.isinf
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.isnan.html>`_ in
-    the following aspects:
-    - Does not support complex number for now
-    - Input type does not support Python native iterables(list, tuple, ...).
-    - ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be the same as the expected output.
-    - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output.
-    - ``out`` param does not support scalar input case.
+    .. note::
+
+       This function differs from the original `numpy.isnan
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.isnan.html>`_ in
+       the following aspects:
+
+       * Does not support complex number for now
+       * Input type does not support Python native iterables(list, tuple, ...).
+       * ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be
+         the same as the expected output.
+       * ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the
+         same as the expected output.
+       * ``out`` param does not support scalar input case.
 
     Examples
     --------
@@ -10969,14 +11004,19 @@ def isinf(x, out=None, **kwargs):
     NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic (IEEE 754).
     This means that Not a Number is not equivalent to infinity.
 
-    This function differs from the original `numpy.isnan
-    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.isnan.html>`_ in
-    the following aspects:
-    - Does not support complex number for now
-    - Input type does not support Python native iterables(list, tuple, ...).
-    - ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be the same as the expected output.
-    - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output.
-    - ``out`` param does not support scalar input case.
+    .. note::
+
+       This function differs from the original `numpy.isinf
+       <https://docs.scipy.org/doc/numpy/reference/generated/numpy.isinf.html>`_ in
+       the following aspects:
+
+       * Does not support complex number for now
+       * Input type does not support Python native iterables(list, tuple, ...).
+       * ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be
+         the same as the expected output.
+       * ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the
+         same as the expected output.
+       * ``out`` param does not support scalar input case.
 
     Examples
     --------
@@ -11234,13 +11274,13 @@ def polyval(p, x):
     values : ndarray
         Result array of polynomials
 
-    Notes
-    -----
-    This function differs from the original `numpy.polyval
-    <https://numpy.org/devdocs/reference/generated/numpy.polyval.html>`_ in
-    the following way(s):
-    - Does not support poly1d.
-    - X should be ndarray type even if it contains only one element.
+    .. note::
+       This function differs from the original `numpy.polyval
+       <https://numpy.org/devdocs/reference/generated/numpy.polyval.html>`_ in
+       the following way(s):
+
+       * Does not support poly1d.
+       * X should be ndarray type even if it contains only one element.
 
     Examples
     --------
@@ -11604,17 +11644,17 @@ def dot(a, b, out=None):
     """
     Dot product of two arrays. Specifically,
 
-    - If both `a` and `b` are 1-D arrays, it is inner product of vectors
+    * If both `a` and `b` are 1-D arrays, it is inner product of vectors
 
-    - If both `a` and `b` are 2-D arrays, it is matrix multiplication,
+    * If both `a` and `b` are 2-D arrays, it is matrix multiplication,
 
-    - If either `a` or `b` is 0-D (scalar), it is equivalent to :func:`multiply`
+    * If either `a` or `b` is 0-D (scalar), it is equivalent to :func:`multiply`
       and using ``np.multiply(a, b)`` or ``a * b`` is preferred.
 
-    - If `a` is an N-D array and `b` is a 1-D array, it is a sum product over
+    * If `a` is an N-D array and `b` is a 1-D array, it is a sum product over
       the last axis of `a` and `b`.
 
-    - If `a` is an N-D array and `b` is a 2-D array, it is a
+    * If `a` is an N-D array and `b` is a 2-D array, it is a
       sum product over the last axis of `a` and the second-to-last axis of `b`::
 
         dot(a, b)[i,j,k] = sum(a[i,j,:] * b[:,k])
@@ -11890,8 +11930,8 @@ def rollaxis(a, axis, start=0):
 def diag(v, k=0):
     """
     Extracts a diagonal or constructs a diagonal array.
-    - 1-D arrays: constructs a 2-D array with the input as its diagonal, all other elements are zero.
-    - 2-D arrays: extracts the k-th Diagonal
+    * 1-D arrays: constructs a 2-D array with the input as its diagonal, all other elements are zero.
+    * 2-D arrays: extracts the k-th Diagonal
 
     Parameters
     ----------
@@ -12064,11 +12104,11 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=N
 
     Notes
     -----
-    - Input type does not support Python native iterables.
-    - "out" param: cannot perform auto type change. out ndarray's dtype must be the same as the expected output.
-    - "initial" param is not supported yet. Please use None as input.
-    - Arithmetic is modular when using integer types, and no error is raised on overflow.
-    - The sum of an empty array is the neutral element 0:
+    * Input type does not support Python native iterables.
+    * "out" param: cannot perform auto type change. out ndarray's dtype must be the same as the expected output.
+    * "initial" param is not supported yet. Please use None as input.
+    * Arithmetic is modular when using integer types, and no error is raised on overflow.
+    * The sum of an empty array is the neutral element 0:
 
     >>> a = np.empty(1)
     >>> np.sum(a)
@@ -12078,10 +12118,10 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=N
     <https://docs.scipy.org/doc/numpy/reference/generated/numpy.sum.html>`_ in
     the following aspects:
 
-    - Input type does not support Python native iterables(list, tuple, ...).
-    - "out" param: cannot perform auto type cast. out ndarray's dtype must be the same as the expected output.
-    - "initial" param is not supported yet. Please use ``None`` as input or skip it.
-    - The default type is float32.
+    * Input type does not support Python native iterables(list, tuple, ...).
+    * "out" param: cannot perform auto type cast. out ndarray's dtype must be the same as the expected output.
+    * "initial" param is not supported yet. Please use ``None`` as input or skip it.
+    * The default type is float32.
 
     Examples
     --------
diff --git a/python/mxnet/optimizer/lars.py b/python/mxnet/optimizer/lars.py
index 9492a93..136ecc9 100644
--- a/python/mxnet/optimizer/lars.py
+++ b/python/mxnet/optimizer/lars.py
@@ -39,12 +39,16 @@ class LARS(Optimizer):
 
     Behave mostly like SGD with momentum and weight decay but is scaling \
     adaptively the learning for each layer:
-    w_norm = L2norm(weights)
-    g_norm = L2norm(gradients)
-    if w_norm > 0 and g_norm > 0:
-        lr_layer = lr * w_norm / (g_norm + weight_decay * w_norm + epsilon)
-    else:
-        lr_layer = lr
+
+    .. code-block::
+
+       w_norm = L2norm(weights)
+       g_norm = L2norm(gradients)
+       if w_norm > 0 and g_norm > 0:
+           lr_layer = lr * w_norm / (g_norm + weight_decay * w_norm + epsilon)
+       else:
+           lr_layer = lr
+
 
     Parameters
     ----------
diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py
index ddc4a1e..1aad91b 100644
--- a/python/mxnet/symbol/symbol.py
+++ b/python/mxnet/symbol/symbol.py
@@ -1476,50 +1476,44 @@ class Symbol(SymbolBase):
     # pylint: disable=too-many-locals
     def optimize_for(self, backend, args=None, aux=None, ctx=None,
                      shape_dict=None, type_dict=None, stype_dict=None, skip_infer=False, **kwargs):
-        """Partitions current symbol and optimizes it for a given backend,
-        returns new partitioned symbol.
+        r"""Partitions current symbol and optimizes it for a given backend.
+
+        The backend must have registered the partitioning graph pass in
+        ``SubgraphBackendRegistry``.
 
         Parameters
         ----------
         backend : str
-            The name of backend, as registered in `SubgraphBackendRegistry`
-
+            The name of backend, as registered in ``SubgraphBackendRegistry``
         args : dict of str to NDArray, optional
             Input arguments to the symbol, required to infer shapes/types before partitioning
-            - If type is a dict of str to `NDArray`, then it maps the name of arguments
-              to the corresponding `NDArray`. Non defined arguments' `NDArray`s don't have to be
-              specified in the dict.
-
+            If type is a dict of str to NDArray, then it maps the names of arguments
+            to the corresponding NDArray. Undefined arguments' NDArrays
+            don't have to be specified in the dict.
         aux : dict of str to NDArray, optional
             Input auxiliary arguments to the symbol
-            - If type is a dict of str to `NDArray`, then it maps the name of arguments
-              to the corresponding `NDArray`.
-
+            If type is a dict of str to :class:`NDArray`, then it maps the name of arguments
+            to the corresponding :class:`NDArray`.
         ctx : Context, optional
             Device context, used to infer stypes
-
-        shape_dict  : Dict of str->tuple, optional
+        shape_dict : Dict of str->tuple, optional
             Input shape dictionary.
-            Used iff input NDArray is not in `args`.
-
-        type_dict  : Dict of str->numpy.dtype, optional
+            Used iff input :class:`NDArray` is not in ``args``.
+        type_dict : Dict of str->numpy.dtype, optional
             Input type dictionary.
-            Used iff input NDArray is not in `args`.
-
+            Used iff input :class:`NDArray` is not in ``args``.
         stype_dict  : Dict of str->str, optional
             Input storage type dictionary.
-            Used iff input NDArray is not in `args`.
-
+            Used iff input :class:`NDArray` is not in ``args``.
         skip_infer : bool, optional
             If True, the optimization skips the shape, type and storage type inference pass.
-
         kwargs : optional arguments
-            Passed on to `PrePartition` and `PostPartition` functions of `SubgraphProperty`
+            Passed on to ``PrePartition`` and ``PostPartition`` functions of ``SubgraphProperty``
 
         Returns
         -------
         out : SymbolHandle
-            The created symbol for target backend.
+            A symbol with the partitioned graph for target backend.
         """
         out = SymbolHandle()
         assert isinstance(backend, str)
diff --git a/python/mxnet/util.py b/python/mxnet/util.py
index 823d813..159576e 100644
--- a/python/mxnet/util.py
+++ b/python/mxnet/util.py
@@ -235,7 +235,8 @@ def use_np_shape(func):
     `func` is a class, it ensures that all the methods, static functions, and properties
     of the class are executed with the NumPy shape semantics.
 
-    Example::
+    .. code-block:: python
+
         import mxnet as mx
         @mx.use_np_shape
         def scalar_one():
@@ -263,11 +264,11 @@ def use_np_shape(func):
                 print("Is value property in np_shape semantics? {}!".format(str(np.is_np_shape())))
                 return self._scalar.asnumpy().item()
 
-
         print("Is global scope of np_shape activated? {}!".format(str(np.is_np_shape())))
         scalar_tensor = ScalarTensor()
         print(scalar_tensor)
 
+
     Parameters
     ----------
     func : a user-provided callable function or class to be scoped by the NumPy-shape semantics.
@@ -411,11 +412,11 @@ def use_np_array(func):
     For example, at the time when a parameter is created in a `Block`, an `mxnet.numpy.ndarray`
     is created if it's decorated with this decorator.
 
-    Example::
+    .. code-block:: python
+
         import mxnet as mx
         from mxnet import gluon, np
 
-
         class TestHybridBlock1(gluon.HybridBlock):
             def __init__(self):
                 super(TestHybridBlock1, self).__init__()
@@ -424,7 +425,6 @@ def use_np_array(func):
             def hybrid_forward(self, F, x, w):
                 return F.dot(x, w)
 
-
         x = mx.nd.ones((2, 2))
         net1 = TestHybridBlock1()
         net1.initialize()
@@ -433,7 +433,6 @@ def use_np_array(func):
             assert type(v.data()) is mx.nd.NDArray
         assert type(out) is mx.nd.NDArray
 
-
         @np.use_np_array
         class TestHybridBlock2(gluon.HybridBlock):
             def __init__(self):
@@ -443,7 +442,6 @@ def use_np_array(func):
             def hybrid_forward(self, F, x, w):
                 return F.np.dot(x, w)
 
-
         x = np.ones((2, 2))
         net2 = TestHybridBlock2()
         net2.initialize()
@@ -488,17 +486,17 @@ def use_np_array(func):
 
 def use_np(func):
     """A convenience decorator for wrapping user provided functions and classes in the scope of
-    both NumPy-shape and NumPy-array semantics, which means that (1) empty tuples `()` and tuples
-    with zeros, such as `(0, 1)`, `(1, 0, 2)`, will be treated as scalar tensors' shapes and
+    both NumPy-shape and NumPy-array semantics, which means that (1) empty tuples ``()`` and
+    tuples with zeros, such as ``(0, 1)``, ``(1, 0, 2)``, will be treated as scalar tensors' shapes and
     zero-size tensors' shapes in shape inference functions of operators, instead of as unknown
-    in legacy mode; (2) ndarrays of type `mxnet.numpy.ndarray` should be created instead of
-    `mx.nd.NDArray`.
+    in legacy mode; (2) ndarrays of type :class:`mxnet.numpy.ndarray` should be created instead of
+    :class:`mx.nd.NDArray`.
+
+    .. code-block:: python
 
-    Example::
         import mxnet as mx
         from mxnet import gluon, np
 
-
         class TestHybridBlock1(gluon.HybridBlock):
             def __init__(self):
                 super(TestHybridBlock1, self).__init__()
@@ -507,7 +505,6 @@ def use_np(func):
             def hybrid_forward(self, F, x, w):
                 return F.dot(x, w) + F.ones((1,))
 
-
         x = mx.nd.ones((2, 2))
         net1 = TestHybridBlock1()
         net1.initialize()
@@ -516,7 +513,6 @@ def use_np(func):
             assert type(v.data()) is mx.nd.NDArray
         assert type(out) is mx.nd.NDArray
 
-
         @np.use_np
         class TestHybridBlock2(gluon.HybridBlock):
             def __init__(self):
@@ -526,7 +522,6 @@ def use_np(func):
             def hybrid_forward(self, F, x, w):
                 return F.np.dot(x, w) + F.np.ones(())
 
-
         x = np.ones((2, 2))
         net2 = TestHybridBlock2()
         net2.initialize()
@@ -536,6 +531,7 @@ def use_np(func):
             assert type(v.data()) is np.ndarray
         assert type(out) is np.ndarray
 
+
     Parameters
     ----------
     func : a user-provided callable function or class to be scoped by the
@@ -1009,7 +1005,8 @@ def use_np_default_dtype(func):
     When`func` is a class, it ensures that all the methods, static functions, and properties
     of the class are executed with the NumPy-default_dtype semantics.
 
-    Example:
+    .. code-block:: python
+
         import mxnet as mx
         @mx.use_np_default_dtype
         def float64_one():
@@ -1037,11 +1034,11 @@ def use_np_default_dtype(func):
                 print("Is value property in np_dafault_dtype semantics? {}!".format(str(np.is_np_default_dtype())))
                 return self._data.asnumpy()
 
-
         print("Is global scope of np_default_dtype activated? {}!".format(str(np.is_np_default_dtype())))
         float64_tensor = Float64Tensor()
         print(float64_tensor)
 
+
     Parameters
     ----------
     func : a user-provided callable function or class to be scoped by the NumPy-default_dtype semantics.
diff --git a/src/operator/contrib/bounding_box.cc b/src/operator/contrib/bounding_box.cc
index a0cc34b..d4a5992 100644
--- a/src/operator/contrib/bounding_box.cc
+++ b/src/operator/contrib/bounding_box.cc
@@ -40,8 +40,8 @@ NNVM_REGISTER_OP(_contrib_box_nms)
 .add_alias("_npx_box_nms")
 .describe(R"code(Apply non-maximum suppression to input.
 
-The output will be sorted in descending order according to `score`. Boxes with
-overlaps larger than `overlap_thresh`, smaller scores and background boxes
+The output will be sorted in descending order according to ``score``. Boxes with
+overlaps larger than ``overlap_thresh``, smaller scores and background boxes
 will be removed and filled with -1, the corresponding position will be recorded
 for backward propogation.
 
@@ -61,23 +61,23 @@ Input requirements::
 By default, a box is [id, score, xmin, ymin, xmax, ymax, ...],
 additional elements are allowed.
 
-- `id_index`: optional, use -1 to ignore, useful if `force_suppress=False`, which means
-  we will skip highly overlapped boxes if one is `apple` while the other is `car`.
+- ``id_index``: optional, use -1 to ignore, useful if ``force_suppress=False``, which means
+  we will skip highly overlapped boxes if one is ``apple`` while the other is ``car``.
 
-- `background_id`: optional, default=-1, class id for background boxes, useful
-  when `id_index >= 0` which means boxes with background id will be filtered before nms.
+- ``background_id``: optional, default=-1, class id for background boxes, useful
+  when ``id_index >= 0`` which means boxes with background id will be filtered before nms.
 
-- `coord_start`: required, default=2, the starting index of the 4 coordinates.
+- ``coord_start``: required, default=2, the starting index of the 4 coordinates.
   Two formats are supported:
 
-    - `corner`: [xmin, ymin, xmax, ymax]
+    - ``corner``: [xmin, ymin, xmax, ymax]
 
-    - `center`: [x, y, width, height]
+    - ``center``: [x, y, width, height]
 
-- `score_index`: required, default=1, box score/confidence.
-  When two boxes overlap IOU > `overlap_thresh`, the one with smaller score will be suppressed.
+- ``score_index``: required, default=1, box score/confidence.
+  When two boxes overlap IOU > ``overlap_thresh``, the one with smaller score will be suppressed.
 
-- `in_format` and `out_format`: default='corner', specify in/out box formats.
+- ``in_format`` and ``out_format``: default='corner', specify in/out box formats.
 
 Examples::
 
@@ -234,8 +234,8 @@ NNVM_REGISTER_OP(_contrib_box_encode)
 NNVM_REGISTER_OP(_contrib_box_decode)
 .add_alias("_npx_box_decode")
 .describe(R"doc(Decode bounding boxes training target with normalized center offsets.
-    Input bounding boxes are using corner type: `x_{min}, y_{min}, x_{max}, y_{max}`
-    or center type: `x, y, width, height.) array
+    Input bounding boxes are using corner type: ``x_{min}, y_{min}, x_{max}, y_{max}``
+    or center type: ``x, y, width, height``.) array
 )doc" ADD_FILELINE)
 .set_num_inputs(2)
 .set_num_outputs(1)
diff --git a/src/operator/contrib/dynamic_shape_ops.cc b/src/operator/contrib/dynamic_shape_ops.cc
index ff71d4b..1b5274c 100644
--- a/src/operator/contrib/dynamic_shape_ops.cc
+++ b/src/operator/contrib/dynamic_shape_ops.cc
@@ -80,38 +80,44 @@ Accepts 2 inputs - data and shape.
 The output returns data in the new shape.
 
 Some dimensions of the shape can take special values from the set {0, -1, -2, -3, -4}. The significance of each is explained below:
-- ``0``  copy this dimension from the input to the output shape.
-  Example::
+- ``0``  copy this dimension from the input to the output shape. Example::
+
   - input shape = (2,3,4), shape = (4,0,2), output shape = (4,3,2)
   - input shape = (2,3,4), shape = (2,0,0), output shape = (2,3,4)
+
 - ``-1`` infers the dimension of the output shape by using the remainder of the input dimensions
   keeping the size of the new array same as that of the input array.
-  At most one dimension of shape can be -1.
-  Example::
+  At most one dimension of shape can be -1. Example::
+
   - input shape = (2,3,4), shape = (6,1,-1), output shape = (6,1,4)
   - input shape = (2,3,4), shape = (3,-1,8), output shape = (3,1,8)
   - input shape = (2,3,4), shape=(-1,), output shape = (24,)
-- ``-2`` copy all/remainder of the input dimensions to the output shape.
-  Example::
+
+- ``-2`` copy all/remainder of the input dimensions to the output shape. Example::
+
   - input shape = (2,3,4), shape = (-2,), output shape = (2,3,4)
   - input shape = (2,3,4), shape = (2,-2), output shape = (2,3,4)
   - input shape = (2,3,4), shape = (-2,1,1), output shape = (2,3,4,1,1)
-- ``-3`` use the product of two consecutive dimensions of the input shape as the output dimension.
-  Example::
+
+- ``-3`` use the product of two consecutive dimensions of the input shape as the output dimension. Example::
+
   - input shape = (2,3,4), shape = (-3,4), output shape = (6,4)
   - input shape = (2,3,4,5), shape = (-3,-3), output shape = (6,20)
   - input shape = (2,3,4), shape = (0,-3), output shape = (2,12)
   - input shape = (2,3,4), shape = (-3,-2), output shape = (6,4)
-- ``-4`` split one dimension of the input into two dimensions passed subsequent to -4 in shape (can contain -1).
-  Example::
+
+- ``-4`` split one dimension of the input into two dimensions passed subsequent to -4 in shape (can contain -1). Example::
+
   - input shape = (2,3,4), shape = (-4,1,2,-2), output shape =(1,2,3,4)
   - input shape = (2,3,4), shape = (2,-4,-1,3,-2), output shape = (2,1,3,4)
 
 Example::
+
    data = mx.nd.array(np.random.normal(0,1,(2,3,5,5)))
    shape = mx.nd.array((0,-1))
    out = mx.sym.contrib.dynamic_reshape(data = data, shape = shape)
    // out will be of shape (2,75)
+
 )code" ADD_FILELINE)
 .set_num_inputs(2)
 .set_num_outputs(1)
diff --git a/src/operator/image/crop.cc b/src/operator/image/crop.cc
index ba31bf4..24fd327 100644
--- a/src/operator/image/crop.cc
+++ b/src/operator/image/crop.cc
@@ -39,32 +39,14 @@ DMLC_REGISTER_PARAMETER(RandomResizedCropParam);
 NNVM_REGISTER_OP(_image_crop)
 .add_alias("_npx__image_crop")
 .describe(R"code(Crop an image NDArray of shape (H x W x C) or (N x H x W x C)
-to the given size.
-Example:
-    .. code-block:: python
-        image = mx.nd.random.uniform(0, 255, (4, 2, 3)).astype(dtype=np.uint8)
-        mx.nd.image.crop(image, 1, 1, 2, 2)
-            [[[144  34   4]
-              [ 82 157  38]]
-
-             [[156 111 230]
-              [177  25  15]]]
-            <NDArray 2x2x3 @cpu(0)>
-        image = mx.nd.random.uniform(0, 255, (2, 4, 2, 3)).astype(dtype=np.uint8)
-        mx.nd.image.crop(image, 1, 1, 2, 2)
-            [[[[ 35 198  50]
-               [242  94 168]]
-
-              [[223 119 129]
-               [249  14 154]]]
-
-
-              [[[137 215 106]
-                [ 79 174 133]]
-
-               [[116 142 109]
-                [ 35 239  50]]]]
-            <NDArray 2x2x2x3 @cpu(0)>
+to the given size. Example:
+.. code-block:: python
+
+    image = mx.nd.random.uniform(0, 255, (4, 2, 3)).astype(dtype=np.uint8)
+    mx.nd.image.crop(image, 1, 1, 2, 2).shape # (2, 2, 3)
+    image = mx.nd.random.uniform(0, 255, (2, 4, 2, 3)).astype(dtype=np.uint8)
+    mx.nd.image.crop(image, 1, 1, 2, 2).shape # (2, 2, 2, 3)
+
 )code" ADD_FILELINE)
 .set_num_inputs(1)
 .set_num_outputs(1)
@@ -86,13 +68,13 @@ NNVM_REGISTER_OP(_backward_image_crop)
 NNVM_REGISTER_OP(_image_random_crop)
 .add_alias("_npx__image_random_crop")
 .describe(R"code(Randomly crop an image NDArray of shape (H x W x C) or (N x H x W x C)
-to the given size. Upsample result if `src` is smaller than `size`.
-Example:
-    .. code-block:: python
-        im = mx.nd.array(cv2.imread("flower.jpg"))
-        cropped_im, rect  = mx.nd.image.random_crop(im, (100, 100))
-        print(cropped_im)
-        <NDArray 100x100x1 @cpu(0)>
+to the given size. Upsample result if `src` is smaller than `size`. Example:
+
+.. code-block:: python
+
+    im = mx.nd.array(cv2.imread("flower.jpg"))
+    cropped_im, rect  = mx.nd.image.random_crop(im, (100, 100))
+
 )code" ADD_FILELINE)
 .set_num_inputs(1)
 .set_num_outputs(2)
@@ -123,11 +105,11 @@ NNVM_REGISTER_OP(_image_random_resized_crop)
 .describe(R"code(Randomly crop an image NDArray of shape (H x W x C) or (N x H x W x C)
 to the given size. Randomize area and aspect ratio. Upsample result if `src` is smaller than `size`.
 Example:
-    .. code-block:: python
-        im = mx.nd.array(cv2.imread("flower.jpg"))
-        cropped_im, rect  = mx.nd.image.random_resized_crop(im, (100, 100))
-        print(cropped_im)
-        <NDArray 100x100x1 @cpu(0)>
+.. code-block:: python
+
+    im = mx.nd.array(cv2.imread("flower.jpg"))
+    cropped_im, rect  = mx.nd.image.random_resized_crop(im, (100, 100))
+
 )code" ADD_FILELINE)
 .set_num_inputs(1)
 .set_num_outputs(1)
diff --git a/src/operator/image/image_random.cc b/src/operator/image/image_random.cc
index c7b7131..e552457 100644
--- a/src/operator/image/image_random.cc
+++ b/src/operator/image/image_random.cc
@@ -43,53 +43,54 @@ NNVM_REGISTER_OP(_image_to_tensor)
 .add_alias("_npx__image_to_tensor")
 .describe(R"code(Converts an image NDArray of shape (H x W x C) or (N x H x W x C) 
 with values in the range [0, 255] to a tensor NDArray of shape (C x H x W) or (N x C x H x W)
-with values in the range [0, 1]
+with values in the range [0, 1].
+
+Examples
+--------
+>>> image = mx.nd.random.uniform(0, 255, (4, 2, 3)).astype(dtype=np.uint8)
+>>> to_tensor(image)
+[[[ 0.85490197  0.72156864]
+  [ 0.09019608  0.74117649]
+  [ 0.61960787  0.92941177]
+  [ 0.96470588  0.1882353 ]]
+ [[ 0.6156863   0.73725492]
+  [ 0.46666667  0.98039216]
+  [ 0.44705883  0.45490196]
+  [ 0.01960784  0.8509804 ]]
+ [[ 0.39607844  0.03137255]
+  [ 0.72156864  0.52941179]
+  [ 0.16470589  0.7647059 ]
+  [ 0.05490196  0.70588237]]]
+<NDArray 3x4x2 @cpu(0)>
+
+>>> image = mx.nd.random.uniform(0, 255, (2, 4, 2, 3)).astype(dtype=np.uint8)
+>>> to_tensor(image)
+[[[[0.11764706 0.5803922 ]
+   [0.9411765  0.10588235]
+   [0.2627451  0.73333335]
+   [0.5647059  0.32156864]]
+  [[0.7176471  0.14117648]
+   [0.75686276 0.4117647 ]
+   [0.18431373 0.45490196]
+   [0.13333334 0.6156863 ]]
+  [[0.6392157  0.5372549 ]
+   [0.52156866 0.47058824]
+   [0.77254903 0.21568628]
+   [0.01568628 0.14901961]]]
+ [[[0.6117647  0.38431373]
+   [0.6784314  0.6117647 ]
+   [0.69411767 0.96862745]
+   [0.67058825 0.35686275]]
+  [[0.21960784 0.9411765 ]
+   [0.44705883 0.43529412]
+   [0.09803922 0.6666667 ]
+   [0.16862746 0.1254902 ]]
+  [[0.6156863  0.9019608 ]
+   [0.35686275 0.9019608 ]
+   [0.05882353 0.6509804 ]
+   [0.20784314 0.7490196 ]]]]
+<NDArray 2x3x4x2 @cpu(0)>
 
-Example:
-    .. code-block:: python
-        image = mx.nd.random.uniform(0, 255, (4, 2, 3)).astype(dtype=np.uint8)
-        to_tensor(image)
-            [[[ 0.85490197  0.72156864]
-              [ 0.09019608  0.74117649]
-              [ 0.61960787  0.92941177]
-              [ 0.96470588  0.1882353 ]]
-             [[ 0.6156863   0.73725492]
-              [ 0.46666667  0.98039216]
-              [ 0.44705883  0.45490196]
-              [ 0.01960784  0.8509804 ]]
-             [[ 0.39607844  0.03137255]
-              [ 0.72156864  0.52941179]
-              [ 0.16470589  0.7647059 ]
-              [ 0.05490196  0.70588237]]]
-             <NDArray 3x4x2 @cpu(0)>
-
-        image = mx.nd.random.uniform(0, 255, (2, 4, 2, 3)).astype(dtype=np.uint8)
-        to_tensor(image)
-            [[[[0.11764706 0.5803922 ]
-               [0.9411765  0.10588235]
-               [0.2627451  0.73333335]
-               [0.5647059  0.32156864]]
-              [[0.7176471  0.14117648]
-               [0.75686276 0.4117647 ]
-               [0.18431373 0.45490196]
-               [0.13333334 0.6156863 ]]
-              [[0.6392157  0.5372549 ]
-               [0.52156866 0.47058824]
-               [0.77254903 0.21568628]
-               [0.01568628 0.14901961]]]
-             [[[0.6117647  0.38431373]
-               [0.6784314  0.6117647 ]
-               [0.69411767 0.96862745]
-               [0.67058825 0.35686275]]
-              [[0.21960784 0.9411765 ]
-               [0.44705883 0.43529412]
-               [0.09803922 0.6666667 ]
-               [0.16862746 0.1254902 ]]
-              [[0.6156863  0.9019608 ]
-               [0.35686275 0.9019608 ]
-               [0.05882353 0.6509804 ]
-               [0.20784314 0.7490196 ]]]]
-            <NDArray 2x3x4x2 @cpu(0)>
 )code" ADD_FILELINE)
 .set_num_inputs(1)
 .set_num_outputs(1)
@@ -106,10 +107,10 @@ Example:
 NNVM_REGISTER_OP(_image_normalize)
 .add_alias("_npx__image_normalize")
 .describe(R"code(Normalize an tensor of shape (C x H x W) or (N x C x H x W) with mean and
-    standard deviation.
+standard deviation.
 
-    Given mean `(m1, ..., mn)` and std `(s\ :sub:`1`\ , ..., s\ :sub:`n`)` for `n` channels,
-    this transform normalizes each channel of the input tensor with:
+Given mean `(m1, ..., mn)` and std `(s\ :sub:`1`\ , ..., s\ :sub:`n`)` for `n` channels,
+this transform normalizes each channel of the input tensor with:
 
 .. math::
 
@@ -121,50 +122,52 @@ NNVM_REGISTER_OP(_image_normalize)
 
 Example:
 
-    .. code-block:: python
-        image = mx.nd.random.uniform(0, 1, (3, 4, 2))
-        normalize(image, mean=(0, 1, 2), std=(3, 2, 1))
-            [[[ 0.18293785  0.19761486]
-              [ 0.23839645  0.28142193]
-              [ 0.20092112  0.28598186]
-              [ 0.18162774  0.28241724]]
-             [[-0.2881726  -0.18821815]
-              [-0.17705294 -0.30780914]
-              [-0.2812064  -0.3512327 ]
-              [-0.05411351 -0.4716435 ]]
-             [[-1.0363373  -1.7273437 ]
-              [-1.6165586  -1.5223348 ]
-              [-1.208275   -1.1878313 ]
-              [-1.4711051  -1.5200229 ]]]
-            <NDArray 3x4x2 @cpu(0)>
-
-        image = mx.nd.random.uniform(0, 1, (2, 3, 4, 2))
-        normalize(image, mean=(0, 1, 2), std=(3, 2, 1))
-            [[[[ 0.18934818  0.13092826]
-               [ 0.3085322   0.27869293]
-               [ 0.02367868  0.11246539]
-               [ 0.0290431   0.2160573 ]]
-              [[-0.4898908  -0.31587923]
-               [-0.08369008 -0.02142242]
-               [-0.11092162 -0.42982462]
-               [-0.06499392 -0.06495637]]
-              [[-1.0213816  -1.526392  ]
-               [-1.2008414  -1.1990893 ]
-               [-1.5385206  -1.4795225 ]
-               [-1.2194707  -1.3211205 ]]]
-             [[[ 0.03942481  0.24021089]
-               [ 0.21330701  0.1940066 ]
-               [ 0.04778443  0.17912441]
-               [ 0.31488964  0.25287187]]
-              [[-0.23907584 -0.4470462 ]
-               [-0.29266903 -0.2631998 ]
-               [-0.3677222  -0.40683383]
-               [-0.11288315 -0.13154092]]
-              [[-1.5438497  -1.7834496 ]
-               [-1.431566   -1.8647819 ]
-               [-1.9812102  -1.675859  ]
-               [-1.3823645  -1.8503251 ]]]]
-            <NDArray 2x3x4x2 @cpu(0)>
+.. code-block:: python
+
+    image = mx.nd.random.uniform(0, 1, (3, 4, 2))
+    normalize(image, mean=(0, 1, 2), std=(3, 2, 1))
+        [[[ 0.18293785  0.19761486]
+          [ 0.23839645  0.28142193]
+          [ 0.20092112  0.28598186]
+          [ 0.18162774  0.28241724]]
+         [[-0.2881726  -0.18821815]
+          [-0.17705294 -0.30780914]
+          [-0.2812064  -0.3512327 ]
+          [-0.05411351 -0.4716435 ]]
+         [[-1.0363373  -1.7273437 ]
+          [-1.6165586  -1.5223348 ]
+          [-1.208275   -1.1878313 ]
+          [-1.4711051  -1.5200229 ]]]
+        <NDArray 3x4x2 @cpu(0)>
+
+    image = mx.nd.random.uniform(0, 1, (2, 3, 4, 2))
+    normalize(image, mean=(0, 1, 2), std=(3, 2, 1))
+        [[[[ 0.18934818  0.13092826]
+           [ 0.3085322   0.27869293]
+           [ 0.02367868  0.11246539]
+           [ 0.0290431   0.2160573 ]]
+          [[-0.4898908  -0.31587923]
+           [-0.08369008 -0.02142242]
+           [-0.11092162 -0.42982462]
+           [-0.06499392 -0.06495637]]
+          [[-1.0213816  -1.526392  ]
+           [-1.2008414  -1.1990893 ]
+           [-1.5385206  -1.4795225 ]
+           [-1.2194707  -1.3211205 ]]]
+         [[[ 0.03942481  0.24021089]
+           [ 0.21330701  0.1940066 ]
+           [ 0.04778443  0.17912441]
+           [ 0.31488964  0.25287187]]
+          [[-0.23907584 -0.4470462 ]
+           [-0.29266903 -0.2631998 ]
+           [-0.3677222  -0.40683383]
+           [-0.11288315 -0.13154092]]
+          [[-1.5438497  -1.7834496 ]
+           [-1.431566   -1.8647819 ]
+           [-1.9812102  -1.675859  ]
+           [-1.3823645  -1.8503251 ]]]]
+        <NDArray 2x3x4x2 @cpu(0)>
+
 )code" ADD_FILELINE)
 .set_attr_parser(ParamParser<NormalizeParam>)
 .set_num_inputs(1)
diff --git a/src/operator/image/resize.cc b/src/operator/image/resize.cc
index d2397ea..2e8139f 100644
--- a/src/operator/image/resize.cc
+++ b/src/operator/image/resize.cc
@@ -35,39 +35,42 @@ DMLC_REGISTER_PARAMETER(ResizeParam);
 
 NNVM_REGISTER_OP(_image_resize)
 .add_alias("_npx__image_resize")
-.describe(R"code(Resize an image NDArray of shape (H x W x C) or (N x H x W x C) 
-to the given size
-Example:
-    .. code-block:: python
-        image = mx.nd.random.uniform(0, 255, (4, 2, 3)).astype(dtype=np.uint8)
-        mx.nd.image.resize(image, (3, 3))
-            [[[124 111 197]
-              [158  80 155]
-              [193  50 112]]
+.describe(R"code(Resize an image NDArray of shape (H x W x C) or (N x H x W x C)
+to the given size. Example:
 
-             [[110 100 113]
-              [134 165 148]
-              [157 231 182]]
+.. code-block:: python
 
-             [[202 176 134]
-              [174 191 149]
-              [147 207 164]]]
-            <NDArray 3x3x3 @cpu(0)>
-        image = mx.nd.random.uniform(0, 255, (2, 4, 2, 3)).astype(dtype=np.uint8)
-        mx.nd.image.resize(image, (2, 2))            
-            [[[[ 59 133  80]
-               [187 114 153]]
+    image = mx.nd.random.uniform(0, 255, (4, 2, 3)).astype(dtype=np.uint8)
+    mx.nd.image.resize(image, (3, 3))
+        [[[124 111 197]
+          [158  80 155]
+          [193  50 112]]
 
-              [[ 38 142  39]
-               [207 131 124]]]
+         [[110 100 113]
+          [134 165 148]
+          [157 231 182]]
 
+         [[202 176 134]
+          [174 191 149]
+          [147 207 164]]]
+        <NDArray 3x3x3 @cpu(0)>
 
-              [[[117 125 136]
-               [191 166 150]]
+    image = mx.nd.random.uniform(0, 255, (2, 4, 2, 3)).astype(dtype=np.uint8)
+    mx.nd.image.resize(image, (2, 2))
+        [[[[ 59 133  80]
+           [187 114 153]]
+
+          [[ 38 142  39]
+           [207 131 124]]]
+
+
+          [[[117 125 136]
+           [191 166 150]]
+
+          [[129  63 113]
+           [182 109  48]]]]
+        <NDArray 2x2x2x3 @cpu(0)>
 
-              [[129  63 113]
-               [182 109  48]]]]
-            <NDArray 2x2x2x3 @cpu(0)>
 )code" ADD_FILELINE)
 .set_num_inputs(1)
 .set_num_outputs(1)
diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cc b/src/operator/numpy/np_elemwise_unary_op_basic.cc
index 548f618..5031efe 100644
--- a/src/operator/numpy/np_elemwise_unary_op_basic.cc
+++ b/src/operator/numpy/np_elemwise_unary_op_basic.cc
@@ -38,8 +38,11 @@ MXNET_OPERATOR_REGISTER_UNARY(_npx_relu)
 
 MXNET_OPERATOR_REGISTER_UNARY(_npx_sigmoid)
 .describe(R"code(Computes sigmoid of x element-wise.
+
 .. math::
+
    y = 1 / (1 + exp(-x))
+
 )code" ADD_FILELINE)
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::Compute<cpu, mshadow_op::sigmoid>)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_sigmoid"});
diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index 8c3d14f..a7cc36a 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -140,38 +140,49 @@ NNVM_REGISTER_OP(Reshape)
 Given an array and a shape, this function returns a copy of the array in the new shape.
 The shape is a tuple of integers such as (2,3,4). The size of the new shape should be same as the size of the input array.
 Example::
+
   reshape([1,2,3,4], shape=(2,2)) = [[1,2], [3,4]]
+
 Some dimensions of the shape can take special values from the set {0, -1, -2, -3, -4}. The significance of each is explained below:
-- ``0``  copy this dimension from the input to the output shape.
-  Example::
-  - input shape = (2,3,4), shape = (4,0,2), output shape = (4,3,2)
-  - input shape = (2,3,4), shape = (2,0,0), output shape = (2,3,4)
+- ``0``  copy this dimension from the input to the output shape. Example::
+
+      - input shape = (2,3,4), shape = (4,0,2), output shape = (4,3,2)
+      - input shape = (2,3,4), shape = (2,0,0), output shape = (2,3,4)
+
 - ``-1`` infers the dimension of the output shape by using the remainder of the input dimensions
   keeping the size of the new array same as that of the input array.
-  At most one dimension of shape can be -1.
-  Example::
-  - input shape = (2,3,4), shape = (6,1,-1), output shape = (6,1,4)
-  - input shape = (2,3,4), shape = (3,-1,8), output shape = (3,1,8)
-  - input shape = (2,3,4), shape=(-1,), output shape = (24,)
-- ``-2`` copy all/remainder of the input dimensions to the output shape.
-  Example::
-  - input shape = (2,3,4), shape = (-2,), output shape = (2,3,4)
-  - input shape = (2,3,4), shape = (2,-2), output shape = (2,3,4)
-  - input shape = (2,3,4), shape = (-2,1,1), output shape = (2,3,4,1,1)
+  At most one dimension of shape can be -1. Example::
+
+      - input shape = (2,3,4), shape = (6,1,-1), output shape = (6,1,4)
+      - input shape = (2,3,4), shape = (3,-1,8), output shape = (3,1,8)
+      - input shape = (2,3,4), shape=(-1,), output shape = (24,)
+
+- ``-2`` copy all/remainder of the input dimensions to the output shape. Example::
+
+      - input shape = (2,3,4), shape = (-2,), output shape = (2,3,4)
+      - input shape = (2,3,4), shape = (2,-2), output shape = (2,3,4)
+      - input shape = (2,3,4), shape = (-2,1,1), output shape = (2,3,4,1,1)
+
 - ``-3`` use the product of two consecutive dimensions of the input shape as the output dimension.
   Example::
-  - input shape = (2,3,4), shape = (-3,4), output shape = (6,4)
-  - input shape = (2,3,4,5), shape = (-3,-3), output shape = (6,20)
-  - input shape = (2,3,4), shape = (0,-3), output shape = (2,12)
-  - input shape = (2,3,4), shape = (-3,-2), output shape = (6,4)
+
+      - input shape = (2,3,4), shape = (-3,4), output shape = (6,4)
+      - input shape = (2,3,4,5), shape = (-3,-3), output shape = (6,20)
+      - input shape = (2,3,4), shape = (0,-3), output shape = (2,12)
+      - input shape = (2,3,4), shape = (-3,-2), output shape = (6,4)
+
 - ``-4`` split one dimension of the input into two dimensions passed subsequent to -4 in shape (can contain -1).
   Example::
-  - input shape = (2,3,4), shape = (-4,1,2,-2), output shape =(1,2,3,4)
-  - input shape = (2,3,4), shape = (2,-4,-1,3,-2), output shape = (2,1,3,4)
+
+      - input shape = (2,3,4), shape = (-4,1,2,-2), output shape =(1,2,3,4)
+      - input shape = (2,3,4), shape = (2,-4,-1,3,-2), output shape = (2,1,3,4)
+
 If the argument `reverse` is set to 1, then the special values are inferred from right to left.
   Example::
-  - without reverse=1, for input shape = (10,5,4), shape = (-1,0), output shape would be (40,5)
-  - with reverse=1, output shape will be (50,4).
+
+      - without reverse=1, for input shape = (10,5,4), shape = (-1,0), output shape would be (40,5)
+      - with reverse=1, output shape will be (50,4).
+
 )code" ADD_FILELINE)
 .set_num_inputs(1)
 .set_num_outputs(1)
@@ -236,6 +247,7 @@ the input array into an output array of shape ``(d1, d2*...*dk)``.
 Note that the behavior of this function is different from numpy.ndarray.flatten,
 which behaves similar to mxnet.ndarray.reshape((-1,)).
 Example::
+
     x = [[
         [1,2,3],
         [4,5,6],
@@ -309,6 +321,7 @@ inline static bool TransposeStorageType(const nnvm::NodeAttrs& attrs,
 NNVM_REGISTER_OP(transpose)
 .describe(R"code(Permutes the dimensions of an array.
 Examples::
+
   x = [[ 1, 2],
        [ 3, 4]]
   transpose(x) = [[ 1.,  3.],
@@ -451,7 +464,9 @@ NNVM_REGISTER_OP(slice)
 MXNET_ADD_SPARSE_OP_ALIAS(slice)
 .add_alias("crop")
 .describe(R"code(Slices a region of the array.
+
 .. note:: ``crop`` is deprecated. Use ``slice`` instead.
+
 This function returns a sliced array between the indices given
 by `begin` and `end` with the corresponding `step`.
 For an input array of ``shape=(d_0, d_1, ..., d_n-1)``,
@@ -468,13 +483,18 @@ and `step`, the following rule will be used to set default values.
 If `s_k` is `None`, set `s_k=1`. If `s_k > 0`, set `b_k=0`, `e_k=d_k`;
 else, set `b_k=d_k-1`, `e_k=-1`.
 The storage type of ``slice`` output depends on storage types of inputs
-- slice(csr) = csr
-- otherwise, ``slice`` generates output with default storage
-.. note:: When input data storage type is csr, it only supports
+* slice(csr) = csr
+* otherwise, ``slice`` generates output with default storage
+
+.. note::
+
+   When input data storage type is csr, it only supports
    step=(), or step=(None,), or step=(1,) to generate a csr output.
    For other step parameter values, it falls back to slicing
    a dense tensor.
+
 Example::
+
   x = [[  1.,   2.,   3.,   4.],
        [  5.,   6.,   7.,   8.],
        [  9.,  10.,  11.,  12.]]
@@ -561,6 +581,7 @@ NNVM_REGISTER_OP(slice_axis)
 Returns an array slice along a given `axis` starting from the `begin` index
 to the `end` index.
 Examples::
+
   x = [[  1.,   2.,   3.,   4.],
        [  5.,   6.,   7.,   8.],
        [  9.,  10.,  11.,  12.]]
@@ -612,6 +633,7 @@ is 3.
 The following is allowed in this situation:
 `` out = slice_like(a, b, axes=(0, 2))``
 Example::
+
   x = [[  1.,   2.,   3.,   4.],
        [  5.,   6.,   7.,   8.],
        [  9.,  10.,  11.,  12.]]
@@ -663,21 +685,25 @@ MXNET_ADD_SPARSE_OP_ALIAS(clip)
 .add_alias("_npi_clip")
 .describe(R"code(Clips (limits) the values in an array.
 Given an interval, values outside the interval are clipped to the interval edges.
-Clipping ``x`` between `a_min` and `a_max` would be::
+Clipping ``x`` between `a_min` and `a_max` would be:
 .. math::
+
    clip(x, a_min, a_max) = \max(\min(x, a_max), a_min))
+
 Example::
+
     x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
     clip(x,1,8) = [ 1.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  8.]
+
 The storage type of ``clip`` output depends on storage types of inputs and the a_min, a_max \
 parameter values:
-   - clip(default) = default
-   - clip(row_sparse, a_min <= 0, a_max >= 0) = row_sparse
-   - clip(csr, a_min <= 0, a_max >= 0) = csr
-   - clip(row_sparse, a_min < 0, a_max < 0) = default
-   - clip(row_sparse, a_min > 0, a_max > 0) = default
-   - clip(csr, a_min < 0, a_max < 0) = csr
-   - clip(csr, a_min > 0, a_max > 0) = csr
+* clip(default) = default
+* clip(row_sparse, a_min <= 0, a_max >= 0) = row_sparse
+* clip(csr, a_min <= 0, a_max >= 0) = csr
+* clip(row_sparse, a_min < 0, a_max < 0) = default
+* clip(row_sparse, a_min > 0, a_max > 0) = default
+* clip(csr, a_min < 0, a_max < 0) = csr
+* clip(csr, a_min > 0, a_max > 0) = csr
 )code" ADD_FILELINE)
 .set_num_inputs(1)
 .set_num_outputs(1)
@@ -733,10 +759,13 @@ NNVM_REGISTER_OP(repeat)
 .describe(R"code(Repeats elements of an array.
 By default, ``repeat`` flattens the input array into 1-D and then repeats the
 elements::
+
   x = [[ 1, 2],
        [ 3, 4]]
   repeat(x, repeats=2) = [ 1.,  1.,  2.,  2.,  3.,  3.,  4.,  4.]
+
 The parameter ``axis`` specifies the axis along which to perform repeat::
+
   repeat(x, repeats=2, axis=1) = [[ 1.,  1.,  2.,  2.],
                                   [ 3.,  3.,  4.,  4.]]
   repeat(x, repeats=2, axis=0) = [[ 1.,  2.],
@@ -745,6 +774,7 @@ The parameter ``axis`` specifies the axis along which to perform repeat::
                                   [ 3.,  4.]]
   repeat(x, repeats=2, axis=-1) = [[ 1.,  1.,  2.,  2.],
                                    [ 3.,  3.,  4.,  4.]]
+
 )code" ADD_FILELINE)
 .set_num_outputs(1)
 .set_num_inputs(1)
@@ -777,18 +807,23 @@ NNVM_REGISTER_OP(tile)
 If ``reps`` has length *d*, and input array has dimension of *n*. There are
 three cases:
 - **n=d**. Repeat *i*-th dimension of the input by ``reps[i]`` times::
+
     x = [[1, 2],
          [3, 4]]
     tile(x, reps=(2,3)) = [[ 1.,  2.,  1.,  2.,  1.,  2.],
                            [ 3.,  4.,  3.,  4.,  3.,  4.],
                            [ 1.,  2.,  1.,  2.,  1.,  2.],
                            [ 3.,  4.,  3.,  4.,  3.,  4.]]
+
 - **n>d**. ``reps`` is promoted to length *n* by pre-pending 1's to it. Thus for
   an input shape ``(2,3)``, ``repos=(2,)`` is treated as ``(1,2)``::
+
     tile(x, reps=(2,)) = [[ 1.,  2.,  1.,  2.],
                           [ 3.,  4.,  3.,  4.]]
+
 - **n<d**. The input is promoted to be d-dimensional by prepending new axes. So a
   shape ``(2,2)`` array is promoted to ``(1,2,2)`` for 3-D replication::
+
     tile(x, reps=(2,2,3)) = [[[ 1.,  2.,  1.,  2.,  1.,  2.],
                               [ 3.,  4.,  3.,  4.,  3.,  4.],
                               [ 1.,  2.,  1.,  2.,  1.,  2.],
@@ -797,6 +832,7 @@ three cases:
                               [ 3.,  4.,  3.,  4.,  3.,  4.],
                               [ 1.,  2.,  1.,  2.,  1.,  2.],
                               [ 3.,  4.,  3.,  4.,  3.,  4.]]]
+
 )code" ADD_FILELINE)
 .set_num_outputs(1)
 .set_num_inputs(1)
@@ -827,12 +863,14 @@ NNVM_REGISTER_OP(reverse)
 .describe(R"code(Reverses the order of elements along given axis while preserving array shape.
 Note: reverse and flip are equivalent. We use reverse in the following examples.
 Examples::
+
   x = [[ 0.,  1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.,  9.]]
   reverse(x, axis=0) = [[ 5.,  6.,  7.,  8.,  9.],
                         [ 0.,  1.,  2.,  3.,  4.]]
   reverse(x, axis=1) = [[ 4.,  3.,  2.,  1.,  0.],
                         [ 9.,  8.,  7.,  6.,  5.]]
+
 )code" ADD_FILELINE)
 .set_num_outputs(1)
 .set_num_inputs(1)
@@ -871,12 +909,14 @@ The axis parameter specifies the index of the new axis in the dimensions of the
 result. For example, if axis=0 it will be the first dimension and if axis=-1 it
 will be the last dimension.
 Examples::
+
   x = [1, 2]
   y = [3, 4]
   stack(x, y) = [[1, 2],
                  [3, 4]]
   stack(x, y, axis=1) = [[1, 3],
                          [2, 4]]
+
 )code")
 .set_num_inputs([](const nnvm::NodeAttrs& attrs) {
     const StackParam& param = dmlc::get<StackParam>(attrs.parsed);
@@ -916,13 +956,14 @@ NNVM_REGISTER_OP(squeeze)
 Same behavior of defining the output tensor shape as numpy.squeeze for the most of cases.
 See the following note for exception.
 Examples::
+
   data = [[[0], [1], [2]]]
   squeeze(data) = [0, 1, 2]
   squeeze(data, axis=0) = [[0], [1], [2]]
   squeeze(data, axis=2) = [[0, 1, 2]]
   squeeze(data, axis=(0, 2)) = [0, 1, 2]
-.. Note::
-  The output of this operator will keep at least one dimension not removed. For example,
+
+.. note:: The output of this operator always keeps at least one dimension. For example,
   squeeze([[[4]]]) = [4], while in numpy.squeeze, the output will become a scalar.
 )code")
 .set_num_inputs(1)
@@ -952,15 +993,19 @@ Similar to ONNX DepthToSpace operator:
 https://github.com/onnx/onnx/blob/master/docs/Operators.md#DepthToSpace.
 The output is a new tensor where the values from depth dimension are moved in spatial blocks
 to height and width dimension. The reverse of this operation is ``space_to_depth``.
+
 .. math::
+
     \begin{gather*}
     x \prime = reshape(x, [N, block\_size, block\_size, C / (block\_size ^ 2), H * block\_size, W * block\_size]) \\
     x \prime \prime = transpose(x \prime, [0, 3, 4, 1, 5, 2]) \\
     y = reshape(x \prime \prime, [N, C / (block\_size ^ 2), H * block\_size, W * block\_size])
     \end{gather*}
+
 where :math:`x` is an input tensor with default layout as :math:`[N, C, H, W]`: [batch, channels, height, width]
 and :math:`y` is the output tensor of layout :math:`[N, C / (block\_size ^ 2), H * block\_size, W * block\_size]`
 Example::
+
   x = [[[[0, 1, 2],
          [3, 4, 5]],
         [[6, 7, 8],
@@ -973,6 +1018,7 @@ Example::
                             [12, 18, 13, 19, 14, 20],
                             [3, 9, 4, 10, 5, 11],
                             [15, 21, 16, 22, 17, 23]]]]
+
 )code" ADD_FILELINE)
 .set_attr_parser(ParamParser<DepthToSpaceParam>)
 .set_num_inputs(1)
@@ -1000,14 +1046,17 @@ https://github.com/onnx/onnx/blob/master/docs/Operators.md#SpaceToDepth
 The output is a new tensor where the values from height and width dimension are
 moved to the depth dimension. The reverse of this operation is ``depth_to_space``.
 .. math::
+
     \begin{gather*}
     x \prime = reshape(x, [N, C, H / block\_size, block\_size, W / block\_size, block\_size]) \\
     x \prime \prime = transpose(x \prime, [0, 3, 5, 1, 2, 4]) \\
     y = reshape(x \prime \prime, [N, C * (block\_size ^ 2), H / block\_size, W / block\_size])
     \end{gather*}
+
 where :math:`x` is an input tensor with default layout as :math:`[N, C, H, W]`: [batch, channels, height, width]
 and :math:`y` is the output tensor of layout :math:`[N, C * (block\_size ^ 2), H / block\_size, W / block\_size]`
 Example::
+
   x = [[[[0, 6, 1, 7, 2, 8],
          [12, 18, 13, 19, 14, 20],
          [3, 9, 4, 10, 5, 11],
@@ -1020,6 +1069,7 @@ Example::
                             [15, 16, 17]],
                            [[18, 19, 20],
                             [21, 22, 23]]]]
+
 )code" ADD_FILELINE)
 .set_attr_parser(ParamParser<DepthToSpaceParam>)
 .set_num_inputs(1)
@@ -1045,6 +1095,7 @@ NNVM_REGISTER_OP(_split_v2)
 .add_alias("_npi_array_split")
 .describe(R"code(Splits an array along a particular axis into multiple sub-arrays.
 Example::
+
    x  = [[[ 1.]
           [ 2.]]
          [[ 3.]
@@ -1077,11 +1128,13 @@ Example::
          [6.]]]
   w[0].shape = (1, 2, 1)
   w[1].shape = (2, 2, 1)
+
 `squeeze_axis=True` removes the axis with length 1 from the shapes of the output arrays.
 **Note** that setting `squeeze_axis` to ``1`` removes axis with length 1 only
 along the `axis` which it is split.
 Also `squeeze_axis` can be set to true only if ``input.shape[axis] == indices_or_sections``.
 Example::
+
    z = split_v2(x, axis=0, indices_or_sections=3, squeeze_axis=1) // a list of 3 arrays with shape (2, 1)
    z = [[ 1.]
         [ 2.]]
@@ -1090,6 +1143,7 @@ Example::
        [[ 5.]
         [ 6.]]
    z[0].shape = (2, 1)
+
 )code" ADD_FILELINE)
 .set_attr_parser(ParamParser<SplitParam>)
 .set_num_inputs(1)
diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py
index fed197c..07fe364 100644
--- a/tests/python/unittest/test_numpy_interoperability.py
+++ b/tests/python/unittest/test_numpy_interoperability.py
@@ -2397,6 +2397,13 @@ def _add_workload_cov():
     OpArgMngr.add_workload('cov', y, rowvar=False, bias=True)
 
 
+def _add_workload_cumprod():
+    a = np.array([[1, 2], [3, 5]])
+    OpArgMngr.add_workload('cumprod', a)
+    OpArgMngr.add_workload('cumprod', a, axis=0)
+    OpArgMngr.add_workload('cumprod', a, axis=1)
+
+
 def _add_workload_digitize():
     a = np.array([1, 2, 3, 4])
     b = np.array([1, 3])
@@ -2660,6 +2667,40 @@ def _add_workload_nanquantile():
     OpArgMngr.add_workload('nanquantile', a, 0.4, interpolation='nearest')
 
 
+def _add_workload_nanstd():
+    OpArgMngr.add_workload('nanstd', np.random.uniform(size=(4, 1)))
+    A = np.array([[1, 2, 3], [4, np.nan, 6]])
+    OpArgMngr.add_workload('nanstd', A)
+    OpArgMngr.add_workload('nanstd', A, 0)
+    OpArgMngr.add_workload('nanstd', A, 1)
+    OpArgMngr.add_workload('nanstd', np.array([1, -1, 1, -1]))
+    OpArgMngr.add_workload('nanstd', np.array([1, -1, 1, -1]), ddof=1)
+    OpArgMngr.add_workload('nanstd', np.array([1, -1, 1, -1]), ddof=2)
+    OpArgMngr.add_workload('nanstd', np.arange(10), out=np.array(0.))
+
+
+def _add_workload_nansum():
+    a = 1
+    b = np.array([1, np.nan])
+    c = np.array([[1, 2], [3, np.nan]])
+    OpArgMngr.add_workload('nansum', a)
+    OpArgMngr.add_workload('nansum', b)
+    OpArgMngr.add_workload('nansum', c)
+    OpArgMngr.add_workload('nansum', c, axis=0)
+
+
+def _add_workload_nanvar():
+    OpArgMngr.add_workload('nanvar', np.random.uniform(size=(4, 1)))
+    A = np.array([[1, 2, 3], [4, np.nan, 6]])
+    OpArgMngr.add_workload('nanvar', A)
+    OpArgMngr.add_workload('nanvar', A, 0)
+    OpArgMngr.add_workload('nanvar', A, 1)
+    OpArgMngr.add_workload('nanvar', np.array([1, -1, 1, -1]))
+    OpArgMngr.add_workload('nanvar', np.array([1, -1, 1, -1]), ddof=1)
+    OpArgMngr.add_workload('nanvar', np.array([1, -1, 1, -1]), ddof=2)
+    OpArgMngr.add_workload('nanvar', np.arange(10), out=np.array(0.))
+
+
 def _add_workload_ndim():
     a = 1
     b = np.array([[1,2,3],[4,5,6]])
@@ -2698,6 +2739,10 @@ def _add_workload_packbits():
     OpArgMngr.add_workload('packbits', a, bitorder='little')
 
 
+def _add_workload_pmt():
+    OpArgMngr.add_workload('pmt', 0.1 / 12, 1, 60, 55000)
+
+
 def _add_workload_poly():
     a = np.array([3, -np.sqrt(2), np.sqrt(2)])
     b = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 0]])
@@ -3118,6 +3163,7 @@ def _prepare_workloads():
     _add_workload_correlate()
     _add_workload_count_nonzero()
     _add_workload_cov()
+    _add_workload_cumprod()
     _add_workload_digitize()
     _add_workload_divmod()
     _add_workload_extract()
@@ -3149,12 +3195,16 @@ def _prepare_workloads():
     _add_workload_nanpercentile()
     _add_workload_nanprod()
     _add_workload_nanquantile()
+    _add_workload_nanstd()
+    _add_workload_nansum()
+    _add_workload_nanvar()
     _add_workload_ndim()
     _add_workload_npv()
+    _add_workload_packbits()
     _add_workload_pad()
     _add_workload_partition()
     _add_workload_piecewise()
-    _add_workload_packbits()
+    _add_workload_pmt()
     _add_workload_poly()
     _add_workload_polyadd()
     _add_workload_polydiv()
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index 3a71595..580478e 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -170,7 +170,7 @@ def test_np_tensordot(a_shape, b_shape, axes, hybridize, dtype):
         mx_sym = mx.sym.np.tensordot(a_sym, b_sym, axes).as_nd_ndarray()
         check_numeric_gradient(mx_sym, [a.as_nd_ndarray(), b.as_nd_ndarray()],
           rtol=1e-1, atol=1e-1, dtype = dtype)
-    
+
     # General Gradient Test
     for a_grad_status in ['add', 'write']:
         for b_grad_status in ['add', 'write']:
@@ -5749,7 +5749,25 @@ def test_np_linalg_norm():
 
 @with_seed()
 @use_np
-def test_np_linalg_svd():
+@pytest.mark.parametrize('shape', [
+    (3, 3),
+    (3, 5),
+    (4, 4),
+    (4, 5),
+    (5, 5),
+    (5, 6),
+    (6, 6),
+    (0, 1),
+    (6, 5, 6),
+    (2, 3, 3, 4),
+    (4, 2, 1, 2),
+    (0, 5, 3, 3),
+    (5, 0, 3, 3),
+    (3, 3, 0, 0),
+])
+@pytest.mark.parametrize('dtype', ['float32', 'float64'])
+@pytest.mark.parametrize('hybridize', [False, True])
+def test_np_linalg_svd(shape, dtype, hybridize):
     class TestSVD(HybridBlock):
         def __init__(self):
             super(TestSVD, self).__init__()
@@ -5799,60 +5817,40 @@ def test_np_linalg_svd():
         assert I.shape == I_np.shape
         assert_almost_equal(I, I_np, rtol=rtol, atol=atol)
 
-    shapes = [
-        (3, 3),
-        (3, 5),
-        (4, 4),
-        (4, 5),
-        (5, 5),
-        (5, 6),
-        (6, 6),
-        (0, 1),
-        (6, 5, 6),
-        (2, 3, 3, 4),
-        (4, 2, 1, 2),
-        (0, 5, 3, 3),
-        (5, 0, 3, 3),
-        (3, 3, 0, 0),
-    ]
-    dtypes = ['float32', 'float64']
-    for hybridize in [True, False]:
-        for dtype in dtypes:
-            for shape in shapes:
-                rtol = atol = 0.01
-                test_svd = TestSVD()
-                if hybridize:
-                    test_svd.hybridize()
-                data_np = _np.random.uniform(-10.0, 10.0, shape)
-                data_np = _np.array(data_np, dtype=dtype)
-                data = np.array(data_np, dtype=dtype)
-                if effective_dtype(data) == np.dtype(np.float16):
-                    continue
-                data.attach_grad()
-                with mx.autograd.record():
-                    ret = test_svd(data)
-                UT = ret[0].asnumpy()
-                L = ret[1].asnumpy()
-                V = ret[2].asnumpy()
-                # check svd validity
-                check_svd(UT, L, V, data_np)
-                # check descending singular values
-                s = [L[..., i] - L[..., i + 1] for i in range(L.shape[-1] - 1)]
-                s = _np.array(s)
-                assert (s >= -1e-5).all()
-                if L.size > 0:
-                    assert (L[..., -1] >= -1e-5).all()
-                # check backward
-                mx.autograd.backward(ret)
-                if ((s > 1e-5).all() and (L.size == 0 or (L > 1e-5).all())):
-                    backward_expected = get_grad(ret[0].asnumpy(), ret[1].asnumpy(), ret[2].asnumpy())
-                    assert_almost_equal(data.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)
-                # Test imperative once again
-                ret = np.linalg.svd(data)
-                UT = ret[0].asnumpy()
-                L = ret[1].asnumpy()
-                V = ret[2].asnumpy()
-                check_svd(UT, L, V, data_np)
+    rtol = atol = 0.01
+    test_svd = TestSVD()
+    if hybridize:
+        test_svd.hybridize()
+    data_np = _np.random.uniform(-10.0, 10.0, shape)
+    data_np = _np.array(data_np, dtype=dtype)
+    data = np.array(data_np, dtype=dtype)
+    if effective_dtype(data) == _np.dtype(np.float16):
+        pytest.skip()
+    data.attach_grad()
+    with mx.autograd.record():
+        ret = test_svd(data)
+    UT = ret[0].asnumpy()
+    L = ret[1].asnumpy()
+    V = ret[2].asnumpy()
+    # check svd validity
+    check_svd(UT, L, V, data_np)
+    # check descending singular values
+    s = [L[..., i] - L[..., i + 1] for i in range(L.shape[-1] - 1)]
+    s = _np.array(s)
+    assert (s >= -1e-5).all()
+    if L.size > 0:
+        assert (L[..., -1] >= -1e-5).all()
+    # check backward
+    mx.autograd.backward(ret)
+    if ((s > 1e-5).all() and (L.size == 0 or (L > 1e-5).all())):
+        backward_expected = get_grad(ret[0].asnumpy(), ret[1].asnumpy(), ret[2].asnumpy())
+        assert_almost_equal(data.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol)
+    # Test imperative once again
+    ret = np.linalg.svd(data)
+    UT = ret[0].asnumpy()
+    L = ret[1].asnumpy()
+    V = ret[2].asnumpy()
+    check_svd(UT, L, V, data_np)
 
 
 @with_seed()
diff --git a/tools/license_header.py b/tools/license_header.py
index 193ec1e..d4c89c0 100755
--- a/tools/license_header.py
+++ b/tools/license_header.py
@@ -126,7 +126,7 @@ _LANGS = {'.cc':'*', '.h':'*', '.cu':'*', '.cuh':'*', '.py':'#',
           '.java':'*', '.sh':'#', '.cpp':'*', '.hpp':'*', '.c':'*',
           '.bat':'rem', '.pl':'#', '.m':'%', '.R':'#', '.mk':'#', '.cfg':'#',
           '.t':'#', '.ps1':'#', '.jl':'#', '.clj':';;', '.pyx':'#', '.js':'*',
-          '.md':'<!---'}
+          '.md':'<!---', '.rst':'.. '}
 
 # Previous license header, which will be removed
 _OLD_LICENSE = re.compile('.*Copyright.*by Contributors')
@@ -305,7 +305,7 @@ def main():
 
     if action == 'check':
         logging.info("Start to check %d files", (len(files)))
-        if False in list(map(file_has_license, files)):
+        if False in [file_has_license(f) for f in files if os.path.exists(f)]:
             return 1
         else:
             logging.info("All known and whitelisted files have license")