Posted to commits@mxnet.apache.org by zh...@apache.org on 2021/07/16 21:32:17 UTC

[incubator-mxnet] branch master updated: [ONNX] Forward port new mx2onnx into master (#20355)

This is an automated email from the ASF dual-hosted git repository.

zha0q1 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 3480ba2  [ONNX] Forward port new mx2onnx into master (#20355)
3480ba2 is described below

commit 3480ba2c6df02bb907d3a975d354efa8697c4e71
Author: Zhaoqi Zhu <zh...@gmail.com>
AuthorDate: Fri Jul 16 14:30:24 2021 -0700

    [ONNX] Forward port new mx2onnx into master (#20355)
    
    * initial: forward port mx2onnx and remove onnx2mx
    
    * fix sanity
    
    * add onnx operator unit tests
    
    * add test file
    
    * add model test
    
    * fix license & doc
    
    * fix
    
    * marching toward 2.0
    
    * fix typo
    
    * add more ops
    
    * more ops
    
    * more ops
    
    * more ops
    
    * fix softmax and sanity
    
    * more ops
    
    * more ops
    
    * more ops
    
    * naming
    
    * more ops
    
    * more ops
    
    * more ops and bug fix
    
    * more ops and skip unvisited tests
    
    * fix sanity
    
    * fix for onnx18
    
    * more ops
    
    * fix
    
    * fix onnx 18
    
    * more ops
    
    * skip model test
    
    * update README
    
    * more ops
    
    * more ops
    
    * more ops
    
    * more ops
    
    * more ops
    
    * Update test_models.py
---
 LICENSE                                            |    5 +-
 ci/docker/install/requirements                     |    4 +-
 ci/docker/runtime_functions.sh                     |   10 +
 ci/jenkins/Jenkins_steps.groovy                    |   22 +
 ci/jenkins/Jenkinsfile_unix_cpu                    |    1 +
 python/mxnet/contrib/onnx/__init__.py              |   11 +-
 .../mxnet/contrib/onnx/mx2onnx/_op_translations.py | 2629 ----------
 python/mxnet/contrib/onnx/mx2onnx/export_model.py  |  101 -
 python/mxnet/contrib/onnx/mx2onnx/export_onnx.py   |  321 --
 .../mxnet/contrib/onnx/onnx2mx/_import_helper.py   |  148 -
 .../mxnet/contrib/onnx/onnx2mx/_op_translations.py |  818 ---
 .../contrib/onnx/onnx2mx/_translation_utils.py     |  192 -
 python/mxnet/contrib/onnx/onnx2mx/import_model.py  |   94 -
 python/mxnet/contrib/onnx/onnx2mx/import_onnx.py   |  230 -
 .../mxnet/contrib/onnx/onnx2mx/import_to_gluon.py  |   54 -
 python/mxnet/onnx/README.md                        |   97 +
 .../{contrib/onnx/mx2onnx => onnx}/__init__.py     |    4 +-
 python/mxnet/{contrib => }/onnx/mx2onnx/LICENSE    |    0
 .../mxnet/{contrib => }/onnx/mx2onnx/__init__.py   |    6 +-
 .../{contrib => }/onnx/mx2onnx/_export_helper.py   |    0
 python/mxnet/onnx/mx2onnx/_export_model.py         |  163 +
 python/mxnet/onnx/mx2onnx/_export_onnx.py          |  455 ++
 .../mx2onnx/_op_translations}/__init__.py          |    8 +-
 .../_op_translations/_op_translations_opset12.py   | 5349 ++++++++++++++++++++
 .../_op_translations/_op_translations_opset13.py   | 2060 ++++++++
 .../onnx/mx2onnx/__init__.py => onnx/setup.py}     |   28 +-
 tests/python/onnx/test_models.py                   |   67 +
 tests/python/onnx/test_operators.py                | 1917 +++++++
 tools/license_header.py                            |    5 +
 29 files changed, 10190 insertions(+), 4609 deletions(-)

diff --git a/LICENSE b/LICENSE
index 13c9371..1709907 100644
--- a/LICENSE
+++ b/LICENSE
@@ -311,8 +311,9 @@
     Apache-2.0 license + 3-clause BSD license
     =======================================================================================
 
-    python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
-    python/mxnet/contrib/onnx/mx2onnx/export_onnx.py
+    python/mxnet/onnx/mx2onnx/_export_onnx.py
+    python/mxnet/onnx/mx2onnx/_op_translations/_op_translations_opset12.py
+    python/mxnet/onnx/mx2onnx/_op_translations/_op_translations_opset13.py
 
     =======================================================================================
     Apache-2.0 license + MIT License
diff --git a/ci/docker/install/requirements b/ci/docker/install/requirements
index e3d90d9..5d051b5 100644
--- a/ci/docker/install/requirements
+++ b/ci/docker/install/requirements
@@ -25,8 +25,8 @@ graphviz<0.9.0,>=0.8.1
 contextvars;python_version<"3.7"
 
 # Optional dependencies
-onnx==1.7.0
-onnxruntime==1.4.0
+onnx==1.8.0
+onnxruntime==1.7.0
 protobuf==3.14.0
 scipy==1.4.1
 tabulate==0.7.5
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index d0f2cd0..70934f6 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -767,6 +767,16 @@ cd_unittest_ubuntu() {
     fi
 }
 
+unittest_ubuntu_python3_cpu_onnx() {
+    set -ex
+    export PYTHONPATH=./python/
+    export MXNET_SUBGRAPH_VERBOSE=0
+    export DMLC_LOG_STACK_TRACE_DEPTH=10
+
+    pytest --cov-report xml:onnx_unittest.xml --verbose tests/python/onnx/test_operators.py
+    pytest --cov-report xml:onnx_unittest.xml --cov-append --verbose tests/python/onnx/test_models.py
+}
+
 unittest_ubuntu_python3_cpu() {
     set -ex
     export PYTHONPATH=./python/
diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index d3ad361..ac83c10 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -48,6 +48,12 @@ def python3_ut(docker_container_name) {
   }
 }
 
+def python3_ut_onnx(docker_container_name) {
+  timeout(time: max_time, unit: 'MINUTES') {
+    utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu_onnx', false)
+  }
+}
+
 def python3_ut_onednn(docker_container_name) {
   timeout(time: max_time, unit: 'MINUTES') {
     utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu_onednn', false)
@@ -724,6 +730,22 @@ def test_unix_python3_cpu_no_tvm_op(lib_name) {
     }]
 }
 
+def test_unix_python3_onnx_cpu(lib_name) {
+    return ['Python3: ONNX-CPU': {
+      node(NODE_LINUX_CPU) {
+        ws('workspace/ut-python3-onnx-cpu') {
+          try {
+            utils.unpack_and_init(lib_name, mx_lib, true)
+            python3_ut_onnx('ubuntu_cpu')
+            utils.publish_test_coverage()
+          } finally {
+            utils.collect_test_results_unix('onnx_unittest.xml', 'tests_python3_onnx_cpu_unittest.xml')
+          }
+        }
+      }
+    }]
+}
+
 def test_unix_python3_onednn_cpu(lib_name) {
     return ['Python3: ONEDNN-CPU': {
       node(NODE_LINUX_CPU) {
diff --git a/ci/jenkins/Jenkinsfile_unix_cpu b/ci/jenkins/Jenkinsfile_unix_cpu
index 7cc70d4..9e189b3 100644
--- a/ci/jenkins/Jenkinsfile_unix_cpu
+++ b/ci/jenkins/Jenkinsfile_unix_cpu
@@ -45,6 +45,7 @@ core_logic: {
 
   utils.parallel_stage('Tests', [
     custom_steps.test_unix_python3_cpu('cpu'),
+    custom_steps.test_unix_python3_onnx_cpu('cpu'),
     custom_steps.test_unix_python3_mkl_cpu('cpu_mkl'),
     custom_steps.test_unix_python3_onednn_cpu('onednn_cpu'),
     custom_steps.test_unix_python3_onednn_mkl_cpu('onednn_mkl_cpu'),
diff --git a/python/mxnet/contrib/onnx/__init__.py b/python/mxnet/contrib/onnx/__init__.py
index 9f27060..97266df 100644
--- a/python/mxnet/contrib/onnx/__init__.py
+++ b/python/mxnet/contrib/onnx/__init__.py
@@ -16,6 +16,11 @@
 # under the License.
 """Module for ONNX model format support for Apache MXNet."""
 
-from .onnx2mx.import_model import import_model, get_model_metadata
-from .onnx2mx.import_to_gluon import import_to_gluon
-from .mx2onnx.export_model import export_model
+from ...onnx import export_model as export_model_
+
+def export_model(*args, **kwargs):
+    print('Calling mxnet.contrib.onnx.export_model...')
+    print('Please be advised that the ONNX module has been moved to mxnet.onnx and '
+          'mxnet.onnx.export_model is the preferred path. The current path will be deprecated '
+          'in the upcoming MXNet v1.10 release.')
+    return export_model_(*args, **kwargs)
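
For reference, a minimal usage sketch of the relocated entry point (an
illustrative aside, not part of the patch; file names are hypothetical and
the full argument list is documented in python/mxnet/onnx/README.md added by
this commit):

    import mxnet as mx
    import numpy as np

    sym = './model-symbol.json'      # hypothetical exported symbol file
    params = './model-0000.params'   # hypothetical parameter file

    # Preferred path after this commit:
    mx.onnx.export_model(sym, params, [(1, 3, 224, 224)], [np.float32],
                         'model.onnx')

    # Deprecated path: still works, but prints the notice above first.
    mx.contrib.onnx.export_model(sym, params, [(1, 3, 224, 224)],
                                 [np.float32], 'model.onnx')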
diff --git a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
deleted file mode 100644
index d46d7f4..0000000
--- a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
+++ /dev/null
@@ -1,2629 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# Based on
-#  https://github.com/NVIDIA/mxnet_to_onnx/blob/master/mx2onnx_converter/
-# mx2onnx_converter_functions.py
-#  Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
-#
-#  Redistribution and use in source and binary forms, with or without
-#  modification, are permitted provided that the following conditions
-#  are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-#  PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-#  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-# coding: utf-8
-# pylint: disable=too-many-locals,no-else-return,too-many-lines
-# pylint: disable=anomalous-backslash-in-string,eval-used
-"""
-Conversion Functions for common layers.
-Add new functions here with a decorator.
-"""
-
-import re
-import logging
-import numpy as np
-from .export_onnx import MXNetGraph as mx_op
-try:
-    import onnx
-except ImportError:
-    onnx = None
-
-
-def parse_helper(attrs, attrs_name, alt_value=None):
-    """Helper function to parse operator attributes in required format."""
-    tuple_re = re.compile('\([0-9L|,| ]+\)')
-    if not attrs:
-        return alt_value
-    attrs_str = None if attrs.get(attrs_name) is None else str(attrs.get(attrs_name))
-    if attrs_str is None:
-        return alt_value
-    attrs_match = tuple_re.search(attrs_str)
-    if attrs_match is not None:
-        if attrs_match.span() == (0, len(attrs_str)):
-            dims = eval(attrs_str)
-            return dims
-        else:
-            raise AttributeError("Malformed %s dimensions: %s" % (attrs_name, str(attrs_str)))
-    return alt_value
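
A quick worked example of the parsing above (an illustrative aside, not part
of the patch):

    parse_helper({'kernel': '(3, 3)'}, 'kernel')    # -> (3, 3)
    parse_helper({}, 'stride', alt_value=[1, 1])    # -> [1, 1]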
-
-def transform_padding(pad_width):
-    """Helper function to convert padding format for pad operator.
-    """
-    num_pad_values = len(pad_width)
-    onnx_pad_width = [0]*num_pad_values
-
-    start_index = 0
-    # num_pad_values will always be multiple of 2
-    end_index = int(num_pad_values/2)
-    for idx in range(0, num_pad_values):
-        if idx % 2 == 0:
-            onnx_pad_width[start_index] = pad_width[idx]
-            start_index += 1
-        else:
-            onnx_pad_width[end_index] = pad_width[idx]
-            end_index += 1
-
-    return onnx_pad_width
-
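In one concrete case (an illustrative aside, not part of the patch): MXNet
orders pad widths as (begin, end) pairs per axis, while ONNX wants all begins
followed by all ends, so

    transform_padding([1, 2, 3, 4])   # MXNet layout (b0, e0, b1, e1)
    # -> [1, 3, 2, 4]                 # ONNX  layout (b0, b1, e0, e1)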
-
-def convert_string_to_list(string_val):
-    """Helper function to convert string to list.
-     Used to convert shape attribute string to list format.
-    """
-    result_list = []
-
-    list_string = string_val.split(',')
-    for val in list_string:
-        val = str(val.strip())
-        val = val.replace("(", "")
-        val = val.replace(")", "")
-        val = val.replace("L", "")
-        val = val.replace("[", "")
-        val = val.replace("]", "")
-        if val not in ("", "None"):
-            result_list.append(int(val))
-
-    return result_list
-
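For example (illustrative, not part of the patch):

    convert_string_to_list('(3L, 224L, 224L)')   # -> [3, 224, 224]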
-
-def get_boolean_attribute_value(attrs, attr_name):
-    """ Helper function to convert a string version
-    of Boolean attributes to integer for ONNX.
-    Takes attribute dictionary and attr_name as
-    parameters.
-    """
-    return 1 if attrs.get(attr_name, 0) in ["True", "1"] else 0
-
-
-def get_inputs(node, kwargs, with_shapes=False):
-    """Helper function to get inputs"""
-    name = node["name"]
-    proc_nodes = kwargs["proc_nodes"]
-    index_lookup = kwargs["index_lookup"]
-    graph_shapes = kwargs["graph_shapes"]
-    inputs = node["inputs"]
-    attrs = node.get("attrs", {})
-
-    input_nodes = []
-    input_shapes = []
-    for ip in inputs:
-        input_node_id = index_lookup[ip[0]]
-        try:
-            # ip[1] defines which output index to use
-            input_nodes.append(proc_nodes[input_node_id].output[ip[1]])
-        except AttributeError:
-            # fallback to the name attribute as output if the output attribute does not exist (e.g. for data nodes)
-            input_nodes.append(proc_nodes[input_node_id].name)
-
-        input_shapes.append(graph_shapes.get(input_nodes[-1]))
-
-    if with_shapes:
-        return name, input_nodes, input_shapes, attrs
-
-    return name, input_nodes, attrs
-
-
-def create_basic_op_node(op_name, node, kwargs):
-    """Helper function to create a basic operator
-    node that doesn't contain op specific attrs"""
-    name, input_nodes, _ = get_inputs(node, kwargs)
-
-    node = onnx.helper.make_node(
-        op_name,
-        input_nodes,
-        [name],
-        name=name
-    )
-    return [node]
-
-
-@mx_op.register("null")
-def convert_weights_and_inputs(node, **kwargs):
-    """Helper function to convert weights and inputs.
-    """
-    name, _, _ = get_inputs(node, kwargs)
-
-    if kwargs["is_input"] is False:
-        weights = kwargs["weights"]
-        initializer = kwargs["initializer"]
-        np_arr = weights[name]
-        data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[np_arr.dtype]
-        dims = np.shape(np_arr)
-
-        tensor_node = onnx.helper.make_tensor_value_info(name, data_type, dims)
-
-        initializer.append(
-            onnx.helper.make_tensor(
-                name=name,
-                data_type=data_type,
-                dims=dims,
-                vals=np_arr.flatten().tolist(),
-                raw=False
-            )
-        )
-
-        return [tensor_node]
-    else:
-        tval_node = onnx.helper.make_tensor_value_info(name, kwargs["in_type"], kwargs["in_shape"])
-        return [tval_node]
-
-
-@mx_op.register("Convolution")
-def convert_convolution(node, **kwargs):
-    """Map MXNet's convolution operator attributes to onnx's Conv operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    kernel_dims = list(parse_helper(attrs, "kernel"))
-    stride_dims = list(parse_helper(attrs, "stride", [1, 1]))
-    pad_dims = list(parse_helper(attrs, "pad", [0, 0]))
-    num_group = int(attrs.get("num_group", 1))
-    dilations = list(parse_helper(attrs, "dilate", [1, 1]))
-
-    pad_dims = pad_dims + pad_dims
-
-    conv_node = onnx.helper.make_node(
-        "Conv",
-        inputs=input_nodes,
-        outputs=[name],
-        kernel_shape=kernel_dims,
-        strides=stride_dims,
-        dilations=dilations,
-        pads=pad_dims,
-        group=num_group,
-        name=name
-    )
-
-    return [conv_node]
-
-
-@mx_op.register("Deconvolution")
-def convert_deconvolution(node, **kwargs):
-    """Map MXNet's deconvolution operator attributes to onnx's ConvTranspose operator
-    and return the created node.
-    """
-    name, inputs, attrs = get_inputs(node, kwargs)
-
-    kernel_dims = list(parse_helper(attrs, "kernel"))
-    stride_dims = list(parse_helper(attrs, "stride", [1, 1]))
-    pad_dims = list(parse_helper(attrs, "pad", [0, 0]))
-    num_group = int(attrs.get("num_group", 1))
-    dilations = list(parse_helper(attrs, "dilate", [1, 1]))
-    adj_dims = list(parse_helper(attrs, "adj", [0, 0]))
-
-    pad_dims = pad_dims + pad_dims
-
-    deconv_node = onnx.helper.make_node(
-        "ConvTranspose",
-        inputs=inputs,
-        outputs=[name],
-        kernel_shape=kernel_dims,
-        strides=stride_dims,
-        dilations=dilations,
-        output_padding=adj_dims,
-        pads=pad_dims,
-        group=num_group,
-        name=name
-    )
-
-    return [deconv_node]
-
-
-@mx_op.register("Crop")
-def convert_crop(node, **kwargs):
-    """Map MXNet's crop operator attributes to onnx's Crop operator
-    and return the created node.
-    """
-    name, inputs, attrs = get_inputs(node, kwargs)
-    num_inputs = len(inputs)
-
-    y, x = list(parse_helper(attrs, "offset", [0, 0]))
-    h, w = list(parse_helper(attrs, "h_w", [0, 0]))
-    if num_inputs > 1:
-        h, w = kwargs["out_shape"][-2:]
-    border = [x, y, x + w, y + h]
-
-    crop_node = onnx.helper.make_node(
-        "Crop",
-        inputs=[inputs[0]],
-        outputs=[name],
-        border=border,
-        scale=[1, 1],
-        name=name
-    )
-
-    logging.warning(
-        "Using an experimental ONNX operator: Crop. " \
-        "Its definition can change.")
-
-    return [crop_node]
-
-
-@mx_op.register("FullyConnected")
-def convert_fully_connected(node, **kwargs):
-    """Map MXNet's FullyConnected operator attributes to onnx's Gemm operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    initializer = kwargs["initializer"]
-
-    no_bias = get_boolean_attribute_value(attrs, "no_bias")
-
-    fcnode = []
-
-    op_name = "flatten_" + str(kwargs["idx"])
-    flatten_node = onnx.helper.make_node(
-        'Flatten',
-        inputs=[input_nodes[0]],
-        outputs=[op_name],
-        name=op_name
-    )
-
-    input_nodes[0] = op_name
-    fcnode.append(flatten_node)
-
-    if no_bias:
-        data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype('int64')]
-        bias_name = "bias" + str(kwargs["idx"])
-        tensor_node = onnx.helper.make_tensor_value_info(bias_name, data_type, (1,))
-        initializer.append(
-            onnx.helper.make_tensor(
-                name=bias_name,
-                data_type=data_type,
-                dims=(1,),
-                vals=[0],
-                raw=False,
-            )
-        )
-        input_nodes.append(bias_name)
-        fcnode.append(tensor_node)
-
-    node = onnx.helper.make_node(
-        "Gemm",
-        input_nodes,  # input (A, B, C) - C can be in place
-        [name],  # output
-        alpha=1.0,
-        beta=1.0,
-        transA=False,
-        transB=True,
-        name=name
-    )
-
-    fcnode.append(node)
-
-    return fcnode
-
-
-@mx_op.register("BatchNorm")
-def convert_batchnorm(node, **kwargs):
-    """Map MXNet's BatchNorm operator attributes to onnx's BatchNormalization operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    momentum = float(attrs.get("momentum", 0.9))
-    eps = float(attrs.get("eps", 0.001))
-
-    bn_node = onnx.helper.make_node(
-        "BatchNormalization",
-        input_nodes,
-        [name],
-        name=name,
-        epsilon=eps,
-        momentum=momentum
-        # MXNet computes mean and variance per channel for batchnorm.
-        # Default for onnx is across all spatial features. Relying on default
-        # ONNX behavior of spatial=1 for ONNX opset 8 and below. As the spatial
-        # attribute is deprecated in opset 9 and above, not explicitly encoding it.
-    )
-    return [bn_node]
-
-
-@mx_op.register("tanh")
-def convert_tanh(node, **kwargs):
-    """Map MXNet's tanh operator attributes to onnx's Tanh operator
-    and return the created node.
-    """
-    return create_basic_op_node('Tanh', node, kwargs)
-
-@mx_op.register("cos")
-def convert_cos(node, **kwargs):
-    """Map MXNet's cos operator attributes to onnx's Cos operator
-    and return the created node.
-    """
-    return create_basic_op_node('Cos', node, kwargs)
-
-@mx_op.register("sin")
-def convert_sin(node, **kwargs):
-    """Map MXNet's sin operator attributes to onnx's Sin operator
-    and return the created node.
-    """
-    return create_basic_op_node('Sin', node, kwargs)
-
-@mx_op.register("tan")
-def convert_tan(node, **kwargs):
-    """Map MXNet's tan operator attributes to onnx's tan operator
-    and return the created node.
-    """
-    return create_basic_op_node('Tan', node, kwargs)
-
-@mx_op.register("arccos")
-def convert_acos(node, **kwargs):
-    """Map MXNet's acos operator attributes to onnx's acos operator
-    and return the created node.
-    """
-    return create_basic_op_node('Acos', node, kwargs)
-
-@mx_op.register("arcsin")
-def convert_asin(node, **kwargs):
-    """Map MXNet's asin operator attributes to onnx's asin operator
-    and return the created node.
-    """
-    return create_basic_op_node('Asin', node, kwargs)
-
-@mx_op.register("arctan")
-def convert_atan(node, **kwargs):
-    """Map MXNet's atan operator attributes to onnx's atan operator
-    and return the created node.
-    """
-    return create_basic_op_node('Atan', node, kwargs)
-
-#Basic neural network functions
-@mx_op.register("sigmoid")
-def convert_sigmoid(node, **kwargs):
-    """Map MXNet's sigmoid operator attributes to onnx's Sigmoid operator
-    and return the created node.
-    """
-    return create_basic_op_node('Sigmoid', node, kwargs)
-
-@mx_op.register("relu")
-def convert_relu(node, **kwargs):
-    """Map MXNet's relu operator attributes to onnx's Relu operator
-    and return the created node.
-    """
-    return create_basic_op_node('Relu', node, kwargs)
-
-@mx_op.register("Activation")
-def convert_activation(node, **kwargs):
-    """Map MXNet's Activation operator attributes to onnx's Tanh/Relu operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    act_type = attrs["act_type"]
-
-    # Creating a dictionary here; if this titlecase pattern held for every
-    # activation, this could be simplified to mxnet_name.title().
-    act_types = {
-        "tanh": "Tanh",
-        "relu": "Relu",
-        "sigmoid": "Sigmoid",
-        "softrelu": "Softplus",
-        "softsign": "Softsign"
-    }
-
-    act_name = act_types.get(act_type)
-    if act_name:
-        node = onnx.helper.make_node(
-            act_name,
-            input_nodes,
-            [name],
-            name=name
-        )
-    else:
-        raise AttributeError(
-            "Activation %s not implemented or recognized in the converter" % act_type
-        )
-
-    return [node]
-
-
-@mx_op.register("Pad")
-def convert_pad(node, **kwargs):
-    """Map MXNet's pad operator attributes to onnx's Pad operator
-    and return the created node.
-    """
-    opset_version = kwargs["opset_version"]
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    mxnet_pad_width = convert_string_to_list(attrs.get("pad_width"))
-    onnx_pad_width = transform_padding(mxnet_pad_width)
-
-    pad_mode = attrs.get("mode")
-    pad_value = np.float32(attrs.get("constant_value", 0.0))
-
-    if opset_version >= 11:
-        # starting with opset 11, pads and constant_value are inputs instead of attributes
-        from onnx.helper import make_tensor, make_tensor_value_info
-        initializer = kwargs["initializer"]
-        pads_input_name = name + "_pads"
-        pads_input_type = onnx.TensorProto.INT64
-        pads_input_shape = np.shape(np.array(onnx_pad_width))
-        pads_value_node = make_tensor_value_info(pads_input_name, pads_input_type, pads_input_shape)
-        pads_tensor_node = make_tensor(pads_input_name, pads_input_type, pads_input_shape, onnx_pad_width)
-        initializer.append(pads_tensor_node)
-        input_nodes.append(pads_input_name)
-
-        if pad_mode == "constant":
-            const_input_name = name + "_constant"
-            const_input_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[pad_value.dtype]
-            const_value_node = make_tensor_value_info(const_input_name, const_input_type, ())
-            const_tensor_node = make_tensor(const_input_name, const_input_type, (), [pad_value])
-            initializer.append(const_tensor_node)
-            input_nodes.append(const_input_name)
-            pad_node = onnx.helper.make_node(
-                "Pad",
-                input_nodes,
-                [name],
-                mode=pad_mode,
-                name=name
-            )
-            return [pads_value_node, const_value_node, pad_node]
-        else:
-            pad_node = onnx.helper.make_node(
-                "Pad",
-                input_nodes,
-                [name],
-                mode=pad_mode,
-                name=name
-            )
-            return [pads_value_node, pad_node]
-    else:
-        if pad_mode == "constant":
-            node = onnx.helper.make_node(
-                'Pad',
-                inputs=input_nodes,
-                outputs=[name],
-                mode='constant',
-                value=pad_value,
-                pads=onnx_pad_width,
-                name=name
-            )
-            return [node]
-        else:
-            node = onnx.helper.make_node(
-                'Pad',
-                inputs=input_nodes,
-                outputs=[name],
-                mode=pad_mode,
-                pads=onnx_pad_width,
-                name=name
-            )
-            return [node]
-
-def create_helper_tensor_node(input_vals, output_name, kwargs):
-    """create extra tensor node from numpy values"""
-    data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[input_vals.dtype]
-
-    tensor_node = onnx.helper.make_tensor_value_info(
-        name=output_name,
-        elem_type=data_type,
-        shape=input_vals.shape
-    )
-    kwargs["initializer"].append(
-        onnx.helper.make_tensor(
-            name=output_name,
-            data_type=data_type,
-            dims=input_vals.shape,
-            vals=input_vals.flatten(),
-            raw=False,
-        )
-    )
-
-    return [tensor_node]
-
-def create_helper_reshape_node(input_name, output_name, shape, kwargs):
-    """create extra reshape node with static shape"""
-    shape_tensor_node, = create_helper_tensor_node(
-        np.asarray(shape, dtype=np.int64), output_name + "__shape", kwargs
-    )
-    reshape_node = onnx.helper.make_node(
-        "Reshape",
-        inputs=[input_name, shape_tensor_node.name],
-        outputs=[output_name],
-        name=output_name
-    )
-
-    return [shape_tensor_node, reshape_node]
-
-def create_helper_trans_node(input_name, output_name, perm=None):
-    """create extra transpose node"""
-    attrs = {}
-    if perm is not None:
-        attrs['perm'] = perm
-    trans_node = onnx.helper.make_node(
-        'Transpose',
-        inputs=[input_name],
-        outputs=[output_name],
-        name=output_name,
-        **attrs
-    )
-    return [trans_node]
-
-def create_helper_concat_node(inputs, output_name, axis=0):
-    """create extra concat node"""
-    concat_node = onnx.helper.make_node(
-        "Concat",
-        inputs=inputs,
-        outputs=[output_name],
-        name=output_name,
-        axis=axis,
-    )
-    return [concat_node]
-
-def create_helper_expand_node(input_name, output_name, expand_shape):
-    """create extra expand node"""
-    expand_node = onnx.helper.make_node(
-        "Expand",
-        inputs=[input_name, expand_shape],
-        outputs=[output_name],
-        name=output_name,
-    )
-    return [expand_node]
-
-def create_helper_gather_node(
-        input_name, output_name,
-        indices, kwargs,
-        axis=None
-    ):
-    """create extra gather node with static indices"""
-    attrs = {}
-    if axis is not None:
-        attrs['axis'] = axis
-    gather_tensor_node, = create_helper_tensor_node(
-        np.asarray(indices, np.int64), output_name + "__indices", kwargs
-    )
-    gather_node = onnx.helper.make_node(
-        "Gather",
-        inputs=[input_name, gather_tensor_node.name],
-        outputs=[output_name],
-        name=output_name,
-        **attrs
-    )
-    return [gather_tensor_node, gather_node]
-
-def create_helper_build_values_node(
-        inputs, output_name,
-        dtype, kwargs, axis=0
-    ):
-    """create extra node, with specified values
-
-    (allows mixing node names and static values)
-    """
-    values = []
-    tensor_nodes = []
-    for idx, inp in enumerate(inputs):
-        if not isinstance(inp, (str, bytes)):
-            inp, = create_helper_tensor_node(
-                np.array([inp], dtype=dtype),
-                output_name + "__value" + str(idx),
-                kwargs
-            )
-            tensor_nodes.append(inp)
-            inp = inp.name
-        values.append(inp)
-    concat_node, = create_helper_concat_node(values, output_name, axis=axis)
-    return tensor_nodes + [concat_node,]
-
-def create_helper_shape_node(input_name, output_name):
-    """create extra shape node for specified input node"""
-    shape_node = onnx.helper.make_node(
-        "Shape",
-        inputs=[input_name],
-        outputs=[output_name],
-        name=output_name,
-    )
-    return [shape_node]
-
-@mx_op.register("dot")
-def convert_dot(node, **kwargs):
-    """Map MXNet's dot operator attributes to onnx's
-    MatMul and Transpose operators based on the values set for
-    transpose_a, transpose_b attributes."""
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-    input_node_a = input_nodes[0]
-    input_node_b = input_nodes[1]
-
-    trans_a_node = None
-    trans_b_node = None
-
-    trans_a = get_boolean_attribute_value(attrs, "transpose_a")
-    trans_b = get_boolean_attribute_value(attrs, "transpose_b")
-
-    op_name = "transpose" + str(kwargs["idx"])
-
-    if trans_a:
-        input_node_a = op_name + "_a"
-        trans_a_node, = create_helper_trans_node(input_nodes[0], input_node_a)
-    if trans_b:
-        input_node_b = op_name + "_b"
-        trans_b_node, = create_helper_trans_node(input_nodes[1], input_node_b)
-
-    matmul_node = onnx.helper.make_node(
-        'MatMul',
-        inputs=[input_node_a, input_node_b],
-        outputs=[name],
-        name=name
-    )
-
-    if not trans_a and not trans_b:
-        return [matmul_node]
-    elif trans_a and not trans_b:
-        return [trans_a_node, matmul_node]
-    elif trans_b and not trans_a:
-        return [trans_b_node, matmul_node]
-    else:
-        return [trans_a_node, trans_b_node, matmul_node]
-
-
-@mx_op.register("_linalg_gemm2")
-def convert_linalg_gemm2(node, **kwargs):
-    """Map MXNet's _linalg_gemm2 operator attributes to onnx's
-    MatMul and Transpose operators based on the values set for
-    transpose_a, transpose_b attributes.
-    Return multiple nodes created.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    # Getting the attributes and assigning default values.
-    alpha = float(attrs.get("alpha", 1.0))
-    trans_a = get_boolean_attribute_value(attrs, "transpose_a")
-    trans_b = get_boolean_attribute_value(attrs, "transpose_b")
-
-    op_name = "transpose" + str(kwargs["idx"])
-
-    if alpha == 1.0 and trans_a == 0 and trans_b == 0:
-        matmul_node = onnx.helper.make_node(
-            'MatMul',
-            inputs=input_nodes,
-            outputs=[name],
-            name=name
-        )
-        return [matmul_node]
-    elif trans_a == 1 and trans_b == 0:
-        op_name = "transpose" + str(kwargs["idx"])
-        node_name = op_name+"_a"
-        trans_a_node = onnx.helper.make_node(
-            'Transpose',
-            inputs=[input_nodes[0]],
-            outputs=[op_name+"_a"],
-            name=node_name
-        )
-
-        matmul_node = onnx.helper.make_node(
-            'MatMul',
-            inputs=[node_name, input_nodes[1]],
-            outputs=[name],
-            name=name
-        )
-        return [trans_a_node, matmul_node]
-
-    elif trans_a == 0 and trans_b == 1:
-        node_name = op_name + "_b"
-        trans_b_node = onnx.helper.make_node(
-            'Transpose',
-            inputs=[input_nodes[1]],
-            outputs=[op_name+"_b"],
-            name=node_name
-        )
-
-        matmul_node = onnx.helper.make_node(
-            'MatMul',
-            inputs=[input_nodes[0], node_name],
-            outputs=[name],
-            name=name
-        )
-
-        return [trans_b_node, matmul_node]
-    else:
-        node_name_a = op_name+"_a"
-        trans_a_node = onnx.helper.make_node(
-            'Transpose',
-            inputs=[input_nodes[0]],
-            outputs=[op_name+"_a"],
-            name=node_name_a
-        )
-
-        node_name_b = op_name + "_b"
-        trans_b_node = onnx.helper.make_node(
-            'Transpose',
-            inputs=[input_nodes[1]],
-            outputs=[op_name+"_b"],
-            name=node_name_b
-        )
-
-        matmul_node = onnx.helper.make_node(
-            'MatMul',
-            inputs=input_nodes,
-            outputs=[name],
-            name=name
-        )
-
-        return [trans_a_node, trans_b_node, matmul_node]
-
-
-@mx_op.register("Pooling")
-def convert_pooling(node, **kwargs):
-    """Map MXNet's Pooling operator attributes to onnx's
-    MaxPool/AveragePool/GlobalMaxPool/GlobalAveragePool operators
-    based on the input node's attributes and return the created node.
-    """
-    opset_version = kwargs["opset_version"]
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    kernel = eval(attrs["kernel"])
-    pool_type = attrs["pool_type"] if attrs.get("pool_type") else "max"
-    stride = eval(attrs["stride"]) if attrs.get("stride") else (1, 1)
-    global_pool = get_boolean_attribute_value(attrs, "global_pool")
-    p_value = attrs.get('p_value', 'None')
-
-    pooling_convention = attrs.get('pooling_convention', 'valid')
-    ceil_mode = False
-    if pooling_convention == 'full':
-        if opset_version < 10:
-            pooling_warning = "Pooling: ONNX lower than 1.5.0 doesn't support pooling_convention. " \
-                              "This might lead to shape or accuracy issues. " \
-                              "https://github.com/onnx/onnx/issues/549"
-            logging.warning(pooling_warning)
-        ceil_mode = True
-
-    pad_dims = list(parse_helper(attrs, "pad", [0, 0]))
-    pad_dims = pad_dims + pad_dims
-    pool_types = {"max": "MaxPool", "avg": "AveragePool", "lp": "LpPool"}
-    global_pool_types = {"max": "GlobalMaxPool", "avg": "GlobalAveragePool",
-                         "lp": "GlobalLpPool"}
-
-    if pool_type == 'lp' and p_value == 'None':
-        raise AttributeError('ONNX requires a p value for LpPool and GlobalLpPool')
-
-    if global_pool:
-        if pool_type == 'lp':
-            node = onnx.helper.make_node(
-                global_pool_types[pool_type],
-                input_nodes,  # input
-                [name],
-                p=int(p_value),
-                name=name
-            )
-        else:
-            node = onnx.helper.make_node(
-                global_pool_types[pool_type],
-                input_nodes,  # input
-                [name],
-                name=name
-            )
-    else:
-        if pool_type == 'lp':
-            node = onnx.helper.make_node(
-                pool_types[pool_type],
-                input_nodes,  # input
-                [name],
-                p=int(p_value),
-                kernel_shape=kernel,
-                pads=pad_dims,
-                strides=stride,
-                name=name
-            )
-        else:
-            if opset_version >= 10:
-                node = onnx.helper.make_node(
-                    pool_types[pool_type],
-                    input_nodes,  # input
-                    [name],
-                    kernel_shape=kernel,
-                    pads=pad_dims,
-                    strides=stride,
-                    name=name,
-                    ceil_mode=ceil_mode
-                )
-            else:
-                node = onnx.helper.make_node(
-                    pool_types[pool_type],
-                    input_nodes,  # input
-                    [name],
-                    kernel_shape=kernel,
-                    pads=pad_dims,
-                    strides=stride,
-                    name=name
-                )
-
-    return [node]
-
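A shape check for the ceil_mode mapping above (an illustrative aside, not
part of the patch): 'full' pooling convention corresponds to ONNX
ceil_mode=1. With input length 5, kernel 2, stride 2 and no padding, 'valid'
yields floor((5 - 2)/2) + 1 = 2 outputs while 'full' yields
ceil((5 - 2)/2) + 1 = 3, which is why exports to opset < 10 (no ceil_mode
attribute) can produce shape mismatches.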
-
-@mx_op.register("exp")
-def convert_exp(node, **kwargs):
-    """Map MXNet's exp operator attributes to onnx's Exp operator
-    and return the created node.
-    """
-    return create_basic_op_node('Exp', node, kwargs)
-
-@mx_op.register("_copy")
-def convert_copy(node, **kwargs):
-    """Map MXNet's _copy operator attributes to onnx's Identity operator
-    and return the created node.
-    """
-    return create_basic_op_node('Identity', node, kwargs)
-
-@mx_op.register("identity")
-def convert_identity(node, **kwargs):
-    """Map MXNet's identity operator attributes to onnx's ConstantFill operator
-    and return the created node.
-    """
-    return create_basic_op_node('ConstantFill', node, kwargs)
-
-@mx_op.register("InstanceNorm")
-def convert_instancenorm(node, **kwargs):
-    """Map MXNet's InstanceNorm operator attributes to onnx's InstanceNormalization operator
-    based on the input node's attributes and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    eps = float(attrs.get("eps", 0.001))
-
-    node = onnx.helper.make_node(
-        'InstanceNormalization',
-        inputs=input_nodes,
-        outputs=[name],
-        name=name,
-        epsilon=eps)
-
-    return [node]
-
-@mx_op.register("LeakyReLU")
-def convert_leakyrelu(node, **kwargs):
-    """Map MXNet's LeakyReLU operator attributes to onnx's Elu/LeakyRelu/PRelu operators
-    based on the input node's attributes and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    act_type = attrs.get("act_type", "leaky")
-    alpha = float(attrs.get("slope", 0.25))
-
-    act_name = {"elu": "Elu", "leaky": "LeakyRelu", "prelu": "PRelu",
-                "selu": "Selu"}
-
-    if act_type in ("prelu", "selu"):
-        node = onnx.helper.make_node(
-            act_name[act_type],
-            inputs=input_nodes,
-            outputs=[name],
-            name=name)
-    else:
-        node = onnx.helper.make_node(
-            act_name[act_type],
-            inputs=input_nodes,
-            outputs=[name],
-            name=name,
-            alpha=alpha)
-
-    return [node]
-
-
-@mx_op.register("softmax")
-def convert_softmax(node, **kwargs):
-    """Map MXNet's softmax operator attributes to onnx's Softmax operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    axis = int(attrs.get("axis", -1))
-
-    softmax_node = onnx.helper.make_node(
-        "Softmax",
-        input_nodes,
-        [name],
-        axis=axis,
-        name=name
-    )
-
-    return [softmax_node]
-
-
-@mx_op.register("BlockGrad")
-def convert_blockgrad(node, **kwargs):
-    """ Skip operator  """
-    return create_basic_op_node('ConstantFill', node, kwargs)
-
-@mx_op.register("MakeLoss")
-def convert_makeloss(node, **kwargs):
-    """ Skip operator  """
-    return create_basic_op_node('ConstantFill', node, kwargs)
-
-@mx_op.register("Concat")
-def convert_concat(node, **kwargs):
-    """Map MXNet's Concat operator attributes to onnx's Concat operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    axis = int(attrs.get("dim", 1))
-    concat_node = onnx.helper.make_node(
-        "Concat",
-        input_nodes,
-        [name],
-        axis=axis,
-        name=name
-    )
-    return [concat_node]
-
-@mx_op.register("RNN")
-def convert_RNN(node, **kwargs):
-    """Map MXNet's RNN operator attributes to onnx's RNN operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-    nodes = []
-
-    # ============================== Attributes ==============================
-    mode = attrs['mode'].upper()
-    rnn_kwargs = {}
-    if mode != 'LSTM':
-        raise NotImplementedError(
-            "Only LSTM mode RNN conversion to ONNX is currently supported."
-        )
-
-    hidden_size = rnn_kwargs['hidden_size'] = int(attrs.get("state_size"))
-    if eval(attrs.get('bidirectional', 'False')):
-        rnn_kwargs['direction'] = 'bidirectional'
-        num_directions = 2
-    else:
-        rnn_kwargs['direction'] = 'forward'
-        num_directions = 1
-
-    clip_min = eval(attrs.get('lstm_state_clip_min', 'None'))
-    clip_max = eval(attrs.get('lstm_state_clip_max', 'None'))
-    if clip_min is not None or clip_max is not None:
-        # ONNX LSTMs have the `clip` attribute, however it seems to give
-        # slightly different results, when compared to the MXNet equivalent
-        raise NotImplementedError(
-            "Conversion of RNNs with lstm_state_clip_min/max "
-            "to ONNX is currently not supported."
-        )
-
-    if eval(attrs.get('lstm_state_clip_nan', 'False')):
-        raise NotImplementedError(
-            "ONNX RNN operator doesn't support lstm_state_clip_nan"
-        )
-
-    if eval(attrs.get('use_sequence_length', 'False')):
-        # This can maybe be implemented using the `sequence_len` optional input
-        raise NotImplementedError(
-            "Conversion of RNNs with variable input sequence length "
-            "to ONNX is currently not supported."
-        )
-
-    if eval(attrs.get('num_layers', '1')) != 1:
-        raise NotImplementedError(
-            "Conversion of RNNs with num_layers > 1 "
-            "to ONNX is currently not supported."
-        )
-
-    if eval(attrs.get('p', '0')) != 0:
-        # WARNING! The `p` attribute in mxnet is "dropout probability" while
-        # the `p` optional input of ONNX LSTMs is the peephole weights tensor.
-        raise NotImplementedError(
-            "Conversion of RNNs with dropout "
-            "to ONNX is currently not supported."
-        )
-
-    if eval(attrs.get('projection_size', 'None')) is not None:
-        raise NotImplementedError(
-            "Conversion of RNNs with custom projection_size "
-            "to ONNX is currently not supported."
-        )
-
-    if not eval(attrs.get('state_outputs', 'True')):
-        raise NotImplementedError(
-            "Conversion of RNNs with state_outputs=False "
-            "to ONNX is currently not supported."
-        )
-
-    # ============================== Parameters ==============================
-
-    # (See _rnn_param_concat for part 1 of this comment section)
-
-    # Unfortunately, mxnets version of _rnn_param_concat concatenates *ALL*
-    # the parameters, instead of grouping them like ONNX. The workaround,
-    # used here, is that the _rnn_param_concat node conversion code will
-    # produce multiple nodes with names ending in rnn_param_concatN__P
-    # (Where P is the parameter group name W, R or B)
-    # We then use regular expressions to get the "extra outputs" of the
-    # _rnn_param_concat node.
-
-    x, param_concat, *initial_states = input_nodes
-    param_pattern = re.compile(r'(.*rnn_param_concat[0-9]+__)[WRB]$')
-    if not param_pattern.match(param_concat):
-        # ToDo: Maybe do something more sane after Issue #17621 gets resolved
-        raise NotImplementedError(
-            "The order of RNN parameters is different between mxnet and ONNX. "
-            "Currently, an automatic conversion is only possible, if the RNN "
-            "parameters were concatenated using the internal "
-            "_rnn_param_concat operator."
-        )
-    w, r, b = (
-        param_pattern.sub(r'\1' + param, param_concat)
-        for param in 'WRB'
-    )
-
-    # The second conversion step handles
-    #     * parameter shapes, since mxnet uses flattened parameters, while
-    #       ONNX requires specific tensor shapes
-    #     * gate order, since both frameworks require the weights and biases
-    #       of the 4 basic gates (forget, input, cell and output) to be
-    #       concatenated, but in different order
-    #       ([ifco] for mxnet and [iofc] for ONNX)
-
-    def fix_rnn_parameter(p, p_shape_in, p_shape_out, p_order=(0, 3, 1, 2)):
-        p_ = p
-
-        # 1) Reshape flat parameters to their original shape, such that
-        #    the gates are concatenated along axis=1
-        p_reshaped_in = create_helper_reshape_node(
-            p, p_ + "__reshaped_in", p_shape_in, kwargs
-        )
-        nodes.extend(p_reshaped_in)
-        p = p_reshaped_in[-1].name
-
-        # 2) Use a Gather node to pick gates along axis=1, permuting them
-        p_reordered = create_helper_gather_node(
-            p, p_ + "__reordered", p_order, kwargs, axis=1
-        )
-        nodes.extend(p_reordered)
-        p = p_reordered[-1].name
-
-        # 3) Reshape the parameters to their final shape, squeezing the gate
-        #    and hidden dimensions together
-        p_reshaped_out = create_helper_reshape_node(
-            p, p_ + "__reshaped_out", p_shape_out, kwargs
-        )
-        nodes.extend(p_reshaped_out)
-        return p_reshaped_out[-1].name
-
-    w = fix_rnn_parameter(
-        w,
-        p_shape_in=(num_directions, 4, hidden_size, -1),
-        p_shape_out=(num_directions, 4 * hidden_size, -1),
-    )
-
-    r = fix_rnn_parameter(
-        r,
-        p_shape_in=(num_directions, 4, hidden_size, hidden_size),
-        p_shape_out=(num_directions, 4 * hidden_size, hidden_size),
-    )
-
-    b = fix_rnn_parameter(
-        b,
-        p_shape_in=(2 * num_directions, 4, hidden_size),
-        p_shape_out=(num_directions, 8 * hidden_size),
-    )
-
-    # ============================= Inputs/States ============================
-    input_shape = create_helper_shape_node(x, x + "__shape")
-    nodes.extend(input_shape)
-    input_shape = input_shape[-1].name
-
-    batch_size = create_helper_gather_node(
-        input_shape,
-        x + "__batch_size",
-        indices=[1],
-        axis=0,
-        kwargs=kwargs,
-    )
-    nodes.extend(batch_size)
-    batch_size = batch_size[-1].name
-
-    state_shape = create_helper_build_values_node(
-        [num_directions, batch_size, hidden_size],
-        name + "__state_shape",
-        dtype=np.int64,
-        kwargs=kwargs,
-    )
-    nodes.extend(state_shape)
-    state_shape = state_shape[-1].name
-
-    expanded_states = []
-    for state in initial_states:
-        expanded_state = create_helper_expand_node(
-            state, state + "__expanded", state_shape
-        )
-        nodes.extend(expanded_state)
-        expanded_states.append(expanded_state[-1].name)
-    initial_states = expanded_states
-
-    # =========================== RNN node/outputs ===========================
-    y_out = [onnx.helper.make_node(
-        mode,  # RNN or LSTM or GRU
-        inputs=[x, w, r, b, '', *initial_states],
-        outputs=[name + '__Y'],
-        name=name + '__Y',
-        **rnn_kwargs
-    )]
-    nodes.extend(y_out)
-    y = y_out[-1].name
-
-    # We are almost done. The only thing left to do is to convert the output
-    # of the RNN node from the [S, D, B, H] layout, which ONNX returns
-    # to the [S, B, D*H] layout, which mxnet uses
-
-    # 1) Transpose [S, D, B, H] -> [S, B, D, H]
-    y_perm = (0, 2, 1, 3)
-    y_transposed = create_helper_trans_node(
-        y, y + "__transposed", y_perm
-    )
-    nodes.extend(y_transposed)
-    y = y_transposed[-1].name
-
-    # 2) Reshape [S, B, D, H] -> [S, B, D*H]
-    y_shape = (0, 0, -1)
-    y_reshaped = create_helper_reshape_node(y, name, y_shape, kwargs)
-    nodes.extend(y_reshaped)
-
-    return nodes
-
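A small check of the gate reordering performed by fix_rnn_parameter above
(an illustrative aside, not part of the patch): applying the default
p_order=(0, 3, 1, 2) gather to MXNet's [i, f, c, o] gate order produces
ONNX's [i, o, f, c]:

    mx_gates = ['i', 'f', 'c', 'o']
    [mx_gates[k] for k in (0, 3, 1, 2)]   # -> ['i', 'o', 'f', 'c']
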
-@mx_op.register('_rnn_param_concat')
-def convert_rnn_param_concat(node, **kwargs):
-    """Map MXNet's _rnn_param_concat operator attributes to onnx's Concat
-    operator and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-    axis = int(attrs.get("dim"))
-
-    # mxnet RNN node and ONNX RNN/LSTM/GRU nodes
-    # use different ways to store their parameters
-
-    # The conversion between these formats is broken into 2 steps
-    # The first step (performed here in _rnn_param_concat) regroups the
-    # flattened parameters according to the table below.
-    # The second step corrects the shapes and orders of gates and is
-    # performed and described in more detail in the RNN node
-
-    # mxnet            [ONNX] -> ONNX (group)
-    # i2h_weights [W (+  WB)] -> W    (input weights)
-    # h2h_weights [R (+  RB)] -> R    (recurrence weights)
-    # i2h_biases [Wb (+ WBb)] -> B = [Wb + Rb (+ WBb + RBb)]
-    # h2h_biases [Rb (+ RBb)] ->      (biases)
-
-    split = len(input_nodes) // 2
-    weights, biases = input_nodes[:split], input_nodes[split:]
-    i2h_weights = weights[::2]
-    h2h_weights = weights[1::2]
-    i2h_biases = biases[::2]
-    h2h_biases = biases[1::2]
-    reordered_biases = [
-        bias
-        for pair in zip(i2h_biases, h2h_biases)
-        for bias in pair
-    ]
-
-    # The order of mxnet parameters in the inputs is:
-    # [
-    #     '{}{}_{}_{}'.format(d, l, g, t)
-    #     for t in ['weight', 'bias']
-    #     for l in range(num_layers)
-    #     for d in ['l', 'r'][:num_directions]
-    #     for g in ['i2h', 'h2h']
-    # ]
-
-    w = onnx.helper.make_node(
-        "Concat",
-        inputs=i2h_weights,
-        outputs=[name + "__W"],
-        axis=axis,
-        name=name + "__W"
-    )
-    r = onnx.helper.make_node(
-        "Concat",
-        inputs=h2h_weights,
-        outputs=[name + "__R"],
-        axis=axis,
-        name=name + "__R"
-    )
-    b = onnx.helper.make_node(
-        "Concat",
-        inputs=reordered_biases,
-        outputs=[name + "__B"],
-        axis=axis,
-        name=name + "__B"
-    )
-    return [w, r, b]
-
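To make the regrouping concrete (an illustrative aside, not part of the
patch; parameter names follow the ordering comment above, for a single-layer
bidirectional RNN):

    inputs = ['l0_i2h_weight', 'l0_h2h_weight',
              'r0_i2h_weight', 'r0_h2h_weight',
              'l0_i2h_bias', 'l0_h2h_bias',
              'r0_i2h_bias', 'r0_h2h_bias']
    split = len(inputs) // 2                  # 4
    weights, biases = inputs[:split], inputs[split:]
    weights[::2]    # W concat inputs: ['l0_i2h_weight', 'r0_i2h_weight']
    weights[1::2]   # R concat inputs: ['l0_h2h_weight', 'r0_h2h_weight']
    [b for pair in zip(biases[::2], biases[1::2]) for b in pair]
    # B concat inputs: ['l0_i2h_bias', 'l0_h2h_bias',
    #                   'r0_i2h_bias', 'r0_h2h_bias']
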
-@mx_op.register("_zeros")
-@mx_op.register("_ones")
-@mx_op.register("_full")
-def convert_full(node, **kwargs):
-    """Map MXNet's _zeros, _ones and _full operators attributes to onnx's
-    tensors and return the created node.
-    """
-    # ToDo: Use Constant or ConstantOfShape, when Issue #15101 is resolved?
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-    del input_nodes
-
-    # Convert "0"s dimensions to "1"s. This is a workaround for the case, where
-    # mxnet symbols can broadcast "0"s, while ONNX can only broadcast over "1"s
-    shape = convert_string_to_list(attrs["shape"])
-    shape = tuple(dim if dim else 1 for dim in shape)
-
-    value = {
-        '_zeros': 0.0,
-        '_ones': 1.0,
-        '_full': eval(attrs.get('value', '0')),
-    }[node['op']]
-    dtype = attrs.get('dtype')
-    data = np.full(shape, value, dtype)
-
-    return create_helper_tensor_node(data, name, kwargs)
-
-@mx_op.register("transpose")
-def convert_transpose(node, **kwargs):
-    """Map MXNet's transpose operator attributes to onnx's Transpose operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    axes = attrs.get("axes", ())
-    if axes:
-        axes = tuple(map(int, re.findall(r'\d+', axes)))
-
-        transpose_node = onnx.helper.make_node(
-            "Transpose",
-            input_nodes,
-            [name],
-            perm=axes,
-            name=name
-        )
-    else:
-        transpose_node = onnx.helper.make_node(
-            "Transpose",
-            input_nodes,
-            [name],
-            name=name
-        )
-
-    return [transpose_node]
-
-
-@mx_op.register("LRN")
-def convert_lrn(node, **kwargs):
-    """Map MXNet's LRN operator attributes to onnx's LRN operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    alpha = float(attrs.get("alpha", 0.0001))
-    beta = float(attrs.get("beta", 0.75))
-    bias = float(attrs.get("knorm", 1.0))
-    size = int(attrs.get("nsize"))
-
-    lrn_node = onnx.helper.make_node(
-        "LRN",
-        inputs=input_nodes,
-        outputs=[name],
-        name=name,
-        alpha=alpha,
-        beta=beta,
-        bias=bias,
-        size=size
-    )
-
-    return [lrn_node]
-
-
-@mx_op.register("L2Normalization")
-def convert_l2normalization(node, **kwargs):
-    """Map MXNet's L2Normalization operator attributes to onnx's LpNormalization operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    mode = attrs.get("mode", "instance")
-
-    if mode != "channel":
-        raise AttributeError("L2Normalization: ONNX currently supports channel mode only")
-
-    l2norm_node = onnx.helper.make_node(
-        "LpNormalization",
-        input_nodes,
-        [name],
-        axis=1,  # channel only
-        name=name
-    )
-    return [l2norm_node]
-
-
-@mx_op.register("Dropout")
-def convert_dropout(node, **kwargs):
-    """Map MXNet's Dropout operator attributes to onnx's Dropout operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-    opset_version = kwargs["opset_version"]
-
-    probability = float(attrs.get("p", 0.5))
-
-    if opset_version >= 12:
-        # opset >= 12 requires the ratio to be an input
-        initializer = kwargs["initializer"]
-        ratio_input_name = name + "_ratio"
-        value_node = onnx.helper.make_tensor_value_info(ratio_input_name,
-                                                        onnx.TensorProto.FLOAT, ())
-        tensor_node = onnx.helper.make_tensor(ratio_input_name, onnx.TensorProto.FLOAT,
-                                              (), [probability])
-        initializer.append(tensor_node)
-        dropout_node = onnx.helper.make_node(
-            "Dropout",
-            [input_nodes[0], ratio_input_name],
-            [name],
-            name=name
-        )
-        return [value_node, dropout_node]
-    else:
-        dropout_node = onnx.helper.make_node(
-            "Dropout",
-            input_nodes,
-            [name],
-            ratio=probability,
-            name=name
-        )
-        return [dropout_node]
-
-
-@mx_op.register("Flatten")
-def convert_flatten(node, **kwargs):
-    """Map MXNet's Flatten operator attributes to onnx's Flatten operator
-    and return the created node.
-    """
-    return create_basic_op_node('Flatten', node, kwargs)
-
-@mx_op.register("clip")
-def convert_clip(node, **kwargs):
-    """Map MXNet's Clip operator attributes to onnx's Clip operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-    opset_version = kwargs["opset_version"]
-
-    a_min = float(attrs.get('a_min', -np.inf))
-    a_max = float(attrs.get('a_max', np.inf))
-
-    if opset_version >= 11:
-        # opset >= 11 requires min/max to be inputs
-        initializer = kwargs["initializer"]
-        min_input_name = name + "_min"
-        max_input_name = name + "_max"
-        min_value_node = onnx.helper.make_tensor_value_info(min_input_name,
-                                                            onnx.TensorProto.FLOAT, ())
-        max_value_node = onnx.helper.make_tensor_value_info(max_input_name,
-                                                            onnx.TensorProto.FLOAT, ())
-        min_tensor_node = onnx.helper.make_tensor(min_input_name, onnx.TensorProto.FLOAT,
-                                                  (), [a_min])
-        max_tensor_node = onnx.helper.make_tensor(max_input_name, onnx.TensorProto.FLOAT,
-                                                  (), [a_max])
-        initializer.append(min_tensor_node)
-        initializer.append(max_tensor_node)
-        input_nodes.append(min_input_name)
-        input_nodes.append(max_input_name)
-        clip_node = onnx.helper.make_node(
-            "Clip",
-            input_nodes,
-            [name],
-            name=name
-        )
-        return [min_value_node, max_value_node, clip_node]
-
-    else:
-        clip_node = onnx.helper.make_node(
-            "Clip",
-            input_nodes,
-            [name],
-            name=name,
-            min=a_min,
-            max=a_max
-        )
-        return [clip_node]
-
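The same attribute-to-input migration applies to Clip at opset 11. A standalone sketch of the newer form (names illustrative, clamping to [0, 6]):

import onnx
from onnx import helper, TensorProto

x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [4])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [4])
# opset >= 11: min and max are scalar initializers, not attributes
a_min = helper.make_tensor('y_min', TensorProto.FLOAT, (), [0.0])
a_max = helper.make_tensor('y_max', TensorProto.FLOAT, (), [6.0])
clip = helper.make_node('Clip', ['x', 'y_min', 'y_max'], ['y'], name='y')
graph = helper.make_graph([clip], 'clip_example', [x], [y], initializer=[a_min, a_max])
onnx.checker.check_model(helper.make_model(graph, opset_imports=[helper.make_opsetid('', 11)]))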
-
-def scalar_op_helper(node, op_name, **kwargs):
-    """Helper function for scalar arithmetic operations"""
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-    from onnx import numpy_helper
-    input_type = kwargs["in_type"]
-    scalar_value = np.array([attrs.get("scalar", 1)],
-                            dtype=onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[input_type])
-
-    initializer = kwargs["initializer"]
-    flag = True
-    # If the input value is in initializer, just multiply with scalar input
-    # and create a new initializer
-    for i in initializer:
-        if i.name == input_nodes[0]:
-            if op_name == 'Mul':
-                new_initializer = numpy_helper.to_array(i) * scalar_value[0]
-            elif op_name == 'Sub':
-                if name.startswith("_rminusscalar"):
-                    new_initializer = scalar_value[0] - numpy_helper.to_array(i)
-                else:
-                    new_initializer = numpy_helper.to_array(i) - scalar_value[0]
-            elif op_name == 'Add':
-                new_initializer = numpy_helper.to_array(i) + scalar_value[0]
-            elif op_name == 'Div':
-                if name.startswith("_rdivscalar"):
-                    new_initializer = scalar_value[0] / numpy_helper.to_array(i)
-                else:
-                    new_initializer = numpy_helper.to_array(i) / scalar_value[0]
-            elif op_name == 'Pow':
-                new_initializer = numpy_helper.to_array(i) ** scalar_value[0]
-            flag = False
-            break
-
-    # else create a new tensor of the scalar value, add it in initializer
-    if flag is True:
-        dims = np.shape(scalar_value)
-
-        scalar_op_name = "scalar_op" + str(kwargs["idx"])
-        tensor_node = onnx.helper.make_tensor_value_info(scalar_op_name, input_type, dims)
-
-        initializer.append(
-            onnx.helper.make_tensor(
-                name=scalar_op_name,
-                data_type=input_type,
-                dims=dims,
-                vals=scalar_value,
-                raw=False,
-            )
-        )
-
-        mul_node = onnx.helper.make_node(
-            op_name,
-            [input_nodes[0], scalar_op_name],
-            [name],
-            name=name
-        )
-
-        return [tensor_node, mul_node]
-    else:
-        data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[new_initializer.dtype]
-        dims = np.shape(new_initializer)
-
-        new_a_node = input_nodes[0] + str(kwargs["idx"])
-        tensor_node = onnx.helper.make_tensor_value_info(new_a_node, data_type, dims)
-
-        initializer.append(
-            onnx.helper.make_tensor(
-                name=new_a_node,
-                data_type=data_type,
-                dims=dims,
-                vals=new_initializer,
-                raw=False,
-            )
-        )
-        return [tensor_node]
-
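The initializer branch above is a constant-folding shortcut: when the non-scalar operand is itself a stored parameter, the arithmetic is applied to the weights at export time instead of emitting an extra node. A small numpy_helper sketch of the same idea, independent of the converter machinery (names hypothetical):

import numpy as np
from onnx import numpy_helper

# fold `w * 3` into the initializer itself, so no Mul node is needed
w = numpy_helper.from_array(np.ones((2, 2), dtype=np.float32), name='w')
folded = numpy_helper.to_array(w) * np.float32(3.0)
w_scaled = numpy_helper.from_array(folded, name='w_scaled')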
-# Convert scalar value into node and pass it as input to mul_node
-@mx_op.register("_mul_scalar")
-def convert_mul_scalar(node, **kwargs):
-    """Map MXNet's _mul_scalar operator attributes to onnx's Mul operator.
-    Creates a new node for the input scalar value, adds it to the initializer
-    and return multiple created nodes.
-    """
-    return scalar_op_helper(node, 'Mul', **kwargs)
-
-
-# Convert scalar value into node and pass it as input to mul_node
-@mx_op.register("_minus_scalar")
-def convert_minus_scalar(node, **kwargs):
-    """Map MXNet's _minus_scalar operator attributes to onnx's Minus operator.
-    Creates a new node for the input scalar value, adds it to the initializer
-    and return multiple created nodes.
-    """
-    return scalar_op_helper(node, 'Sub', **kwargs)
-
-@mx_op.register("_rminus_scalar")
-def convert_rminus_scalar(node, **kwargs):
-    """Map MXNet's _rminus_scalar operator attributes to onnx's Sub operator.
-    Creates a new node for the input scalar value, adds it to the initializer
-    and return multiple created nodes.
-    """
-    return scalar_op_helper(node, 'Sub', **kwargs)
-
-# Convert scalar value into node and pass it as input to mul_node
-@mx_op.register("_plus_scalar")
-def convert_add_scalar(node, **kwargs):
-    """Map MXNet's _plus_scalar operator attributes to onnx's Add operator.
-    Creates a new node for the input scalar value, adds it to the initializer
-    and return multiple created nodes.
-    """
-    return scalar_op_helper(node, 'Add', **kwargs)
-
-# Convert scalar value into node and pass it as input to mul_node
-@mx_op.register("_div_scalar")
-def convert_div_scalar(node, **kwargs):
-    """Map MXNet's _div_scalar operator attributes to onnx's Div operator.
-    Creates a new node for the input scalar value, adds it to the initializer
-    and return multiple created nodes.
-    """
-    return scalar_op_helper(node, 'Div', **kwargs)
-
-@mx_op.register("_rdiv_scalar")
-def convert_rdiv_scalar(node, **kwargs):
-    """Map MXNet's _rdiv_scalar operator attributes to onnx's Div operator.
-    Creates a new node for the input scalar value, adds it to the initializer
-    and return multiple created nodes.
-    """
-    return scalar_op_helper(node, 'Div', **kwargs)
-
-@mx_op.register("_power_scalar")
-def convert_pow_scalar(node, **kwargs):
-    """Map MXNet's _pow_scalar operator attributes to onnx's Pow operator.
-    Creates a new node for the input scalar value, adds it to the initializer
-    and return multiple created nodes.
-    """
-    return scalar_op_helper(node, 'Pow', **kwargs)
-
-# Sorting and Searching
-@mx_op.register("argmax")
-def convert_argmax(node, **kwargs):
-    """Map MXNet's argmax operator attributes to onnx's ArgMax operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    axis = int(attrs.get("axis"))
-    keepdims = get_boolean_attribute_value(attrs, "keepdims")
-
-    node = onnx.helper.make_node(
-        'ArgMax',
-        inputs=input_nodes,
-        axis=axis,
-        keepdims=keepdims,
-        outputs=[name],
-        name=name
-    )
-    return [node]
-
-@mx_op.register("argmin")
-def convert_argmin(node, **kwargs):
-    """Map MXNet's argmin operator attributes to onnx's ArgMin operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    axis = int(attrs.get("axis"))
-    keepdims = get_boolean_attribute_value(attrs, "keepdims")
-
-    node = onnx.helper.make_node(
-        'ArgMin',
-        inputs=input_nodes,
-        axis=axis,
-        keepdims=keepdims,
-        outputs=[name],
-        name=name
-    )
-    return [node]
-
-@mx_op.register("_maximum")
-def convert_maximum(node, **kwargs):
-    """Map MXNet's _maximum operator attributes to onnx's Max operator
-    and return the created node.
-    """
-    return create_basic_op_node('Max', node, kwargs)
-
-
-@mx_op.register("_minimum")
-def convert_minimum(node, **kwargs):
-    """Map MXNet's _minimum operator attributes to onnx's Min operator
-    and return the created node.
-    """
-    return create_basic_op_node('Min', node, kwargs)
-
-@mx_op.register("min")
-def convert_min(node, **kwargs):
-    """Map MXNet's min operator attributes to onnx's ReduceMin operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    mx_axis = attrs.get("axis", None)
-    axes = convert_string_to_list(str(mx_axis)) if mx_axis is not None else None
-
-    keepdims = get_boolean_attribute_value(attrs, "keepdims")
-
-    if axes is not None:
-        node = onnx.helper.make_node(
-            'ReduceMin',
-            inputs=input_nodes,
-            outputs=[name],
-            axes=axes,
-            keepdims=keepdims,
-            name=name
-        )
-
-        return [node]
-    else:
-        node = onnx.helper.make_node(
-            'ReduceMin',
-            inputs=input_nodes,
-            outputs=[name],
-            keepdims=keepdims,
-            name=name
-        )
-
-        return [node]
-
-
-@mx_op.register("max")
-def convert_max(node, **kwargs):
-    """Map MXNet's max operator attributes to onnx's ReduceMax operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    mx_axis = attrs.get("axis", None)
-    axes = convert_string_to_list(str(mx_axis)) if mx_axis is not None else None
-
-    keepdims = get_boolean_attribute_value(attrs, "keepdims")
-
-    if axes is not None:
-        node = onnx.helper.make_node(
-            'ReduceMax',
-            inputs=input_nodes,
-            outputs=[name],
-            axes=axes,
-            keepdims=keepdims,
-            name=name
-        )
-
-        return [node]
-    else:
-        node = onnx.helper.make_node(
-            'ReduceMax',
-            inputs=input_nodes,
-            outputs=[name],
-            keepdims=keepdims,
-            name=name
-        )
-
-        return [node]
-
-
-@mx_op.register("mean")
-def convert_mean(node, **kwargs):
-    """Map MXNet's mean operator attributes to onnx's ReduceMean operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    mx_axis = attrs.get("axis", None)
-    axes = convert_string_to_list(str(mx_axis)) if mx_axis is not None else None
-
-    keepdims = get_boolean_attribute_value(attrs, "keepdims")
-
-    if axes is not None:
-        node = onnx.helper.make_node(
-            'ReduceMean',
-            inputs=input_nodes,
-            outputs=[name],
-            axes=axes,
-            keepdims=keepdims,
-            name=name
-        )
-
-        return [node]
-    else:
-        node = onnx.helper.make_node(
-            'ReduceMean',
-            inputs=input_nodes,
-            outputs=[name],
-            keepdims=keepdims,
-            name=name
-        )
-
-        return [node]
-
-
-@mx_op.register("prod")
-def convert_prod(node, **kwargs):
-    """Map MXNet's prod operator attributes to onnx's ReduceProd operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    mx_axis = attrs.get("axis", None)
-    axes = convert_string_to_list(str(mx_axis)) if mx_axis is not None else None
-
-    keepdims = get_boolean_attribute_value(attrs, "keepdims")
-
-    if axes is not None:
-        node = onnx.helper.make_node(
-            'ReduceProd',
-            inputs=input_nodes,
-            outputs=[name],
-            axes=axes,
-            keepdims=keepdims,
-            name=name
-        )
-
-        return [node]
-    else:
-        node = onnx.helper.make_node(
-            'ReduceProd',
-            inputs=input_nodes,
-            outputs=[name],
-            keepdims=keepdims,
-            name=name
-        )
-
-        return [node]
-
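All four reduce converters above follow the same pattern: forward axis/keepdims, and omit the axes attribute entirely for a full reduction. A standalone sketch for ReduceMean, valid at opsets where axes is still an attribute (it became an input only at opset 18); names illustrative:

import onnx
from onnx import helper, TensorProto

x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 3, 4])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 1, 4])
# reduce over axis 1, keeping the reduced dimension
node = helper.make_node('ReduceMean', ['x'], ['y'], axes=[1], keepdims=1, name='y')
graph = helper.make_graph([node], 'reduce_example', [x], [y])
onnx.checker.check_model(helper.make_model(graph, opset_imports=[helper.make_opsetid('', 12)]))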
-
-# Arithmetic Operations
-@mx_op.register("elemwise_add")
-def convert_elementwise_add(node, **kwargs):
-    """Map MXNet's elemwise_add operator attributes to onnx's Add operator
-    and return the created node.
-    """
-    return create_basic_op_node('Add', node, kwargs)
-
-
-@mx_op.register("broadcast_add")
-def convert_broadcast_add(node, **kwargs):
-    """Map MXNet's broadcast_add operator attributes to onnx's Add operator
-    and return the created node.
-    """
-    return create_basic_op_node('Add', node, kwargs)
-
-
-@mx_op.register("elemwise_sub")
-def convert_elementwise_sub(node, **kwargs):
-    """Map MXNet's elemwise_sub operator attributes to onnx's Sub operator
-    and return the created node.
-    """
-    return create_basic_op_node('Sub', node, kwargs)
-
-@mx_op.register("broadcast_sub")
-def convert_broadcast_sub(node, **kwargs):
-    """Map MXNet's broadcast_sub operator attributes to onnx's Sub operator
-    and return the created node.
-    """
-    return create_basic_op_node('Sub', node, kwargs)
-
-@mx_op.register("elemwise_mul")
-def convert_elemwise_mul(node, **kwargs):
-    """Map MXNet's elemwise_mul operator attributes to onnx's Mul operator
-    and return the created node.
-    """
-    return create_basic_op_node('Mul', node, kwargs)
-
-@mx_op.register("broadcast_mul")
-def convert_broadcast_mul(node, **kwargs):
-    """Map MXNet's broadcast_mul operator attributes to onnx's Mul operator
-    and return the created node.
-    """
-    return create_basic_op_node('Mul', node, kwargs)
-
-@mx_op.register("elemwise_div")
-def convert_elemwise_div(node, **kwargs):
-    """Map MXNet's elemwise_div operator attributes to onnx's Div operator
-    and return the created node.
-    """
-    return create_basic_op_node('Div', node, kwargs)
-
-@mx_op.register("broadcast_div")
-def convert_broadcast_div(node, **kwargs):
-    """Map MXNet's broadcast_div operator attributes to onnx's Div operator
-    and return the created node.
-    """
-    return create_basic_op_node('Div', node, kwargs)
-
-@mx_op.register("negative")
-def convert_negative(node, **kwargs):
-    """Map MXNet's negative operator attributes to onnx's Neg operator
-    and return the created node.
-    """
-    return create_basic_op_node('Neg', node, kwargs)
-
-@mx_op.register("abs")
-def convert_abs(node, **kwargs):
-    """Map MXNet's abs operator attributes to onnx's Abs operator
-    and return the created node.
-    """
-    return create_basic_op_node('Abs', node, kwargs)
-
-@mx_op.register("add_n")
-def convert_addn(node, **kwargs):
-    """Map MXNet's add_n operator attributes to onnx's Sum operator
-    and return the created node.
-    """
-    return create_basic_op_node('Sum', node, kwargs)
-
-# Rounding
-@mx_op.register("ceil")
-def convert_ceil(node, **kwargs):
-    """Map MXNet's ceil operator attributes to onnx's Ceil operator
-    and return the created node.
-    """
-    return create_basic_op_node('Ceil', node, kwargs)
-
-@mx_op.register("floor")
-def convert_floor(node, **kwargs):
-    """Map MXNet's floor operator attributes to onnx's Floor operator
-    and return the created node.
-    """
-    return create_basic_op_node('Floor', node, kwargs)
-
-# Changing shape and type.
-@mx_op.register("Reshape")
-def convert_reshape(node, **kwargs):
-    """Map MXNet's Reshape operator attributes to onnx's Reshape operator.
-    Converts output shape attribute to output shape tensor
-    and return multiple created nodes.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    output_shape_list = convert_string_to_list(attrs["shape"])
-
-    initializer = kwargs["initializer"]
-    output_shape_np = np.array(output_shape_list, dtype='int64')
-    data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[output_shape_np.dtype]
-    dims = np.shape(output_shape_np)
-
-    output_shape_name = "reshape_attr_tensor" + str(kwargs["idx"])
-    tensor_node = onnx.helper.make_tensor_value_info(output_shape_name, data_type, dims)
-
-    initializer.append(
-        onnx.helper.make_tensor(
-            name=output_shape_name,
-            data_type=data_type,
-            dims=dims,
-            vals=output_shape_list,
-            raw=False,
-        )
-    )
-
-    input_nodes.append(output_shape_name)
-
-    not_supported_shape = [-2, -3, -4]
-
-    for val in output_shape_list:
-        if val in not_supported_shape:
-            raise AttributeError("Reshape: Shape value not supported in ONNX", val)
-
-    reshape_node = onnx.helper.make_node(
-        "Reshape",
-        input_nodes,
-        [name],
-        name=name
-    )
-
-    return [tensor_node, reshape_node]
-
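Since ONNX Reshape takes the target shape as a second int64 input rather than an attribute, the converter above materializes the attribute as an initializer. A minimal standalone equivalent (names illustrative; note that ONNX does support -1 for one inferred dimension, unlike the MXNet-specific -2/-3/-4 codes rejected above):

import onnx
from onnx import helper, TensorProto

x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 3, 4])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 12])
shape = helper.make_tensor('y_shape', TensorProto.INT64, (2,), [2, -1])  # -1 is inferred
node = helper.make_node('Reshape', ['x', 'y_shape'], ['y'], name='y')
graph = helper.make_graph([node], 'reshape_example', [x], [y], initializer=[shape])
onnx.checker.check_model(helper.make_model(graph, opset_imports=[helper.make_opsetid('', 12)]))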
-@mx_op.register("Cast")
-def convert_cast(node, **kwargs):
-    """Map MXNet's Cast operator attributes to onnx's Cast operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    dtype = attrs["dtype"]
-
-    # dtype can be mapped only with types from TensorProto
-    # float32 is mapped to float and float64 to double in onnx
-    # following tensorproto mapping https://github.com/onnx/onnx/blob/master/onnx/mapping.py
-    if dtype == 'float32':
-        dtype = 'float'
-    elif dtype == 'float64':
-        dtype = 'double'
-
-    node = onnx.helper.make_node(
-        "Cast",
-        input_nodes,
-        [name],
-        to=getattr(onnx.TensorProto, dtype.upper()),
-        name=name,
-    )
-    return [node]
-
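The comment above is the whole trick: TensorProto enum members are keyed by upper-cased ONNX type names, so the MXNet dtype string only needs the two renames before the getattr lookup. For instance:

import onnx

# 'float32' -> 'float' -> TensorProto.FLOAT
assert getattr(onnx.TensorProto, 'float'.upper()) == onnx.TensorProto.FLOAT
assert getattr(onnx.TensorProto, 'double'.upper()) == onnx.TensorProto.DOUBLE
assert getattr(onnx.TensorProto, 'int32'.upper()) == onnx.TensorProto.INT32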
-
-@mx_op.register("slice_axis")
-def convert_slice_axis(node, **kwargs):
-    """Map MXNet's slice_axis operator attributes to onnx's Slice operator
-    and return the created node.
-    """
-    name, input_nodes, input_shapes, attrs = get_inputs(node, kwargs, with_shapes=True)
-
-    axes = int(attrs.get("axis"))
-    starts = int(attrs.get("begin"))
-    ends = attrs.get("end", None)
-    if not ends or ends == 'None':
-        # ONNX doesn't support None for ends. Since ends=None depicts
-        # length of dimension, passing dimension in this case.
-        in_shape = input_shapes[0]
-        ends = in_shape[axes]
-
-    export_nodes = []
-
-    starts = np.atleast_1d(np.asarray(starts, dtype=np.int64))
-    ends = np.atleast_1d(np.asarray(ends, dtype=np.int64))
-    axes = np.atleast_1d(np.asarray(axes, dtype=np.int64))
-
-    starts_node = create_helper_tensor_node(starts, name + '__starts', kwargs)
-    export_nodes.extend(starts_node)
-    starts_node = starts_node[-1].name
-
-    ends_node = create_helper_tensor_node(ends, name + '__ends', kwargs)
-    export_nodes.extend(ends_node)
-    ends_node = ends_node[-1].name
-
-    axes_node = create_helper_tensor_node(axes, name + '__axes', kwargs)
-    export_nodes.extend(axes_node)
-    axes_node = axes_node[-1].name
-
-    input_node = input_nodes[0]
-    node = onnx.helper.make_node(
-        "Slice",
-        [input_node, starts_node, ends_node, axes_node],
-        [name],
-        name=name,
-    )
-    export_nodes.extend([node])
-
-    return export_nodes
-
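From opset 10 on, Slice takes starts/ends/axes as inputs, which is what the helper-tensor dance above builds. The end result is equivalent to this hand-built graph computing x[:, 1:3] (names illustrative):

import onnx
from onnx import helper, TensorProto

x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [4, 6])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [4, 2])
starts = helper.make_tensor('y__starts', TensorProto.INT64, (1,), [1])
ends = helper.make_tensor('y__ends', TensorProto.INT64, (1,), [3])
axes = helper.make_tensor('y__axes', TensorProto.INT64, (1,), [1])
node = helper.make_node('Slice', ['x', 'y__starts', 'y__ends', 'y__axes'], ['y'], name='y')
graph = helper.make_graph([node], 'slice_example', [x], [y],
                          initializer=[starts, ends, axes])
onnx.checker.check_model(helper.make_model(graph, opset_imports=[helper.make_opsetid('', 10)]))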
-
-@mx_op.register("SliceChannel")
-def convert_slice_channel(node, **kwargs):
-    """Map MXNet's SliceChannel operator attributes to onnx's Squeeze or Split
-    operator based on squeeze_axis attribute
-    and return the created node.
-    """
-    name, input_nodes, input_shapes, attrs = get_inputs(node, kwargs, with_shapes=True)
-
-    num_outputs = int(attrs.get("num_outputs"))
-    axis = int(attrs.get("axis", 1))
-    squeeze_axis = int(attrs.get("squeeze_axis", 0))
-
-    if squeeze_axis == 1 and num_outputs == 1:
-        node = onnx.helper.make_node(
-            "Squeeze",
-            input_nodes,
-            [name],
-            axes=[axis],
-            name=name,
-        )
-        return [node]
-    elif squeeze_axis == 0 and num_outputs > 1:
-        in_shape = input_shapes[0]
-        split = in_shape[axis] // num_outputs
-        node = onnx.helper.make_node(
-            "Split",
-            input_nodes,
-            [name+'_output'+str(i) for i in range(num_outputs)],
-            axis=axis,
-            split=[split for _ in range(num_outputs)],
-            name=name,
-        )
-        return [node]
-    else:
-        raise NotImplementedError("SliceChannel operator with num_outputs>1 and"
-                                  "squeeze_axis true is not implemented.")
-
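The Split branch above is the one converter here that emits a multi-output node; its outputs follow the name_outputN convention that downstream nodes use to reference individual slices. A standalone sketch (names illustrative):

import onnx
from onnx import helper, TensorProto

x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 6])
outs = [helper.make_tensor_value_info('s_output%d' % i, TensorProto.FLOAT, [2, 2])
        for i in range(3)]
# split 6 channels on axis 1 into three equal parts of 2
node = helper.make_node('Split', ['x'], [o.name for o in outs],
                        axis=1, split=[2, 2, 2], name='s')
graph = helper.make_graph([node], 'split_example', [x], outs)
onnx.checker.check_model(helper.make_model(graph, opset_imports=[helper.make_opsetid('', 12)]))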
-
-@mx_op.register("expand_dims")
-def convert_expand_dims(node, **kwargs):
-    """Map MXNet's expand_dims operator attributes to onnx's Unsqueeze operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    axis = int(attrs.get("axis"))
-
-    node = onnx.helper.make_node(
-        "Unsqueeze",
-        input_nodes,
-        [name],
-        axes=[axis],
-        name=name,
-    )
-    return [node]
-
-@mx_op.register("squeeze")
-def convert_squeeze(node, **kwargs):
-    """Map MXNet's squeeze operator attributes to onnx's squeeze operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    axis = attrs.get("axis", None)
-    if not axis:
-        raise AttributeError("Squeeze: Missing axis attribute: ONNX currently requires axis to "
-                             "be specified for squeeze operator")
-    axis = convert_string_to_list(axis)
-
-    node = onnx.helper.make_node(
-        "Squeeze",
-        input_nodes,
-        [name],
-        axes=axis,
-        name=name,
-    )
-    return [node]
-
-
-@mx_op.register("log")
-def convert_log(node, **kwargs):
-    """Map MXNet's log operator attributes to onnx's Log operator
-    and return the created node.
-    """
-    return create_basic_op_node('Log', node, kwargs)
-
-@mx_op.register("reciprocal")
-def convert_reciprocal(node, **kwargs):
-    """Map MXNet's reciprocal operator attributes to onnx's Reciprocal operator
-    and return the created node.
-    """
-    return create_basic_op_node('Reciprocal', node, kwargs)
-
-@mx_op.register("_power")
-def convert_power(node, **kwargs):
-    """Map MXNet's _power operator attributes to onnx's Pow operator
-    and return the created node.
-    """
-    return create_basic_op_node('Pow', node, kwargs)
-
-@mx_op.register("broadcast_power")
-def convert_broadcast_power(node, **kwargs):
-    """Map MXNet's _power operator attributes to onnx's Pow operator
-    and return the created node.
-    """
-    return create_basic_op_node('Pow', node, kwargs)
-
-@mx_op.register("sqrt")
-def convert_sqrt(node, **kwargs):
-    """Map MXNet's sqrt operator attributes to onnx's Sqrt operator
-    and return the created node.
-    """
-    return create_basic_op_node('Sqrt', node, kwargs)
-
-@mx_op.register("depth_to_space")
-def convert_depthtospace(node, **kwargs):
-    """Map MXNet's depth_to_space operator attributes to onnx's
-    DepthToSpace operator and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    blksize = int(attrs.get("block_size", 0))
-
-    node = onnx.helper.make_node(
-        "DepthToSpace",
-        input_nodes,
-        [name],
-        blocksize=blksize,
-        name=name,
-    )
-    return [node]
-
-@mx_op.register("space_to_depth")
-def convert_spacetodepth(node, **kwargs):
-    """Map MXNet's space_to_depth operator attributes to onnx's
-    SpaceToDepth operator and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    blksize = int(attrs.get("block_size", 0))
-
-    node = onnx.helper.make_node(
-        "SpaceToDepth",
-        input_nodes,
-        [name],
-        blocksize=blksize,
-        name=name,
-    )
-    return [node]
-
-@mx_op.register("square")
-def convert_square(node, **kwargs):
-    """Map MXNet's square operator attributes to onnx's Pow operator
-    and return the created node.
-    """
-    name, input_nodes, _ = get_inputs(node, kwargs)
-
-    initializer = kwargs["initializer"]
-    data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype('int64')]
-
-    power2_name = "square_tensor" + str(kwargs["idx"])
-    tensor_node = onnx.helper.make_tensor_value_info(power2_name, data_type, (1,))
-    initializer.append(
-        onnx.helper.make_tensor(
-            name=power2_name,
-            data_type=data_type,
-            dims=(1,),
-            vals=[2],
-            raw=False,
-        )
-    )
-
-    input_nodes.append(power2_name)
-
-    node = onnx.helper.make_node(
-        "Pow",
-        input_nodes,
-        [name],
-        name=name
-    )
-    return [tensor_node, node]
-
-@mx_op.register("sum")
-def convert_sum(node, **kwargs):
-    """Map MXNet's sum operator attributes to onnx's ReduceSum operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    mx_axis = attrs.get("axis", None)
-    axes = convert_string_to_list(str(mx_axis)) if mx_axis is not None else None
-
-    keepdims = get_boolean_attribute_value(attrs, "keepdims")
-
-    if axes:
-        node = onnx.helper.make_node(
-            'ReduceSum',
-            inputs=input_nodes,
-            outputs=[name],
-            axes=axes,
-            keepdims=keepdims,
-            name=name
-        )
-    else:
-        node = onnx.helper.make_node(
-            'ReduceSum',
-            inputs=input_nodes,
-            outputs=[name],
-            keepdims=keepdims,
-            name=name
-        )
-    return [node]
-
-
-@mx_op.register("shape_array")
-def convert_shape(node, **kwargs):
-    """Map MXNet's shape_array operator attributes to onnx's Shape operator
-    and return the created node.
-    """
-    return create_basic_op_node('Shape', node, kwargs)
-
-
-@mx_op.register("hard_sigmoid")
-def convert_hardsigmoid(node, **kwargs):
-    """Map MXNet's hard_sigmoid operator attributes to onnx's HardSigmoid operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    # Converting to float32
-    alpha = float(attrs.get("alpha", 0.2))
-    beta = float(attrs.get("beta", 0.5))
-
-    node = onnx.helper.make_node(
-        'HardSigmoid',
-        input_nodes,
-        [name],
-        alpha=alpha,
-        beta=beta,
-        name=name
-    )
-    return [node]
-
-@mx_op.register("broadcast_lesser")
-def convert_broadcast_lesser(node, **kwargs):
-    """Map MXNet's broadcast_lesser operator attributes to onnx's Less operator
-    and return the created node.
-    """
-    return create_basic_op_node('Less', node, kwargs)
-
-@mx_op.register("broadcast_greater")
-def convert_broadcast_greater(node, **kwargs):
-    """Map MXNet's broadcast_greater operator attributes to onnx's Greater operator
-    and return the created node.
-    """
-    return create_basic_op_node('Greater', node, kwargs)
-
-@mx_op.register("broadcast_equal")
-def convert_broadcast_equal(node, **kwargs):
-    """Map MXNet's broadcast_equal operator attributes to onnx's Equal operator
-    and return the created node.
-    """
-    return create_basic_op_node('Equal', node, kwargs)
-
-
-@mx_op.register("broadcast_logical_and")
-def convert_broadcast_logical_and(node, **kwargs):
-    """Map MXNet's broadcast logical and operator attributes to onnx's Add operator
-    and return the created node.
-    """
-    return create_basic_op_node('And', node, kwargs)
-
-
-@mx_op.register("broadcast_logical_or")
-def convert_broadcast_logical_or(node, **kwargs):
-    """Map MXNet's broadcast logical or operator attributes to onnx's Or operator
-    and return the created node.
-    """
-    return create_basic_op_node('Or', node, kwargs)
-
-
-@mx_op.register("broadcast_logical_xor")
-def convert_broadcast_logical_xor(node, **kwargs):
-    """Map MXNet's broadcast logical xor operator attributes to onnx's Xor operator
-    and return the created node.
-    """
-    return create_basic_op_node('Xor', node, kwargs)
-
-
-@mx_op.register("logical_not")
-def convert_logical_not(node, **kwargs):
-    """Map MXNet's logical not operator attributes to onnx's Not operator
-    and return the created node.
-    """
-    return create_basic_op_node('Not', node, kwargs)
-
-
-@mx_op.register("size_array")
-def convert_size(node, **kwargs):
-    """Map MXNet's size_array operator attributes to onnx's Size operator
-    and return the created node.
-    """
-    return create_basic_op_node('Size', node, kwargs)
-
-
-@mx_op.register("log_softmax")
-def convert_logsoftmax(node, **kwargs):
-    """Map MXNet's log_softmax operator attributes to onnx's LogSoftMax operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    # Converting to int
-    axis = int(attrs.get("axis", -1))
-    temp = attrs.get("temperature", 'None')
-    if temp != 'None':
-        raise AttributeError("LogSoftMax: ONNX supports only temperature=None")
-
-    node = onnx.helper.make_node(
-        'LogSoftmax',
-        input_nodes,
-        [name],
-        axis=axis,
-        name=name
-    )
-    return [node]
-
-@mx_op.register("norm")
-def convert_norm(node, **kwargs):
-    """Map MXNet's norm operator attributes to onnx's ReduceL1 and ReduceL2 operators
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    mx_axis = attrs.get("axis", None)
-    axes = convert_string_to_list(str(mx_axis)) if mx_axis else None
-
-    keepdims = get_boolean_attribute_value(attrs, "keepdims")
-    ord = int(attrs.get("ord", 2))
-
-    onnx_op_name = "ReduceL1" if ord == 1 else "ReduceL2"
-
-    if axes:
-        reduce_node = onnx.helper.make_node(
-            onnx_op_name,
-            input_nodes,
-            [name],
-            axes=axes,
-            keepdims=keepdims,
-            name=name
-        )
-        return [reduce_node]
-    else:
-        reduce_node = onnx.helper.make_node(
-            onnx_op_name,
-            input_nodes,
-            [name],
-            keepdims=keepdims,
-            name=name
-        )
-        return [reduce_node]
-
-@mx_op.register("_sample_multinomial")
-def convert_multinomial(node, **kwargs):
-    """Map MXNet's multinomial operator attributes to onnx's
-    Multinomial operator and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-    dtype = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(attrs.get("dtype", 'int32'))]
-    sample_size = convert_string_to_list(attrs.get("shape", '1'))
-    if len(sample_size) < 2:
-        sample_size = sample_size[-1]
-    else:
-        raise AttributeError("ONNX currently supports integer sample_size only")
-    node = onnx.helper.make_node(
-        "Multinomial",
-        input_nodes,
-        [name],
-        dtype=dtype,
-        sample_size=sample_size,
-        name=name,
-    )
-    return [node]
-
-
-@mx_op.register("_random_uniform")
-def convert_random_uniform(node, **kwargs):
-    """Map MXNet's random_uniform operator attributes to onnx's RandomUniform
-    operator and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    # Converting to float32
-    low = float(attrs.get("low", 0))
-    high = float(attrs.get("high", 1.0))
-    shape = convert_string_to_list(attrs.get('shape', '[]'))
-    dtype = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(attrs.get('dtype', 'float32'))]
-
-    node = onnx.helper.make_node(
-        'RandomUniform',
-        input_nodes,
-        [name],
-        low=low,
-        high=high,
-        dtype=dtype,
-        shape=shape,
-        name=name
-    )
-    return [node]
-
-
-@mx_op.register("_random_normal")
-def convert_random_normal(node, **kwargs):
-    """Map MXNet's random_normal operator attributes to onnx's RandomNormal
-    operator and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    # Converting to float32
-    mean = float(attrs.get("loc", 0))
-    scale = float(attrs.get("scale", 1.0))
-    shape = convert_string_to_list(attrs.get('shape', '[]'))
-    dtype = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(attrs.get('dtype', 'float32'))]
-
-    node = onnx.helper.make_node(
-        'RandomNormal',
-        input_nodes,
-        [name],
-        mean=mean,
-        scale=scale,
-        dtype=dtype,
-        shape=shape,
-        name=name
-    )
-    return [node]
-
-
-@mx_op.register("ROIPooling")
-def convert_roipooling(node, **kwargs):
-    """Map MXNet's ROIPooling operator attributes to onnx's MaxRoiPool
-    operator and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    pooled_shape = convert_string_to_list(attrs.get('pooled_size'))
-    scale = float(attrs.get("spatial_scale"))
-
-    node = onnx.helper.make_node(
-        'MaxRoiPool',
-        input_nodes,
-        [name],
-        pooled_shape=pooled_shape,
-        spatial_scale=scale,
-        name=name
-    )
-    return [node]
-
-
-@mx_op.register("tile")
-def convert_tile(node, **kwargs):
-    """Map MXNet's Tile operator attributes to onnx's Tile
-    operator and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    reps_list = convert_string_to_list(attrs["reps"])
-
-    initializer = kwargs["initializer"]
-    reps_shape_np = np.array(reps_list, dtype='int64')
-    data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[reps_shape_np.dtype]
-    dims = np.shape(reps_shape_np)
-
-    output_shape_name = "reps_attr_tensor" + str(kwargs["idx"])
-    tensor_node = onnx.helper.make_tensor_value_info(output_shape_name, data_type, dims)
-
-    initializer.append(
-        onnx.helper.make_tensor(
-            name=output_shape_name,
-            data_type=data_type,
-            dims=dims,
-            vals=reps_list,
-            raw=False,
-        )
-    )
-
-    input_nodes.append(output_shape_name)
-    tile_node = onnx.helper.make_node(
-        "Tile",
-        input_nodes,
-        [name],
-        name=name
-    )
-
-    return [tensor_node, tile_node]
-
-
-@mx_op.register("broadcast_to")
-def convert_broadcast_to(node, **kwargs):
-    """Map MXNet's broadcast_to operator attributes to onnx's Expand
-    operator and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    shape_list = convert_string_to_list(attrs["shape"])
-
-    initializer = kwargs["initializer"]
-    output_shape_np = np.array(shape_list, dtype='int64')
-    data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[output_shape_np.dtype]
-    dims = np.shape(output_shape_np)
-
-    output_shape_name = "expand_attr_tensor" + str(kwargs["idx"])
-    tensor_node = onnx.helper.make_tensor_value_info(output_shape_name, data_type, dims)
-
-    initializer.append(
-        onnx.helper.make_tensor(
-            name=output_shape_name,
-            data_type=data_type,
-            dims=dims,
-            vals=shape_list,
-            raw=False,
-        )
-    )
-
-    input_nodes.append(output_shape_name)
-    expand_node = onnx.helper.make_node(
-        "Expand",
-        input_nodes,
-        [name],
-        name=name
-    )
-
-    return [tensor_node, expand_node]
-
-
-@mx_op.register("topk")
-def convert_topk(node, **kwargs):
-    """Map MXNet's topk operator attributes to onnx's TopK operator
-    and return the created node.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    axis = int(attrs.get('axis', '-1'))
-    k = int(attrs.get('k', '1'))
-    ret_type = attrs.get('ret_typ')
-    dtype = attrs.get('dtype')
-    outputs = [name + '_output0']
-
-    if ret_type and ret_type == 'both':
-        if dtype and dtype == 'int64':
-            outputs.append(name + '_output1')
-        else:
-            raise NotImplementedError("ONNX expects indices to be of type int64")
-    else:
-        raise NotImplementedError("ONNX expects both value and indices as output")
-
-    opset_version = kwargs['opset_version']
-    if opset_version >= 10:
-        from onnx.helper import make_tensor, make_tensor_value_info
-        initializer = kwargs["initializer"]
-        k_input_name = name + "_k"
-        k_input_type = onnx.TensorProto.INT64
-        k_value_node = make_tensor_value_info(k_input_name, k_input_type, (1,))
-        k_tensor_node = make_tensor(k_input_name, k_input_type, (1,), (k, ))
-        initializer.append(k_tensor_node)
-        input_nodes.append(k_input_name)
-
-        topk_node = onnx.helper.make_node(
-            "TopK",
-            input_nodes,
-            outputs,
-            axis=axis,
-            name=name
-        )
-        return [k_value_node, topk_node]
-    else:
-        topk_node = onnx.helper.make_node(
-            "TopK",
-            input_nodes,
-            outputs,
-            axis=axis,
-            k=k,
-            name=name
-        )
-        return [topk_node]
-
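For opset >= 10, TopK takes k as a 1-D int64 tensor input and always yields two outputs (values, then int64 indices), which is why the converter requires ret_typ='both' with dtype='int64'. A standalone sketch (names illustrative):

import onnx
from onnx import helper, TensorProto

x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [3, 5])
vals = helper.make_tensor_value_info('t_output0', TensorProto.FLOAT, [3, 2])
inds = helper.make_tensor_value_info('t_output1', TensorProto.INT64, [3, 2])
k = helper.make_tensor('t_k', TensorProto.INT64, (1,), [2])  # opset 10+ wants a 1-D tensor
node = helper.make_node('TopK', ['x', 't_k'], ['t_output0', 't_output1'], axis=-1, name='t')
graph = helper.make_graph([node], 'topk_example', [x], [vals, inds], initializer=[k])
onnx.checker.check_model(helper.make_model(graph, opset_imports=[helper.make_opsetid('', 11)]))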
-
-@mx_op.register("take")
-def convert_take(node, **kwargs):
-    """Map MXNet's Take operator attributes to onnx's Gather operator.
-    """
-    name, input_nodes, attrs = get_inputs(node, kwargs)
-
-    axis = int(attrs.get('axis', 0))
-
-    node = onnx.helper.make_node(
-        "Gather",
-        input_nodes,
-        [name],
-        axis=axis,
-        name=name,
-    )
-    return [node]
diff --git a/python/mxnet/contrib/onnx/mx2onnx/export_model.py b/python/mxnet/contrib/onnx/mx2onnx/export_model.py
deleted file mode 100644
index 2fc7760..0000000
--- a/python/mxnet/contrib/onnx/mx2onnx/export_model.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# coding: utf-8
-#pylint: disable-msg=too-many-arguments
-
-"""Exports an MXNet model to the ONNX model format"""
-import logging
-import numpy as np
-
-from ....base import string_types
-from .... import symbol
-from .export_onnx import MXNetGraph
-from ._export_helper import load_module
-
-
-def export_model(sym, params, input_shape, input_type=np.float32,
-                 onnx_file_path='model.onnx', verbose=False, opset_version=None):
-    """Exports the MXNet model file, passed as a parameter, into ONNX model.
-    Accepts both symbol,parameter objects as well as json and params filepaths as input.
-    Operator support and coverage -
-    https://cwiki.apache.org/confluence/display/MXNET/ONNX+Operator+Coverage
-
-    Parameters
-    ----------
-    sym : str or symbol object
-        Path to the json file or Symbol object
-    params : str or dict
-        Path to the params file or params dictionary. (Including both arg_params and aux_params)
-    input_shape : List of tuple
-        Input shape of the model e.g [(1,3,224,224)]
-    input_type : data type
-        Input data type e.g. np.float32
-    onnx_file_path : str
-        Path where to save the generated onnx file
-    verbose : Boolean
-        If true, prints logs of the model conversion
-    opset_version : Int
-        ONNX opset version to use for export, defaults to the latest supported by the onnx package
-
-    Returns
-    -------
-    onnx_file_path : str
-        Onnx file path
-
-    Notes
-    -----
-    This method is available when you ``import mxnet.contrib.onnx``
-
-    """
-
-    try:
-        from onnx import helper, mapping
-        from onnx.defs import onnx_opset_version
-    except ImportError:
-        raise ImportError("Onnx and protobuf need to be installed. "
-                          + "Instructions to install - https://github.com/onnx/onnx")
-
-    converter = MXNetGraph()
-    if opset_version is None:
-        # default is to use latest opset version the onnx package supports
-        opset_version = onnx_opset_version()
-
-    data_format = np.dtype(input_type)
-    # if input parameters are strings(file paths), load files and create symbol parameter objects
-    if isinstance(sym, string_types) and isinstance(params, string_types):
-        logging.info("Converting json and weight file to sym and params")
-        sym_obj, params_obj = load_module(sym, params)
-        onnx_graph = converter.create_onnx_graph_proto(sym_obj, params_obj, input_shape,
-                                                       mapping.NP_TYPE_TO_TENSOR_TYPE[data_format],
-                                                       verbose=verbose, opset_version=opset_version)
-    elif isinstance(sym, symbol.Symbol) and isinstance(params, dict):
-        onnx_graph = converter.create_onnx_graph_proto(sym, params, input_shape,
-                                                       mapping.NP_TYPE_TO_TENSOR_TYPE[data_format],
-                                                       verbose=verbose, opset_version=opset_version)
-    else:
-        raise ValueError("Input sym and params should either be files or objects")
-
-    # Create the model (ModelProto)
-    onnx_model = helper.make_model(onnx_graph)
-
-    # Save model on disk
-    with open(onnx_file_path, "wb") as file_handle:
-        serialized = onnx_model.SerializeToString()
-        file_handle.write(serialized)
-        logging.info("Input shape of the model %s ", input_shape)
-        logging.info("Exported ONNX file %s saved to disk", onnx_file_path)
-
-    return onnx_file_path
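Typical use of this (since removed) contrib entry point looked like the sketch below; the json/params file paths are placeholders, and the call assumes the pre-removal mxnet.contrib.onnx package:

import numpy as np
from mxnet.contrib import onnx as onnx_mxnet

# 'model-symbol.json' / 'model-0000.params' are placeholder paths
onnx_path = onnx_mxnet.export_model('model-symbol.json', 'model-0000.params',
                                    [(1, 3, 224, 224)], np.float32, 'model.onnx',
                                    verbose=True)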
diff --git a/python/mxnet/contrib/onnx/mx2onnx/export_onnx.py b/python/mxnet/contrib/onnx/mx2onnx/export_onnx.py
deleted file mode 100644
index 07fdabd..0000000
--- a/python/mxnet/contrib/onnx/mx2onnx/export_onnx.py
+++ /dev/null
@@ -1,321 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# Based on
-# https://github.com/NVIDIA/mxnet_to_onnx/blob/master/mx2onnx_converter/mx2onnx_converter.py#
-#  Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
-#
-#  Redistribution and use in source and binary forms, with or without
-#  modification, are permitted provided that the following conditions
-#  are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-#  PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-#  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-# coding: utf-8
-# pylint: disable=invalid-name,too-many-locals,no-self-use,too-many-arguments,
-# pylint: disable=maybe-no-member,too-many-nested-blocks
-"""MXNet to ONNX graph converter functions"""
-import logging
-import json
-
-from .... import ndarray as nd
-
-
-class MXNetGraph(object):
-    """Class to convert MXNet to ONNX graph"""
-    registry_ = {}
-    input_output_maps_ = {}
-
-    def __init__(self):
-        # topologically sorted nodes
-        self.nodes = []
-        self.input_tensors = []
-        self.output_tensors = []
-
-    @staticmethod
-    def register(op_name):
-        """Register operators"""
-        def wrapper(func):
-            """Helper function to map functions"""
-            try:
-                import onnx as _
-                MXNetGraph.registry_[op_name] = func
-            except ImportError:
-                pass
-            return func
-
-        return wrapper
-
-    @staticmethod
-    def convert_layer(node, **kwargs):
-        """Convert MXNet layer to ONNX"""
-        op = str(node["op"])
-        if op not in MXNetGraph.registry_:
-            raise AttributeError("No conversion function registered for op type %s yet." % op)
-        convert_func = MXNetGraph.registry_[op]
-        return convert_func(node, **kwargs)
-
-    @staticmethod
-    def split_params(sym, params):
-        """Helper function to split params dictionary into args and aux params
-
-        Parameters
-        ----------
-        sym : :class:`~mxnet.symbol.Symbol`
-            MXNet symbol object
-        params : dict of ``str`` to :class:`~mxnet.ndarray.NDArray`
-            Dict of converted parameters stored in ``mxnet.ndarray.NDArray`` format
-
-        Returns
-        -------
-        arg_params : dict of ``str`` to :class:`~mxnet.ndarray.NDArray`
-            Dict of converted parameters stored in ``mxnet.ndarray.NDArray`` format
-        aux_params : dict of ``str`` to :class:`~mxnet.ndarray.NDArray`
-            Dict of converted parameters stored in ``mxnet.ndarray.NDArray`` format
-        """
-        arg_params = {}
-        aux_params = {}
-        for args in sym.list_arguments():
-            if args in params:
-                arg_params.update({args: nd.array(params[args])})
-        for aux in sym.list_auxiliary_states():
-            if aux in params:
-                aux_params.update({aux: nd.array(params[aux])})
-        return arg_params, aux_params
-
-    @staticmethod
-    def get_outputs(sym, params, in_shape, in_label, verbose=True):
-        """ Infer output shapes and return dictionary of output name to shape
-
-        :param :class:`~mxnet.symbol.Symbol` sym: symbol to perform shape inference on
-        :param dict of (str, nd.NDArray) params: model parameters
-        :param list of tuple(int, ...) in_shape: list of all input shapes
-        :param in_label: name of the label typically used in loss that may be left in the graph.
-            This name is removed from the list of inputs required by the symbol
-        :param verbose: If false, info logging messages are deactivated
-        :return: dictionary of output name to shape
-        :rtype: dict of (str, tuple(int, ...))
-        """
-        # remove any input listed in params from sym.list_inputs() and bind them to the input shapes provided
-        # by user. Also remove in_label, which is the name of the label symbol that may have been used
-        # as the label for loss during training.
-        inputs = {n: tuple(s) for n, s in zip([n for n in sym.list_inputs() if n not in params and n != in_label],
-                                              in_shape)}
-        # Add params and their shape to list of inputs
-        inputs.update({n: v.shape for n, v in params.items() if n in sym.list_inputs()})
-        # Provide input data as well as input params to infer_shape()
-        _, out_shapes, _ = sym.infer_shape(**inputs)
-
-        out_names = list()
-        for name in sym.list_outputs():
-            if name.endswith('_output'):
-                out_names.append(name[:-len('_output')])
-            else:
-                if verbose:
-                    logging.info("output '%s' does not end with '_output'", name)
-                out_names.append(name)
-
-        assert len(out_shapes) == len(out_names)
-        # bind output shapes with output names
-        graph_outputs = {n: s for n, s in zip(out_names, out_shapes)}
-
-        return graph_outputs
-
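The shape binding above can be seen with a plain symbol; a small sketch, assuming mxnet is importable (the layer name 'fc' is illustrative):

import mxnet as mx

data = mx.sym.Variable('data')
fc = mx.sym.FullyConnected(data=data, num_hidden=10, name='fc')
# weight/bias shapes are inferred from the data shape alone
_, out_shapes, _ = fc.infer_shape(data=(1, 4))
# out_shapes == [(1, 10)]; the output name 'fc_output' is stripped to 'fc'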
-    @staticmethod
-    def convert_weights_to_numpy(weights_dict):
-        """Convert weights to numpy"""
-        return dict([(k.replace("arg:", "").replace("aux:", ""), v.asnumpy())
-                     for k, v in weights_dict.items()])
-
-    def create_onnx_graph_proto(self, sym, params, in_shape, in_type, verbose=False, opset_version=None):
-        """Convert MXNet graph to ONNX graph
-
-        Parameters
-        ----------
-        sym : :class:`~mxnet.symbol.Symbol`
-            MXNet symbol object
-        params : dict of ``str`` to :class:`~mxnet.ndarray.NDArray`
-            Dict of converted parameters stored in ``mxnet.ndarray.NDArray`` format
-        in_shape : List of tuple
-            Input shape of the model e.g [(1,3,224,224)]
-        in_type : data type
-            Input data type e.g. np.float32
-        verbose : Boolean
-            If true will print logs of the model conversion
-        opset_version : Int
-            ONNX opset version to use for export, defaults to latest supported by onnx package
-
-        Returns
-        -------
-        graph : GraphProto
-            ONNX graph
-        """
-        try:
-            from onnx import (checker, helper, NodeProto, ValueInfoProto, TensorProto)
-            from onnx.helper import make_tensor_value_info
-            from onnx.defs import onnx_opset_version
-        except ImportError:
-            raise ImportError("Onnx and protobuf need to be installed. "
-                              + "Instructions to install - https://github.com/onnx/onnx")
-
-        if opset_version is None:
-            opset_version = onnx_opset_version()
-
-        # When an MXNet model is saved to a json file, MXNet adds a node for the label.
-        # The name of this node is the name of the last node + "_label" (i.e. if the last
-        # node is named "Softmax", this node will be named "Softmax_label"). Also, the new
-        # node will always be the second to last node in the json graph.
-        # Derive the output_label name.
-        output_label = sym.get_internals()[len(sym.get_internals()) - 1].name + "_label"
-
-        weights = MXNetGraph.convert_weights_to_numpy(params)
-
-        mx_graph = json.loads(sym.tojson())["nodes"]
-
-        initializer = []
-        all_processed_nodes = []
-        onnx_processed_nodes = []
-        onnx_processed_inputs = []
-        onnx_processed_outputs = []
-        index_lookup = []
-
-        # Determine output and internal shapes
-        graph_outputs = MXNetGraph.get_outputs(sym, params, in_shape, output_label)
-        graph_shapes = MXNetGraph.get_outputs(sym.get_internals(), params, in_shape, output_label, verbose=False)
-
-        graph_input_idx = 0
-        for idx, node in enumerate(mx_graph):
-            op = node["op"]
-            name = node["name"]
-            if verbose:
-                logging.info("Converting idx: %d, op: %s, name: %s", idx, op, name)
-
-            # A node is an input node if its op_name is "null" and is not
-            # in params dict
-            if op == "null" and name not in params:
-                # Handling graph input
-
-                # Skipping output_label node, as this node is not part of graph
-                # Refer "output_label" assignment above for more details.
-                if name == output_label:
-                    continue
-                converted = MXNetGraph.convert_layer(
-                    node,
-                    is_input=True,
-                    mx_graph=mx_graph,
-                    weights=weights,
-                    in_shape=in_shape[graph_input_idx],
-                    in_type=in_type,
-                    proc_nodes=all_processed_nodes,
-                    graph_shapes=graph_shapes,
-                    initializer=initializer,
-                    index_lookup=index_lookup)
-                graph_input_idx += 1
-
-            else:
-                # Handling graph layers
-                converted = MXNetGraph.convert_layer(
-                    node,
-                    is_input=False,
-                    mx_graph=mx_graph,
-                    weights=weights,
-                    in_shape=in_shape,
-                    in_type=in_type,
-                    proc_nodes=all_processed_nodes,
-                    graph_shapes=graph_shapes,
-                    initializer=initializer,
-                    index_lookup=index_lookup,
-                    idx=idx,
-                    opset_version=opset_version
-                )
-
-            if isinstance(converted, list):
-                # Iterate for all converted nodes
-                for converted_node in converted:
-                    # If converted node is ValueInfoProto, add it in inputs
-                    if isinstance(converted_node, ValueInfoProto):
-                        onnx_processed_inputs.append(converted_node)
-                    # If converted node is NodeProto, add it in processed nodes list
-                    elif isinstance(converted_node, NodeProto):
-                        onnx_processed_nodes.append(converted_node)
-                        # some operators have multiple outputs,
-                        # therefore, check all output node names
-                        node_names = list(converted_node.output)
-                        for nodename in node_names:
-                            if nodename in graph_outputs:
-                                onnx_processed_outputs.append(
-                                    make_tensor_value_info(
-                                        name=nodename,
-                                        elem_type=in_type,
-                                        shape=graph_outputs[nodename]
-                                    )
-                                )
-                                if verbose:
-                                    logging.info("Output node is: %s", nodename)
-                    elif isinstance(converted_node, TensorProto):
-                        raise ValueError("Did not expect TensorProto")
-                    else:
-                        raise ValueError("node is of an unrecognized type: %s" % type(node))
-
-                    all_processed_nodes.append(converted_node)
-
-                if idx > 0:
-                    # Handling extra node added to the graph if the MXNet model was
-                    # saved to json file,
-                    # refer "output_label" initialization above for more details.
-                    # if extra node was added then prev_index to the last node is adjusted.
-                    if idx == (len(mx_graph) - 1) and \
-                            mx_graph[len(mx_graph)-2]["name"] == output_label:
-                        prev_index = index_lookup[idx - 2]
-                    else:
-                        prev_index = index_lookup[idx - 1]
-
-                    index_lookup.append(prev_index+len(converted))
-                else:
-                    index_lookup.append(len(converted) - 1)
-            else:
-                logging.info("Operator converter function should always return a list")
-
-        graph = helper.make_graph(
-            onnx_processed_nodes,
-            "mxnet_converted_model",
-            onnx_processed_inputs,
-            onnx_processed_outputs
-        )
-
-        graph.initializer.extend(initializer)
-
-        checker.check_graph(graph)
-        return graph
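The registry_ populated by MXNetGraph.register is the only dispatch mechanism convert_layer uses, so an unsupported operator could in principle be handled by registering a converter from user code. A hypothetical sketch against the pre-removal module path:

from mxnet.contrib.onnx.mx2onnx.export_onnx import MXNetGraph as mx_op

@mx_op.register("my_custom_op")  # hypothetical MXNet op name
def convert_my_custom_op(node, **kwargs):
    # node is the raw graph-json dict; kwargs carries weights, in_shape,
    # initializer, index_lookup, opset_version, ... as passed by convert_layer
    raise NotImplementedError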
diff --git a/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py b/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py
deleted file mode 100644
index 418fb08..0000000
--- a/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# coding: utf-8
-# pylint: disable=invalid-name
-"""Operator attributes conversion"""
-from ._op_translations import identity, random_uniform, random_normal, sample_multinomial
-from ._op_translations import absolute, negative, add_n
-from ._op_translations import tanh, arccos, arcsin, arctan, _cos, _sin, _tan
-from ._op_translations import softplus, shape, gather, lp_pooling, size
-from ._op_translations import ceil, floor, hardsigmoid, global_lppooling
-from ._op_translations import concat, hardmax, topk
-from ._op_translations import leaky_relu, _elu, _prelu, _selu, softmax, fully_connected
-from ._op_translations import global_avgpooling, global_maxpooling, linalg_gemm
-from ._op_translations import sigmoid, pad, relu, matrix_multiplication, batch_norm
-from ._op_translations import dropout, local_response_norm, conv, deconv
-from ._op_translations import reshape, cast, split, _slice, transpose, squeeze, flatten
-from ._op_translations import reciprocal, squareroot, power, exponent, _log, unsqueeze
-from ._op_translations import reduce_max, reduce_mean, reduce_min, reduce_sum
-from ._op_translations import reduce_prod, avg_pooling, max_pooling, instance_norm
-from ._op_translations import argmax, argmin, maximum, minimum
-from ._op_translations import clip, reduce_log_sum, reduce_log_sum_exp
-from ._op_translations import reduce_sum_square, reduce_l1, reduce_l2, max_roi_pooling
-from ._op_translations import log_softmax, softsign, lesser, greater, equal
-from ._op_translations import logical_and, logical_or, logical_xor, logical_not
-from ._op_translations import mean, depthtospace, spacetodepth, lpnormalization
-
-# _convert_map maps ONNX operator names to converter functions (callables)
-# defined in the _op_translations module.
-_convert_map = {
-    # Generator Functions
-    'Constant'          : identity,
-    'RandomUniform'     : random_uniform,
-    'RandomNormal'      : random_normal,
-    'RandomUniformLike' : random_uniform,
-    'RandomNormalLike'  : random_normal,
-    'Multinomial'       : sample_multinomial,
-    # Arithmetic Operators
-    'Abs'               : absolute,
-    'Neg'               : negative,
-    'Sum'               : add_n, #elemwise sum
-    #Hyperbolic functions
-    'Tanh'              : tanh,
-    # Rounding
-    'Ceil'              : ceil,
-    'Floor'             : floor,
-    # Joining and splitting
-    'Concat'            : concat,
-    # Basic neural network functions
-    'Sigmoid'           : sigmoid,
-    'Relu'              : relu,
-    'Pad'               : pad,
-    'MatMul'            : matrix_multiplication, #linalg_gemm2
-    'Conv'              : conv,
-    'ConvTranspose'     : deconv,
-    'BatchNormalization': batch_norm,
-    'SpatialBN'         : batch_norm,
-    'LeakyRelu'         : leaky_relu,
-    'Elu'               : _elu,
-    'PRelu'             : _prelu,
-    'Selu'              : _selu,
-    'Softmax'           : softmax,
-    'FC'                : fully_connected,
-    'GlobalAveragePool' : global_avgpooling,
-    'GlobalMaxPool'     : global_maxpooling,
-    'GlobalLpPool'      : global_lppooling,
-    'Gemm'              : linalg_gemm,
-    'LRN'               : local_response_norm,
-    'Dropout'           : dropout,
-    # Changing shape and type.
-    'Reshape'           : reshape,
-    'Cast'              : cast,
-    'Split'             : split,
-    'Slice'             : _slice,
-    'Transpose'         : transpose,
-    'Squeeze'           : squeeze,
-    'Unsqueeze'         : unsqueeze,
-    'Flatten'           : flatten,
-    'Identity'          : identity,
-    #Powers
-    'Reciprocal'        : reciprocal,
-    'Sqrt'              : squareroot,
-    'Pow'               : power,
-    'Exp'               : exponent,
-    'Log'               : _log,
-    # Reduce Functions
-    'ReduceMax'         : reduce_max,
-    'ReduceMean'        : reduce_mean,
-    'ReduceMin'         : reduce_min,
-    'ReduceSum'         : reduce_sum,
-    'ReduceProd'        : reduce_prod,
-    'AveragePool'       : avg_pooling,
-    'MaxPool'           : max_pooling,
-    # Sorting and Searching
-    'ArgMax'            : argmax,
-    'ArgMin'            : argmin,
-    'Max'               : maximum,
-    'Min'               : minimum,
-    'Clip'              : clip,
-    'ReduceLogSum'      : reduce_log_sum,
-    'ReduceLogSumExp'   : reduce_log_sum_exp,
-    'ReduceSumSquare'   : reduce_sum_square,
-    'ReduceL1'          : reduce_l1,
-    'ReduceL2'          : reduce_l2,
-    'MaxRoiPool'        : max_roi_pooling,
-    'InstanceNormalization' : instance_norm,
-    'LogSoftmax'        : log_softmax,
-    'Softsign'          : softsign,
-    'Less'              : lesser,
-    'Greater'           : greater,
-    'Equal'             : equal,
-    'And'               : logical_and,
-    'Xor'               : logical_xor,
-    'Not'               : logical_not,
-    'Or'                : logical_or,
-    'Mean'              : mean,
-    'Acos'              : arccos,
-    'Asin'              : arcsin,
-    'Atan'              : arctan,
-    'Cos'               : _cos,
-    'Sin'               : _sin,
-    'Softplus'          : softplus,
-    'Tan'               : _tan,
-    'Shape'             : shape,
-    'Size'              : size,
-    'Gather'            : gather,
-    'HardSigmoid'       : hardsigmoid,
-    'LpPool'            : lp_pooling,
-    'DepthToSpace'      : depthtospace,
-    'SpaceToDepth'      : spacetodepth,
-    'Hardmax'           : hardmax,
-    'LpNormalization'   : lpnormalization,
-    'TopK'              : topk
-}
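
For illustration, a minimal sketch of how this dispatch table was used (it assumes an MXNet build that still ships the old contrib.onnx.onnx2mx package; the Relu converter ignores proto_obj, so None suffices here):

    import mxnet as mx
    from mxnet.contrib.onnx.onnx2mx._import_helper import _convert_map

    data = mx.sym.Variable('data')
    # Look up the converter for an ONNX op type and invoke it
    op_name, new_attrs, inputs = _convert_map['Relu']({}, [data], None)
    print(op_name, new_attrs)   # -> relu {}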
diff --git a/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py b/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py
deleted file mode 100644
index 76c8e61..0000000
--- a/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py
+++ /dev/null
@@ -1,818 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# coding: utf-8
-""" Module for translating ONNX operators into Mxnet operatoes"""
-# pylint: disable=unused-argument,protected-access
-import numpy as np
-from . import _translation_utils as translation_utils
-from .... import symbol
-# Method definitions for the callable objects mapped in the import_helper module
-
-def identity(attrs, inputs, proto_obj):
-    """Returns the identity function of the input."""
-    return 'identity', attrs, inputs
-
-def random_uniform(attrs, inputs, proto_obj):
-    """Draw random samples from a uniform distribtuion."""
-    try:
-        from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE
-    except ImportError:
-        raise ImportError("Onnx and protobuf need to be installed. "
-                          "Instructions to install - https://github.com/onnx/onnx")
-    new_attrs = translation_utils._remove_attributes(attrs, ['seed'])
-    new_attrs['dtype'] = TENSOR_TYPE_TO_NP_TYPE[int(new_attrs.get('dtype', 1))]
-    return 'random_uniform', new_attrs, inputs
-
-def random_normal(attrs, inputs, proto_obj):
-    """Draw random samples from a Gaussian distribution."""
-    try:
-        from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE
-    except ImportError:
-        raise ImportError("Onnx and protobuf need to be installed. "
-                          "Instructions to install - https://github.com/onnx/onnx")
-    new_attr = translation_utils._remove_attributes(attrs, ['seed'])
-    new_attr = translation_utils._fix_attribute_names(new_attr, {'mean': 'loc'})
-    new_attr['dtype'] = TENSOR_TYPE_TO_NP_TYPE[int(new_attr.get('dtype', 1))]
-    return 'random_normal', new_attr, inputs
-
-def sample_multinomial(attrs, inputs, proto_obj):
-    """Draw random samples from a multinomial distribution."""
-    try:
-        from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE
-    except ImportError:
-        raise ImportError("Onnx and protobuf need to be installed. "
-                          + "Instructions to install - https://github.com/onnx/onnx")
-    new_attrs = translation_utils._remove_attributes(attrs, ['seed'])
-    new_attrs = translation_utils._fix_attribute_names(new_attrs, {'sample_size': 'shape'})
-    new_attrs['dtype'] = TENSOR_TYPE_TO_NP_TYPE[int(attrs.get('dtype', 6))]
-    return 'sample_multinomial', new_attrs, inputs
-
-def mean(attrs, inputs, proto_obj):
-    """Mean of all the input tensors."""
-    concat_input = [symbol.expand_dims(op_input, axis=0) for op_input in inputs]
-    concat_sym = symbol.concat(*concat_input, dim=0)
-    mean_sym = symbol.mean(concat_sym, axis=0)
-    return mean_sym, attrs, inputs
-
-def logical_and(attrs, inputs, proto_obj):
-    """Logical and of two input arrays."""
-    return 'broadcast_logical_and', attrs, inputs
-
-def logical_or(attrs, inputs, proto_obj):
-    """Logical or of two input arrays."""
-    return 'broadcast_logical_or', attrs, inputs
-
-def logical_xor(attrs, inputs, proto_obj):
-    """Logical xor of two input arrays."""
-    return 'broadcast_logical_xor', attrs, inputs
-
-def logical_not(attrs, inputs, proto_obj):
-    """Logical not of two input arrays."""
-    return 'logical_not', attrs, inputs
-
-def absolute(attrs, inputs, proto_obj):
-    """Returns element-wise absolute value of the input."""
-    return 'abs', attrs, inputs
-
-def negative(attrs, inputs, proto_obj):
-    """Negation of every element in a tensor"""
-    return 'negative', attrs, inputs
-
-def add_n(attrs, inputs, proto_obj):
-    """Elementwise sum of arrays"""
-    return 'add_n', attrs, inputs
-
-# Sorting and Searching
-def argmax(attrs, inputs, proto_obj):
-    """Returns indices of the maximum values along an axis"""
-    axis = attrs.get('axis', 0)
-    keepdims = attrs.get('keepdims', 1)
-    argmax_op = symbol.argmax(inputs[0], axis=axis, keepdims=keepdims)
-    # onnx argmax operator always expects int64 as output type
-    cast_attrs = {'dtype': 'int64'}
-    return 'cast', cast_attrs, argmax_op
-
-def argmin(attrs, inputs, proto_obj):
-    """Returns indices of the minimum values along an axis."""
-    axis = attrs.get('axis', 0)
-    keepdims = attrs.get('keepdims', 1)
-    argmin_op = symbol.argmin(inputs[0], axis=axis, keepdims=keepdims)
-    # onnx argmin operator always expects int64 as output type
-    cast_attrs = {'dtype': 'int64'}
-    return 'cast', cast_attrs, argmin_op
-
-def maximum(attrs, inputs, proto_obj):
-    """
-    Elementwise maximum of arrays.
-    MXNet maximum compares only two symbols at a time.
-    ONNX can send more than two to compare.
-    Breaking into multiple mxnet ops to compare two symbols at a time
-    """
-    if len(inputs) > 1:
-        mxnet_op = symbol.maximum(inputs[0], inputs[1])
-        for op_input in inputs[2:]:
-            mxnet_op = symbol.maximum(mxnet_op, op_input)
-    else:
-        mxnet_op = symbol.maximum(inputs[0], inputs[0])
-    return mxnet_op, attrs, inputs
-
-def minimum(attrs, inputs, proto_obj):
-    """Elementwise minimum of arrays."""
-    # MXNet minimum compares only two symbols at a time.
-    # ONNX can send more than two to compare.
-    # Breaking into multiple mxnet ops to compare two symbols at a time
-    if len(inputs) > 1:
-        mxnet_op = symbol.minimum(inputs[0], inputs[1])
-        for op_input in inputs[2:]:
-            mxnet_op = symbol.minimum(mxnet_op, op_input)
-    else:
-        mxnet_op = symbol.minimum(inputs[0], inputs[0])
-    return mxnet_op, attrs, inputs
-
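The same pairwise folding over a variadic input list can be sketched in plain NumPy (values are illustrative):

    import numpy as np
    from functools import reduce

    inputs = [np.array([1., 5.]), np.array([4., 2.]), np.array([3., 3.])]
    print(reduce(np.maximum, inputs))   # [4. 5.]
    print(reduce(np.minimum, inputs))   # [1. 2.]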
-def lesser(attrs, inputs, proto_obj):
-    """Logical Lesser operator with broadcasting."""
-    return 'broadcast_lesser', attrs, inputs
-
-def greater(attrs, inputs, proto_obj):
-    """Logical Greater operator with broadcasting."""
-    return 'broadcast_greater', attrs, inputs
-
-def equal(attrs, inputs, proto_obj):
-    """Logical Equal operator with broadcasting."""
-    return 'broadcast_equal', attrs, inputs
-
-#Hyperbolic functions
-def tanh(attrs, inputs, proto_obj):
-    """Returns the hyperbolic tangent of the input array."""
-    return 'tanh', attrs, inputs
-
-# Rounding
-def ceil(attrs, inputs, proto_obj):
-    """ Calculate ceil value for input """
-    return 'ceil', attrs, inputs
-
-def floor(attrs, inputs, proto_obj):
-    """ Calculate floor value for input """
-    return 'floor', attrs, inputs
-
-# Joining and splitting
-def concat(attrs, inputs, proto_obj):
-    """ Joins input arrays along a given axis. """
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'axis': 'dim'})
-    return 'concat', new_attrs, inputs
-
-# Basic neural network functions
-def softsign(attrs, inputs, proto_obj):
-    """Computes softsign of x element-wise."""
-    return 'softsign', attrs, inputs
-
-def sigmoid(attrs, inputs, proto_obj):
-    """Computes elementwise sigmoid of the input array"""
-    return 'sigmoid', attrs, inputs
-
-def hardsigmoid(attrs, inputs, proto_obj):
-    """Computes elementwise hard sigmoid of the input array"""
-    return 'hard_sigmoid', attrs, inputs
-
-def relu(attrs, inputs, proto_obj):
-    """Computes rectified linear function."""
-    return 'relu', attrs, inputs
-
-def pad(attrs, inputs, proto_obj):
-    """ Add padding to input tensor"""
-    opset_version = proto_obj.opset_version
-    if 'mode' not in attrs.keys():
-        attrs['mode'] = 'constant'
-    if opset_version >= 11:
-        pads = list(proto_obj._params[inputs[1].name].asnumpy())
-        pads = tuple([int(i) for i in pads])
-        new_attrs = translation_utils._add_extra_attributes(attrs, {'pad_width': pads})
-        if len(inputs) == 3:
-            const = proto_obj._params[inputs[2].name].asnumpy()[0]
-            new_attrs = translation_utils._add_extra_attributes(new_attrs, {'constant_value': const})
-        new_attrs['pad_width'] = translation_utils._pad_sequence_fix(new_attrs.get('pad_width'))
-        return 'pad', new_attrs, inputs[0]
-    else:
-        new_attrs = translation_utils._fix_attribute_names(attrs, {'pads'  : 'pad_width',
-                                                                   'value' : 'constant_value'
-                                                                  })
-        new_attrs['pad_width'] = translation_utils._pad_sequence_fix(new_attrs.get('pad_width'))
-        return 'pad', new_attrs, inputs
-
-def matrix_multiplication(attrs, inputs, proto_obj):
-    """Performs general matrix multiplication"""
-    return 'linalg_gemm2', attrs, inputs
-
-def batch_norm(attrs, inputs, proto_obj):
-    """Batch normalization."""
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'epsilon': 'eps',
-                                                               'is_test': 'fix_gamma'})
-    new_attrs = translation_utils._remove_attributes(new_attrs,
-                                                     ['spatial', 'consumed_inputs'])
-    # Disable cuDNN BN only if the model's epsilon is less than the minimum cuDNN eps (1e-5)
-    cudnn_min_eps = 1e-5
-    cudnn_off = 0 if attrs.get('epsilon', cudnn_min_eps) >= cudnn_min_eps else 1
-    new_attrs = translation_utils._add_extra_attributes(new_attrs, {'cudnn_off': cudnn_off})
-
-    # in test mode "fix_gamma" should be unset.
-    new_attrs['fix_gamma'] = not attrs.get('is_test', 1)
-    return 'BatchNorm', new_attrs, inputs
-
-def instance_norm(attrs, inputs, proto_obj):
-    """Instance Normalization."""
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'epsilon' : 'eps'})
-    new_attrs['eps'] = attrs.get('epsilon', 1e-5)
-    return 'InstanceNorm', new_attrs, inputs
-
-def leaky_relu(attrs, inputs, proto_obj):
-    """Leaky Relu function"""
-    if 'alpha' in attrs:
-        new_attrs = translation_utils._fix_attribute_names(attrs, {'alpha' : 'slope'})
-    else:
-        new_attrs = translation_utils._add_extra_attributes(attrs, {'slope': 0.01})
-    return 'LeakyReLU', new_attrs, inputs
-
-def _elu(attrs, inputs, proto_obj):
-    """Elu function"""
-    if 'alpha' in attrs:
-        new_attrs = translation_utils._fix_attribute_names(attrs, {'alpha' : 'slope'})
-    else:
-        new_attrs = translation_utils._add_extra_attributes(attrs, {'slope': 1.0})
-    new_attrs = translation_utils._add_extra_attributes(new_attrs, {'act_type': 'elu'})
-    return 'LeakyReLU', new_attrs, inputs
-
-def _prelu(attrs, inputs, proto_obj):
-    """PRelu function"""
-    new_attrs = translation_utils._add_extra_attributes(attrs, {'act_type': 'prelu'})
-    return 'LeakyReLU', new_attrs, inputs
-
-def _selu(attrs, inputs, proto_obj):
-    """Selu function"""
-    new_attrs = translation_utils._add_extra_attributes(attrs, {'act_type': 'selu'})
-    return 'LeakyReLU', new_attrs, inputs
-
-def softmax(attrs, inputs, proto_obj):
-    """Softmax function."""
-    if 'axis' not in attrs:
-        attrs = translation_utils._add_extra_attributes(attrs, {'axis': 1})
-    return 'softmax', attrs, inputs
-
-def log_softmax(attrs, inputs, proto_obj):
-    """Computes the log softmax of the input. This is equivalent to
-    computing softmax followed by log."""
-    return 'log_softmax', attrs, inputs
-
-def softplus(attrs, inputs, proto_obj):
-    """Applies the sofplus activation function element-wise to the input."""
-    new_attrs = translation_utils._add_extra_attributes(attrs, {'act_type' : 'softrelu'})
-    return 'Activation', new_attrs, inputs
-
-def conv(attrs, inputs, proto_obj):
-    """Compute N-D convolution on (N+2)-D input."""
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'kernel_shape' : 'kernel',
-                                                               'strides' : 'stride',
-                                                               'pads': 'pad',
-                                                               'dilations': 'dilate',
-                                                               'group': 'num_group'})
-    new_attrs = translation_utils._add_extra_attributes(new_attrs, {'num_group' : 1})
-    new_attrs = translation_utils._fix_bias('Convolution', new_attrs, len(inputs))
-
-    new_attrs = translation_utils._fix_channels('Convolution', new_attrs, inputs, proto_obj)
-    kernel = new_attrs['kernel']
-    stride = new_attrs['stride'] if 'stride' in new_attrs else []
-    padding = new_attrs['pad'] if 'pad' in new_attrs else []
-    dilations = new_attrs['dilate'] if 'dilate' in new_attrs else []
-    num_filter = new_attrs['num_filter']
-    num_group = new_attrs['num_group']
-    no_bias = new_attrs['no_bias'] if 'no_bias' in new_attrs else 0
-    bias = None if no_bias is True else inputs[2]
-
-    mxnet_pad = translation_utils._pad_sequence_fix(padding, kernel_dim=len(kernel))
-
-    left_pads = mxnet_pad[0::2]
-    right_pads = mxnet_pad[1::2]
-    is_pad_sym = left_pads == right_pads
-
-    if not is_pad_sym:
-        # Unlike ONNX, MXNet's convolution operator does not support asymmetric padding, so we first
-        # use 'Pad' operator, which supports asymmetric padding. Then use the convolution operator.
-        pad_width = (0, 0, 0, 0) + mxnet_pad
-        pad_op = symbol.pad(inputs[0], mode='constant', pad_width=pad_width)
-        conv_op = symbol.Convolution(pad_op, inputs[1], bias,
-                                     kernel=kernel, stride=stride, dilate=dilations,
-                                     num_filter=num_filter, num_group=num_group, no_bias=no_bias)
-    else:
-        pad_width = left_pads
-        conv_op = symbol.Convolution(inputs[0], inputs[1], bias,
-                                     kernel=kernel, stride=stride, dilate=dilations, pad=pad_width,
-                                     num_filter=num_filter, num_group=num_group, no_bias=no_bias)
-
-    return conv_op, new_attrs, inputs
-
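A small worked example of the symmetry check above (hypothetical 2-D values):

    # ONNX pads (x1_begin, x2_begin, x1_end, x2_end) = (0, 0, 1, 1)
    mxnet_pad = (0, 1, 0, 1)                 # per-axis (begin, end) after _pad_sequence_fix
    left_pads, right_pads = mxnet_pad[0::2], mxnet_pad[1::2]
    print(left_pads == right_pads)           # False -> pad explicitly before Convolution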
-def deconv(attrs, inputs, proto_obj):
-    """Computes transposed convolution of the input tensor."""
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'kernel_shape' : 'kernel',
-                                                               'strides' : 'stride',
-                                                               'pads': 'pad',
-                                                               'dilations': 'dilate',
-                                                               'group': 'num_group'})
-    new_attrs = translation_utils._add_extra_attributes(new_attrs, {'num_group' : 1})
-    new_attrs = translation_utils._fix_bias('Deconvolution', new_attrs, len(inputs))
-
-    new_attrs = translation_utils._fix_channels('Deconvolution', new_attrs, inputs, proto_obj)
-    kernel = new_attrs['kernel'] if 'kernel' in new_attrs else []
-    stride = new_attrs['stride'] if 'stride' in new_attrs else []
-    padding = new_attrs['pad'] if 'pad' in new_attrs else []
-    dilations = new_attrs['dilate'] if 'dilate' in new_attrs else []
-    num_filter = new_attrs['num_filter']
-    num_group = new_attrs['num_group']
-    no_bias = new_attrs['no_bias'] if 'no_bias' in new_attrs else False
-    bias = None if no_bias is True else inputs[2]
-
-    # Unlike ONNX, MXNet's deconvolution operator does not support asymmetric padding, so we first
-    # use 'Pad' operator, which supports asymmetric padding. Then use the deconvolution operator.
-    pad_width = (0, 0, 0, 0) + translation_utils._pad_sequence_fix(padding, kernel_dim=len(kernel))
-    pad_op = symbol.pad(inputs[0], mode='constant', pad_width=pad_width)
-
-    deconv_op = symbol.Deconvolution(pad_op, inputs[1], bias,
-                                     kernel=kernel, stride=stride, dilate=dilations,
-                                     num_filter=num_filter, num_group=num_group, no_bias=no_bias)
-
-    return deconv_op, new_attrs, inputs
-
-def fully_connected(attrs, inputs, proto_obj):
-    """Applies a linear transformation: Y=XWT+b."""
-    new_attrs = translation_utils._remove_attributes(attrs, ['axis'])
-
-    new_attrs = translation_utils._fix_bias('FullyConnected', new_attrs, len(inputs))
-
-    new_attrs = translation_utils._fix_channels('FullyConnected', new_attrs, inputs, proto_obj)
-
-    return 'FullyConnected', new_attrs, inputs
-
-
-def global_maxpooling(attrs, inputs, proto_obj):
-    """Performs max pooling on the input."""
-    new_attrs = translation_utils._add_extra_attributes(attrs, {'global_pool': True,
-                                                                'kernel': (1, 1),
-                                                                'pool_type': 'max'})
-    return 'Pooling', new_attrs, inputs
-
-
-def global_avgpooling(attrs, inputs, proto_obj):
-    """Performs avg pooling on the input."""
-    new_attrs = translation_utils._add_extra_attributes(attrs, {'global_pool': True,
-                                                                'kernel': (1, 1),
-                                                                'pool_type': 'avg'})
-    return 'Pooling', new_attrs, inputs
-
-def global_lppooling(attrs, inputs, proto_obj):
-    """Performs global lp pooling on the input."""
-    p_value = attrs.get('p', 2)
-    new_attrs = translation_utils._add_extra_attributes(attrs, {'global_pool': True,
-                                                                'kernel': (1, 1),
-                                                                'pool_type': 'lp',
-                                                                'p_value': p_value})
-    new_attrs = translation_utils._remove_attributes(new_attrs, ['p'])
-    return 'Pooling', new_attrs, inputs
-
-def linalg_gemm(attrs, inputs, proto_obj):
-    """Performs general matrix multiplication and accumulation"""
-    trans_a = 0
-    trans_b = 0
-    alpha = 1
-    beta = 1
-    if 'transA' in attrs:
-        trans_a = attrs['transA']
-    if 'transB' in attrs:
-        trans_b = attrs['transB']
-    if 'alpha' in attrs:
-        alpha = attrs['alpha']
-    if 'beta' in attrs:
-        beta = attrs['beta']
-    flatten_a = symbol.flatten(inputs[0])
-    matmul_op = symbol.linalg_gemm2(A=flatten_a, B=inputs[1],
-                                    transpose_a=trans_a, transpose_b=trans_b,
-                                    alpha=alpha)
-    gemm_op = symbol.broadcast_add(matmul_op, beta*inputs[2])
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'transA': 'transpose_a',
-                                                               'transB': 'transpose_b'})
-    new_attrs = translation_utils._remove_attributes(new_attrs, ['broadcast'])
-    return gemm_op, new_attrs, inputs
-
-def local_response_norm(attrs, inputs, proto_obj):
-    """Local Response Normalization."""
-    new_attrs = translation_utils._fix_attribute_names(attrs,
-                                                       {'bias': 'knorm',
-                                                        'size' : 'nsize'})
-    return 'LRN', new_attrs, inputs
-
-def dropout(attrs, inputs, proto_obj):
-    """Dropout Regularization."""
-    mode = 'training'
-    opset_version = proto_obj.opset_version
-    if 'is_test' in attrs and attrs['is_test'] == 0:
-        mode = 'always'
-    new_attrs = translation_utils._remove_attributes(attrs, ['is_test'])
-    new_attrs = translation_utils._add_extra_attributes(new_attrs, {'mode': mode})
-    if opset_version >= 12:
-        new_attrs = translation_utils._remove_attributes(new_attrs, ['seed'])
-        if len(inputs) == 2:
-            ratio_float = proto_obj._params[inputs[1].name].asnumpy()[0]
-            new_attrs = translation_utils._remove_attributes(new_attrs, ['p'])
-            new_attrs = translation_utils._add_extra_attributes(new_attrs, {'p': ratio_float})
-        elif len(inputs) == 1:
-            new_attrs = translation_utils._fix_attribute_names(new_attrs, {'ratio': 'p'})
-        return 'Dropout', new_attrs, inputs[0]
-    else:
-        new_attrs = translation_utils._fix_attribute_names(new_attrs, {'ratio': 'p'})
-    return 'Dropout', new_attrs, inputs
-
-# Changing shape and type.
-def reshape(attrs, inputs, proto_obj):
-    """Reshape the given array by the shape attribute."""
-    if len(inputs) == 1:
-        return 'reshape', attrs, inputs[0]
-    reshape_shape = list(proto_obj._params[inputs[1].name].asnumpy())
-    reshape_shape = [int(i) for i in reshape_shape]
-    new_attrs = {'shape': reshape_shape}
-    return 'reshape', new_attrs, inputs[:1]
-
-def cast(attrs, inputs, proto_obj):
-    """ Cast input to a given dtype"""
-    try:
-        from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE
-    except ImportError:
-        raise ImportError("Onnx and protobuf need to be installed. "
-                          + "Instructions to install - https://github.com/onnx/onnx")
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'to' : 'dtype'})
-    new_attrs['dtype'] = TENSOR_TYPE_TO_NP_TYPE[int(new_attrs['dtype'])]
-    return 'cast', new_attrs, inputs
-
-def split(attrs, inputs, proto_obj):
-    """Splits an array along a particular axis into multiple sub-arrays."""
-    split_list = attrs.get('split', [])
-    new_attrs = translation_utils._fix_attribute_names(attrs,
-                                                       {'split' : 'num_outputs'})
-    if 'axis' not in attrs:
-        new_attrs = translation_utils._add_extra_attributes(new_attrs, {'axis': 0})
-
-    if not split_list:
-        num_outputs = len(proto_obj.model_metadata.get('output_tensor_data'))
-    else:
-        if len(set(split_list)) == 1:
-            num_outputs = len(split_list)
-        else:
-            raise NotImplementedError("Operator {} in MXNet does not support variable splits."
-                                      "Tracking the issue to support variable split here: "
-                                      "https://github.com/apache/incubator-mxnet/issues/11594"
-                                      .format('split'))
-
-    new_attrs['num_outputs'] = num_outputs
-    return 'split', new_attrs, inputs
-
-def _slice(attrs, inputs, proto_obj):
-    """Returns a slice of the input tensor along multiple axes."""
-    input_tensor_data = proto_obj.model_metadata.get('input_tensor_data')[0]
-    input_shape = input_tensor_data[1]
-
-    if proto_obj.opset_version >= 10:
-        begin = proto_obj._params[inputs[1].name].asnumpy()
-        end = proto_obj._params[inputs[2].name].asnumpy()
-        if len(inputs) >= 4:
-            axes = list(proto_obj._params[inputs[3].name].asnumpy())
-            axes = tuple([int(i) for i in axes])
-        else:
-            axes = tuple(range(len(begin)))
-        new_attrs = translation_utils._add_extra_attributes(attrs, {'axes' : axes,
-                                                                    'begin' : begin,
-                                                                    'end' : end
-                                                                   })
-    else:
-        new_attrs = translation_utils._fix_attribute_names(attrs,
-                                                           {'axes' : 'axis',
-                                                            'ends' : 'end',
-                                                            'starts' : 'begin'})
-        # ONNX Slice supports slicing along multiple axes at once; MXNet's slice_axis
-        # handles one axis, so chain one slice_axis operator per axis.
-        begin = new_attrs.get('begin')
-        end = list(new_attrs.get('end'))
-        axes = new_attrs.get('axis', tuple(range(len(begin))))
-
-    for i, axis in enumerate(axes):
-        end[i] = None if end[i] >= input_shape[axis] else end[i]
-    slice_op = symbol.slice_axis(inputs[0], axis=axes[0], begin=begin[0], end=end[0])
-    if len(axes) > 1:
-        # the first axis was already sliced above, so only apply the remaining axes
-        for i, axis in enumerate(axes[1:], start=1):
-            slice_op = symbol.slice_axis(slice_op, axis=axis, begin=begin[i], end=end[i])
-    return slice_op, new_attrs, inputs
-
-def transpose(attrs, inputs, proto_obj):
-    """Transpose the input array."""
-    new_attrs = translation_utils._fix_attribute_names(attrs,
-                                                       {'perm' : 'axes'})
-    return 'transpose', new_attrs, inputs
-
-def squeeze(attrs, inputs, proto_obj):
-    """Remove single-dimensional entries from the shape of a tensor."""
-    new_attrs = translation_utils._fix_attribute_names(attrs,
-                                                       {'axes' : 'axis'})
-    return 'squeeze', new_attrs, inputs
-
-def unsqueeze(attrs, inputs, proto_obj):
-    """Inserts a new axis of size 1 into the array shape"""
-    # MXNet can only add one axis at a time.
-    mxnet_op = inputs[0]
-    for axis in attrs["axes"]:
-        mxnet_op = symbol.expand_dims(mxnet_op, axis=axis)
-
-    return mxnet_op, attrs, inputs
-
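The one-axis-at-a-time expansion can be sketched with NumPy (shape and axes are illustrative):

    import numpy as np

    x = np.zeros((3, 4))
    for axis in (0, 3):                  # ONNX Unsqueeze 'axes'
        x = np.expand_dims(x, axis=axis)
    print(x.shape)                       # (1, 3, 4, 1)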
-def flatten(attrs, inputs, proto_obj):
-    """Flattens the input array into a 2-D array by collapsing the higher dimensions."""
-    # MXNet's Flatten has no axis attribute; it always flattens from axis=1.
-    if 'axis' in attrs and attrs['axis'] != 1:
-        raise RuntimeError("Flatten operator only supports axis=1")
-    new_attrs = translation_utils._remove_attributes(attrs, ['axis'])
-    return 'Flatten', new_attrs, inputs
-
-def clip(attrs, inputs, proto_obj):
-    """Clips (limits) the values in an array."""
-    opset_version = proto_obj.opset_version
-    if opset_version >= 11:
-        if len(inputs) == 1:
-            new_attrs = translation_utils._add_extra_attributes(attrs, {'a_max' : np.inf,
-                                                                        'a_min' : -np.inf})
-        elif len(inputs) == 2:
-            min_float = proto_obj._params[inputs[1].name].asnumpy()
-            new_attrs = translation_utils._add_extra_attributes(attrs, {'a_min': min_float[0],
-                                                                        'a_max': np.inf})
-        elif len(inputs) == 3:
-            min_float = proto_obj._params[inputs[1].name].asnumpy()
-            max_float = proto_obj._params[inputs[2].name].asnumpy()
-            new_attrs = translation_utils._add_extra_attributes(attrs, {'a_min': min_float[0],
-                                                                        'a_max': max_float[0]})
-    else:
-        new_attrs = translation_utils._fix_attribute_names(attrs, {'min' : 'a_min',
-                                                                   'max' : 'a_max'})
-        if 'a_max' not in new_attrs:
-            new_attrs = translation_utils._add_extra_attributes(new_attrs, {'a_max' : np.inf})
-        if 'a_min' not in new_attrs:
-            new_attrs = translation_utils._add_extra_attributes(new_attrs, {'a_min' : -np.inf})
-    return 'clip', new_attrs, inputs[0]
-
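The +/-inf defaults above mirror NumPy's clip semantics when only one bound is given (values are illustrative):

    import numpy as np

    x = np.array([-2.0, 0.5, 3.0])
    print(np.clip(x, -np.inf, 1.0))   # only a_max given -> [-2.   0.5  1. ]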
-def gather(attrs, inputs, proto_obj):
-    """Gather elements from an input array along the given axis."""
-    return 'take', attrs, inputs
-
-#Powers
-def reciprocal(attrs, inputs, proto_obj):
-    """Returns the reciprocal of the argument, element-wise."""
-    return 'reciprocal', attrs, inputs
-
-def squareroot(attrs, inputs, proto_obj):
-    """Returns element-wise square-root value of the input."""
-    return 'sqrt', attrs, inputs
-
-def power(attrs, inputs, proto_obj):
-    """Returns element-wise result of base element raised to powers from exp element."""
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'exponent':'exp'})
-    if 'broadcast' in attrs:
-        new_attrs = translation_utils._remove_attributes(new_attrs, ['broadcast'])
-        if attrs['broadcast'] == 1:
-            return 'broadcast_power', new_attrs, inputs
-        else:
-            mxnet_op = symbol.pow(inputs[0], inputs[1])
-            return mxnet_op, new_attrs, inputs
-    mxnet_op = symbol.broadcast_power(inputs[0], inputs[1])
-    return mxnet_op, new_attrs, inputs
-
-def exponent(attrs, inputs, proto_obj):
-    """Elementwise exponent of input array."""
-    return 'exp', attrs, inputs
-
-def _cos(attrs, inputs, proto_obj):
-    """Elementwise cosine of input array."""
-    return 'cos', attrs, inputs
-
-def _sin(attrs, inputs, proto_obj):
-    """Elementwise sine of input array."""
-    return 'sin', attrs, inputs
-
-def _tan(attrs, inputs, proto_obj):
-    """Elementwise tan of input array."""
-    return 'tan', attrs, inputs
-
-def arccos(attrs, inputs, proto_obj):
-    """Elementwise inverse cos of input array."""
-    return 'arccos', attrs, inputs
-
-def arcsin(attrs, inputs, proto_obj):
-    """Elementwise inverse sin of input array."""
-    return 'arcsin', attrs, inputs
-
-def arctan(attrs, inputs, proto_obj):
-    """Elementwise inverse tan of input array."""
-    return 'arctan', attrs, inputs
-
-def _log(attrs, inputs, proto_obj):
-    """Elementwise log of input array."""
-    return 'log', attrs, inputs
-
-# Reduce Functions
-def reduce_max(attrs, inputs, proto_obj):
-    """Reduce the array along a given axis by maximum value"""
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'axes':'axis'})
-    return 'max', new_attrs, inputs
-
-def reduce_mean(attrs, inputs, proto_obj):
-    """Reduce the array along a given axis by mean value"""
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'axes':'axis'})
-    return 'mean', new_attrs, inputs
-
-def reduce_min(attrs, inputs, proto_obj):
-    """Reduce the array along a given axis by minimum value"""
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'axes':'axis'})
-    return 'min', new_attrs, inputs
-
-def reduce_sum(attrs, inputs, proto_obj):
-    """Reduce the array along a given axis by sum value"""
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'axes':'axis'})
-    return 'sum', new_attrs, inputs
-
-def reduce_prod(attrs, inputs, proto_obj):
-    """Reduce the array along a given axis by product value"""
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'axes':'axis'})
-    return 'prod', new_attrs, inputs
-
-def reduce_log_sum(attrs, inputs, proto_obj):
-    """Reduce the array along a given axis by log sum value"""
-    keep_dims = attrs.get('keepdims', True)
-    sum_op = symbol.sum(inputs[0], axis=attrs.get('axes'),
-                        keepdims=keep_dims)
-    log_sym = symbol.log(sum_op)
-    return log_sym, attrs, inputs
-
-def reduce_log_sum_exp(attrs, inputs, proto_obj):
-    """Reduce the array along a given axis by log sum exp value"""
-    keep_dims = attrs.get('keepdims', True)
-    exp_op = symbol.exp(inputs[0])
-    sum_op = symbol.sum(exp_op, axis=attrs.get('axes'),
-                        keepdims=keep_dims)
-    log_sym = symbol.log(sum_op)
-    return log_sym, attrs, inputs
-
-def reduce_sum_square(attrs, inputs, proto_obj):
-    """Reduce the array along a given axis by sum square value"""
-    square_op = symbol.square(inputs[0])
-    sum_op = symbol.sum(square_op, axis=attrs.get('axes'),
-                        keepdims=attrs.get('keepdims'))
-    return sum_op, attrs, inputs
-
-def reduce_l1(attrs, inputs, proto_obj):
-    """Reduce input tensor by l1 normalization."""
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'axes':'axis'})
-    new_attrs = translation_utils._add_extra_attributes(new_attrs,
-                                                        {'ord' : 1})
-    return 'norm', new_attrs, inputs
-
-def shape(attrs, inputs, proto_obj):
-    """Returns shape of input array."""
-    return 'shape_array', attrs, inputs
-
-def size(attrs, inputs, proto_obj):
-    """Returns array containing size of data."""
-    return "size_array", attrs, inputs
-
-def reduce_l2(attrs, inputs, proto_obj):
-    """Reduce input tensor by l2 normalization."""
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'axes':'axis'})
-    return 'norm', new_attrs, inputs
-
-def avg_pooling(attrs, inputs, proto_obj):
-    """ Average pooling"""
-    new_attrs = translation_utils._fix_attribute_names(attrs,
-                                                       {'kernel_shape': 'kernel',
-                                                        'strides': 'stride',
-                                                        'pads': 'pad',
-                                                       })
-    new_attrs = translation_utils._add_extra_attributes(new_attrs,
-                                                        {'pooling_convention': 'valid'
-                                                        })
-    new_op = translation_utils._fix_pooling('avg', inputs, new_attrs)
-
-    return new_op, new_attrs, inputs
-
-def lp_pooling(attrs, inputs, proto_obj):
-    """LP Pooling"""
-    p_value = attrs.get('p', 2)
-    new_attrs = translation_utils._fix_attribute_names(attrs,
-                                                       {'kernel_shape': 'kernel',
-                                                        'strides': 'stride',
-                                                        'pads': 'pad'
-                                                       })
-    new_attrs = translation_utils._remove_attributes(new_attrs, ['p'])
-    new_attrs = translation_utils._add_extra_attributes(new_attrs,
-                                                        {'pooling_convention': 'valid',
-                                                         'p_value': p_value
-                                                        })
-    new_op = translation_utils._fix_pooling('lp', inputs, new_attrs)
-    return new_op, new_attrs, inputs
-
-def max_pooling(attrs, inputs, proto_obj):
-    """ Average pooling"""
-    new_attrs = translation_utils._fix_attribute_names(attrs,
-                                                       {'kernel_shape': 'kernel',
-                                                        'strides': 'stride',
-                                                        'pads': 'pad',
-                                                       })
-
-    new_attrs = translation_utils._add_extra_attributes(new_attrs,
-                                                        {'pooling_convention': 'valid'
-                                                        })
-    new_op = translation_utils._fix_pooling('max', inputs, new_attrs)
-
-    return new_op, new_attrs, inputs
-
-def max_roi_pooling(attrs, inputs, proto_obj):
-    """Max ROI Pooling."""
-    new_attrs = translation_utils._fix_attribute_names(attrs,
-                                                       {'pooled_shape': 'pooled_size',
-                                                        'spatial_scale': 'spatial_scale'
-                                                       })
-    return 'ROIPooling', new_attrs, inputs
-
-def depthtospace(attrs, inputs, proto_obj):
-    """Rearranges data from depth into blocks of spatial data."""
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'blocksize':'block_size'})
-
-    return "depth_to_space", new_attrs, inputs
-
-def spacetodepth(attrs, inputs, proto_obj):
-    """Rearranges blocks of spatial data into depth."""
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'blocksize':'block_size'})
-
-    return "space_to_depth", new_attrs, inputs
-
-def hardmax(attrs, inputs, proto_obj):
-    """Returns batched one-hot vectors."""
-    input_tensor_data = proto_obj.model_metadata.get('input_tensor_data')[0]
-    input_shape = input_tensor_data[1]
-
-    axis = int(attrs.get('axis', 1))
-    axis = axis if axis >= 0 else len(input_shape) + axis
-
-    if axis == len(input_shape) - 1:
-        amax = symbol.argmax(inputs[0], axis=-1)
-        one_hot = symbol.one_hot(amax, depth=input_shape[-1])
-        return one_hot, attrs, inputs
-
-    # since reshape doesn't take a tensor for shape,
-    # computing with np.prod. This needs to be changed
-    # to use mx.sym.prod() when mx.sym.reshape() is fixed.
-    # (https://github.com/apache/incubator-mxnet/issues/10789)
-    new_shape = (int(np.prod(input_shape[:axis])),
-                 int(np.prod(input_shape[axis:])))
-    reshape_op = symbol.reshape(inputs[0], new_shape)
-    amax = symbol.argmax(reshape_op, axis=-1)
-    one_hot = symbol.one_hot(amax, depth=new_shape[-1])
-    hardmax_op = symbol.reshape(one_hot, input_shape)
-    return hardmax_op, attrs, inputs
-
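The reshape/argmax/one-hot trick above can be checked with a plain NumPy sketch (shapes are illustrative):

    import numpy as np

    x = np.random.rand(2, 3, 4)
    axis = 1
    new_shape = (int(np.prod(x.shape[:axis])), int(np.prod(x.shape[axis:])))
    flat = x.reshape(new_shape)              # collapse to 2-D at the hardmax axis
    one_hot = np.eye(new_shape[-1])[flat.argmax(axis=-1)]
    result = one_hot.reshape(x.shape)        # exactly one 1 per row of the flattened view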
-def lpnormalization(attrs, inputs, proto_obj):
-    """ONNX does not have eps attribute, so cannot map it to L2normalization in MXNet
-     without that, it works as norm operator discussion in PR:
-     https://github.com/onnx/onnx/pull/1330"""
-    new_attrs = translation_utils._fix_attribute_names(attrs, {'p': 'ord'})
-    axis = int(attrs.get("axis", -1))
-    new_attrs.update(axis=axis)
-    return 'norm', new_attrs, inputs
-
-
-def topk(attrs, inputs, proto_obj):
-    """Returns the top k elements in an input array along the given axis."""
-    new_attrs = translation_utils._add_extra_attributes(attrs,
-                                                        {'ret_typ': 'both',
-                                                         'dtype': 'int64'})
-    opset_version = proto_obj.opset_version
-    if opset_version >= 10:
-        k_vals = proto_obj._params[inputs[1].name].asnumpy()
-        new_attrs = translation_utils._add_extra_attributes(new_attrs, {'k': k_vals})
-        return 'topk', new_attrs, inputs[0]
-    else:
-        return 'topk', new_attrs, inputs
diff --git a/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py b/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py
deleted file mode 100644
index 376d72d..0000000
--- a/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# coding: utf-8
-"""Utilities used for translating operators from Onnx to Mxnet."""
-# pylint: disable=protected-access
-from .... import symbol
-
-
-def _fix_attribute_names(attrs, change_map):
-    """
-    Change attribute names as per values in the change_map dictionary.
-
-    :param attrs: dict of operator attributes
-    :param change_map: dict mapping onnx attribute names to mxnet attribute names
-
-    :return new_attr: converted dict of operator attributes
-    """
-    new_attr = {}
-    for k in attrs.keys():
-        if k in change_map:
-            new_attr[change_map[k]] = attrs[k]
-        else:
-            new_attr[k] = attrs[k]
-    return new_attr
-
-def _remove_attributes(attrs, remove_list):
-    """
-    Removes attributes in the remove list from the input attribute dict
-    :param attrs : Dict of operator attributes
-    :param remove_list : list of attributes to be removed
-
-    :return new_attr : Dict of operator attributes without the listed attributes.
-    """
-    new_attrs = {}
-    for attr in attrs.keys():
-        if attr not in remove_list:
-            new_attrs[attr] = attrs[attr]
-    return new_attrs
-
-def _add_extra_attributes(attrs, extra_attr_map):
-    """
-    :param attrs: current attribute dict
-    :param extra_attr_map: additional attributes to be added
-    :return new_attr: attribute dict with the extra attributes merged in
-    """
-    for attr in extra_attr_map:
-        if attr not in attrs:
-            attrs[attr] = extra_attr_map[attr]
-    return attrs
-
-
-def _pad_sequence_fix(attr, kernel_dim=None):
-    """Changing onnx's pads sequence to match with mxnet's pad_width
-    mxnet: (x1_begin, x1_end, ... , xn_begin, xn_end)
-    onnx: (x1_begin, x2_begin, ... , x1_end, x2_end, ...)"""
-    new_attr = ()
-    if len(attr) % 2 == 0:
-        for index in range(int(len(attr) / 2)):
-            new_attr = new_attr + attr[index::int(len(attr) / 2)]
-        # Make sure pad values are present in the attr for all axes.
-        if kernel_dim is not None:
-            while len(new_attr) < kernel_dim*2:
-                new_attr = new_attr + (0, 0)
-
-    return new_attr
-
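A worked example of the reordering above for a 2-D kernel (hypothetical values):

    # ONNX pads:       (x1_begin, x2_begin, x1_end, x2_end) = (1, 2, 3, 4)
    # MXNet pad_width: (x1_begin, x1_end, x2_begin, x2_end) = (1, 3, 2, 4)
    attr = (1, 2, 3, 4)
    half = len(attr) // 2
    print(attr[0::half] + attr[1::half])   # (1, 3, 2, 4)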
-
-def _fix_pooling(pool_type, inputs, new_attr):
-    """onnx pooling operator supports asymmetrical padding
-    Adding pad operator before pooling in mxnet to work with onnx"""
-    stride = new_attr.get('stride')
-    kernel = new_attr.get('kernel')
-    padding = new_attr.get('pad')
-    p_value = new_attr.get('p_value')
-
-    # Adding default stride.
-    if stride is None:
-        stride = (1,) * len(kernel)
-
-    # Add padding attr if not provided.
-    if padding is None:
-        padding = (0,) * len(kernel) * 2
-
-    # MXNet's Pad operator supports only 4D/5D tensors.
-    # For the 1D case, these are the steps:
-    #    Step 1. Add an extra dummy dimension (at axis=2) to make the tensor 4D.
-    #    Step 2. Apply padding to this expanded tensor.
-    #    Step 3. Remove the extra dimension added in step 1.
-    if len(kernel) == 1:
-        dummy_axis = 2
-        # setting 0 padding to the new dim to be added.
-        padding = (0, padding[0], 0, padding[1])
-        pad_width = (0, 0, 0, 0) + _pad_sequence_fix(padding, kernel_dim=2)
-
-        # Step 1.
-        curr_sym = symbol.expand_dims(inputs[0], axis=dummy_axis)
-
-        # Step 2. Common for all tensor sizes
-        new_pad_op = symbol.pad(curr_sym, mode='edge', pad_width=pad_width)
-
-        # Step 3: Removing extra dim added.
-        new_pad_op = symbol.split(new_pad_op, axis=dummy_axis, num_outputs=1, squeeze_axis=1)
-    else:
-        # For 2D/3D cases:
-        # Apply padding
-        pad_width = (0, 0, 0, 0) + _pad_sequence_fix(padding, kernel_dim=len(kernel))
-        curr_sym = inputs[0]
-
-        if pool_type == 'max':
-            # For max pool use mode='edge': replicate the edge values when padding,
-            # so that only input data values are included when computing 'max'.
-            new_pad_op = symbol.pad(curr_sym, mode='edge', pad_width=pad_width)
-        else:
-            # For avg pool, pad with zeros, so mode='constant'.
-            new_pad_op = symbol.pad(curr_sym, mode='constant', pad_width=pad_width)
-
-    # Apply pooling without pads.
-    if pool_type == 'lp':
-        new_pooling_op = symbol.Pooling(new_pad_op, pool_type=pool_type, stride=stride, kernel=kernel, p_value=p_value)
-    else:
-        new_pooling_op = symbol.Pooling(new_pad_op, pool_type=pool_type, stride=stride, kernel=kernel)
-    return new_pooling_op
-
-def _fix_bias(op_name, attrs, num_inputs):
-    """A workaround for 'use_bias' attribute since onnx don't provide this attribute,
-    we have to check the number of inputs to decide it."""
-    if num_inputs == 3:
-        attrs['no_bias'] = False
-    elif num_inputs == 2:
-        attrs['no_bias'] = True
-    else:
-        raise ValueError("Unexpected number of inputs for: {}".format(op_name))
-    return attrs
-
-def _fix_channels(op_name, attrs, inputs, proto_obj):
-    """A workaround for getting 'channels' or 'units' since onnx don't provide
-    these attributes. We check the shape of weights provided to get the number.
-    """
-    weight_name = inputs[1].name
-    if weight_name not in proto_obj._params:
-        raise ValueError("Unable to get channels/units attr from onnx graph.")
-
-    wshape = proto_obj._params[weight_name].shape
-    assert len(wshape) >= 2, "Weights shape is invalid: {}".format(wshape)
-
-    if op_name == 'FullyConnected':
-        attrs['num_hidden'] = wshape[0]
-    else:
-        if op_name == 'Convolution':
-            # Conv weight shape: (M x C x kH x kW), where M is the number of
-            # feature maps and C is the number of channels
-            attrs['num_filter'] = wshape[0]
-        elif op_name == 'Deconvolution':
-            # Weight shape for DeConv : (C x M x kH x kW) : M is number of
-            # feature maps/filters and C is number of channels
-            attrs['num_filter'] = wshape[1]
-    return attrs
-
-
-def _fix_gemm(op_name, inputs, old_attr, proto_obj):
-    """Using FullyConnected operator in place of linalg_gemm to perform same operation"""
-    op_sym = getattr(symbol, op_name, None)
-    alpha = float(old_attr.get('alpha', 1.0))
-    beta = float(old_attr.get('beta', 1.0))
-    trans_a = int(old_attr.get('transA', 0))
-    trans_b = int(old_attr.get('transB', 0))
-    if trans_a:
-        inputs[0] = symbol.transpose(inputs[0], axes=(1, 0))
-    if not trans_b:
-        inputs[1] = symbol.transpose(inputs[1], axes=(1, 0))
-    new_inputs = [alpha*inputs[0], inputs[1], beta*inputs[2]]
-    new_attr = {'num_hidden' : proto_obj._params[inputs[2].name].shape[0]}
-    return op_sym, new_attr, new_inputs
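
A NumPy sketch of the equivalence this rewrite relies on, for transA=transB=0 (shapes are illustrative):

    import numpy as np

    A, B, C = np.random.rand(2, 3), np.random.rand(3, 4), np.random.rand(4)
    alpha, beta = 1.5, 0.5
    gemm = alpha * (A @ B) + beta * C   # ONNX Gemm
    W = B.T                             # _fix_gemm pre-transposes B when transB == 0
    fc = (alpha * A) @ W.T + beta * C   # FullyConnected computes X @ W.T + b
    assert np.allclose(gemm, fc)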
diff --git a/python/mxnet/contrib/onnx/onnx2mx/import_model.py b/python/mxnet/contrib/onnx/onnx2mx/import_model.py
deleted file mode 100644
index d060b08..0000000
--- a/python/mxnet/contrib/onnx/onnx2mx/import_model.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# coding: utf-8
-"""Functions for importing ONNX models to MXNet and for checking metadata"""
-# pylint: disable=no-member
-
-from .import_onnx import GraphProto
-
-def import_model(model_file):
-    """Imports the ONNX model file, passed as a parameter, into MXNet symbol and parameters.
-    Operator support and coverage -
-    https://cwiki.apache.org/confluence/display/MXNET/ONNX+Operator+Coverage
-
-    Parameters
-    ----------
-    model_file : str
-        ONNX model file name
-
-    Returns
-    -------
-    sym : :class:`~mxnet.symbol.Symbol`
-        MXNet symbol object
-
-    arg_params : dict of ``str`` to :class:`~mxnet.ndarray.NDArray`
-        Dict of converted parameters stored in ``mxnet.ndarray.NDArray`` format
-
-    aux_params : dict of ``str`` to :class:`~mxnet.ndarray.NDArray`
-        Dict of converted parameters stored in ``mxnet.ndarray.NDArray`` format
-
-    Notes
-    -----
-    This method is available when you ``import mxnet.contrib.onnx``
-
-    """
-    graph = GraphProto()
-
-    try:
-        import onnx
-    except ImportError:
-        raise ImportError("Onnx and protobuf need to be installed. "
-                          + "Instructions to install - https://github.com/onnx/onnx")
-    # loads model file and returns ONNX protobuf object
-    model_proto = onnx.load_model(model_file)
-    model_opset_version = max([x.version for x in model_proto.opset_import])
-    sym, arg_params, aux_params = graph.from_onnx(model_proto.graph, opset_version=model_opset_version)
-    return sym, arg_params, aux_params
-
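For reference, typical usage of this (now removed) API looked like the following ('model.onnx' is a placeholder path, and the snippet assumes a pre-2.0 MXNet with the contrib package):

    import mxnet as mx

    sym, arg_params, aux_params = mx.contrib.onnx.import_model('model.onnx')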
-def get_model_metadata(model_file):
-    """
-    Returns the name and shape information of input and output tensors of the given ONNX model file.
-
-    Notes
-    -----
-    This method is available when you ``import mxnet.contrib.onnx``
-
-    Parameters
-    ----------
-    model_file : str
-        ONNX model file name
-
-    Returns
-    -------
-    model_metadata : dict
-        A dictionary object mapping various metadata to its corresponding value.
-        The dictionary will have the following template::
-
-          'input_tensor_data' : list of tuples representing the shape of the input parameters
-          'output_tensor_data' : list of tuples representing the shape of the output of the model
-    """
-    graph = GraphProto()
-
-    try:
-        import onnx
-    except ImportError:
-        raise ImportError("Onnx and protobuf need to be installed. "
-                          + "Instructions to install - https://github.com/onnx/onnx")
-    model_proto = onnx.load_model(model_file)
-    metadata = graph.get_graph_metadata(model_proto.graph)
-    return metadata
diff --git a/python/mxnet/contrib/onnx/onnx2mx/import_onnx.py b/python/mxnet/contrib/onnx/onnx2mx/import_onnx.py
deleted file mode 100644
index c2be83d..0000000
--- a/python/mxnet/contrib/onnx/onnx2mx/import_onnx.py
+++ /dev/null
@@ -1,230 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# coding: utf-8
-# pylint: disable=invalid-name,too-many-locals,no-self-use
-""" Support import export formats."""
-import numpy as np
-from .... import symbol
-from .... import ndarray as nd
-from ....base import string_types
-from ._import_helper import _convert_map as convert_map
-
-class GraphProto(object): # pylint: disable=too-few-public-methods
-    """A helper class for handling mxnet symbol copying from pb2.GraphProto.
-    Definition: https://github.com/onnx/onnx/blob/master/onnx/onnx.proto
-    """
-    def __init__(self):
-        self._nodes = {}
-        self._params = {}
-        self._num_input = 0
-        self._num_param = 0
-        self.aux_dict = {}
-        self.arg_dict = {}
-        self.model_metadata = {}
-        self.opset_version = 0
-
-    def _convert_operator(self, node_name, op_name, attrs, inputs):
-        """Convert from onnx operator to mxnet operator.
-        The converter must specify conversions explicitly for incompatible name, and
-        apply handlers to operator attributes.
-
-        Parameters
-        ----------
-        :param node_name : str
-            name of the node to be translated.
-        :param op_name : str
-            Operator name, such as Convolution, FullyConnected
-        :param attrs : dict
-            Dict of operator attributes
-        :param inputs: list
-            list of inputs to the operator
-        Returns
-        -------
-        :return mxnet_sym
-            Converted mxnet symbol
-        """
-        if op_name in convert_map:
-            op_name, new_attrs, inputs = convert_map[op_name](attrs, inputs, self)
-        else:
-            raise NotImplementedError("Operator {} not implemented.".format(op_name))
-        if isinstance(op_name, string_types):
-            new_op = getattr(symbol, op_name, None)
-            if not new_op:
-                raise RuntimeError("Unable to map op_name {} to sym".format(op_name))
-            if node_name is None:
-                mxnet_sym = new_op(*inputs, **new_attrs)
-            else:
-                mxnet_sym = new_op(name=node_name, *inputs, **new_attrs)
-            return mxnet_sym
-        return op_name
-
-    def from_onnx(self, graph, opset_version):
-        """Construct symbol from onnx graph.
-
-        Parameters
-        ----------
-        graph : onnx protobuf object
-            The loaded onnx graph
-
-        Returns
-        -------
-        sym :symbol.Symbol
-            The returned mxnet symbol
-        params : dict
-            A dict of name: nd.array pairs, used as pretrained weights
-        """
-        self.opset_version = opset_version
-        # get input, output shapes
-        self.model_metadata = self.get_graph_metadata(graph)
-        # parse network inputs, aka parameters
-        for init_tensor in graph.initializer:
-            if not init_tensor.name.strip():
-                raise ValueError("Tensor's name is required.")
-            self._params[init_tensor.name] = self._parse_array(init_tensor)
-
-        # converting GraphProto message
-        for i in graph.input:
-            if i.name in self._params:
-                # i is a param instead of input
-                self._nodes[i.name] = symbol.Variable(name=i.name,
-                                                      shape=self._params[i.name].shape)
-            else:
-                self._nodes[i.name] = symbol.Variable(name=i.name)
-
-        # constructing nodes, nodes are stored as directed acyclic graph
-        # converting NodeProto message
-        for node in graph.node:
-            op_name = node.op_type
-            node_name = node.name.strip()
-            node_name = node_name if node_name else None
-            onnx_attr = self._parse_attr(node.attribute)
-            inputs = [self._nodes[i] for i in node.input]
-            mxnet_sym = self._convert_operator(node_name, op_name, onnx_attr, inputs)
-
-            for k, i in zip(list(node.output), range(len(mxnet_sym.list_outputs()))):
-                self._nodes[k] = mxnet_sym[i]
-
-            # splitting params into args and aux params
-            for args in mxnet_sym.list_arguments():
-                if args in self._params:
-                    self.arg_dict.update({args: nd.array(self._params[args])})
-            for aux in mxnet_sym.list_auxiliary_states():
-                if aux in self._params:
-                    self.aux_dict.update({aux: nd.array(self._params[aux])})
-
-        # now return the outputs
-        out = [self._nodes[i.name] for i in graph.output]
-        if len(out) > 1:
-            out = symbol.Group(out)
-        else:
-            out = out[0]
-        return out, self.arg_dict, self.aux_dict
-
-    def get_graph_metadata(self, graph):
-        """
-        Get the model metadata from a given onnx graph.
-        """
-        _params = set()
-        for tensor_vals in graph.initializer:
-            _params.add(tensor_vals.name)
-
-        input_data = []
-        for graph_input in graph.input:
-            if graph_input.name not in _params:
-                shape = [val.dim_value for val in graph_input.type.tensor_type.shape.dim]
-                input_data.append((graph_input.name, tuple(shape)))
-
-        output_data = []
-        for graph_out in graph.output:
-            shape = [val.dim_value for val in graph_out.type.tensor_type.shape.dim]
-            output_data.append((graph_out.name, tuple(shape)))
-        metadata = {'input_tensor_data' : input_data,
-                    'output_tensor_data' : output_data
-                   }
-        return metadata
-
-    def graph_to_gluon(self, graph, ctx, opset_version):
-        """Construct SymbolBlock from onnx graph.
-
-        Parameters
-        ----------
-        graph : onnx protobuf object
-            The loaded onnx graph
-        ctx : Context or list of Context
-            Loads the model into one or many context(s).
-
-        Returns
-        -------
-        sym_block :gluon.nn.SymbolBlock
-            The returned gluon SymbolBlock
-        """
-        sym, arg_params, aux_params = self.from_onnx(graph, opset_version)
-        metadata = self.get_graph_metadata(graph)
-        data_names = [input_tensor[0] for input_tensor in metadata['input_tensor_data']]
-        data_inputs = [symbol.var(data_name) for data_name in data_names]
-
-        from ....gluon import SymbolBlock
-        net = SymbolBlock(outputs=sym, inputs=data_inputs)
-        net_params = net.collect_params()
-        for param in arg_params:
-            if param in net_params:
-                net_params[param].shape = arg_params[param].shape
-                net_params[param]._load_init(arg_params[param], ctx=ctx)
-        for param in aux_params:
-            if param in net_params:
-                net_params[param].shape = aux_params[param].shape
-                net_params[param]._load_init(aux_params[param], ctx=ctx)
-        return net
-
-    def _parse_array(self, tensor_proto):
-        """Grab data in TensorProto and convert to numpy array."""
-        try:
-            from onnx.numpy_helper import to_array
-        except ImportError:
-            raise ImportError("Onnx and protobuf need to be installed. "
-                              + "Instructions to install - https://github.com/onnx/onnx")
-        if len(tuple(tensor_proto.dims)) > 0:
-            np_array = to_array(tensor_proto).reshape(tuple(tensor_proto.dims))
-        else:
-            # If onnx's params are scalar values without dims mentioned.
-            np_array = np.array([to_array(tensor_proto)])
-        return nd.array(np_array)
-
-    def _parse_attr(self, attr_proto):
-        """Convert a list of AttributeProto to a dict, with names as keys."""
-        attrs = {}
-        for a in attr_proto:
-            for f in ['f', 'i', 's']:
-                if a.HasField(f):
-                    attrs[a.name] = getattr(a, f)
-                    # Needed for supporting python version  > 3.5
-                    if isinstance(attrs[a.name], bytes):
-                        attrs[a.name] = attrs[a.name].decode(encoding='utf-8')
-            for f in ['floats', 'ints', 'strings']:
-                if list(getattr(a, f)):
-                    assert a.name not in attrs, "Only one type of attr is allowed"
-                    attrs[a.name] = tuple(getattr(a, f))
-            for f in ['t', 'g']:
-                if a.HasField(f):
-                    attrs[a.name] = getattr(a, f)
-            for f in ['tensors', 'graphs']:
-                if list(getattr(a, f)):
-                    raise NotImplementedError("Filed {} is not supported in mxnet.".format(f))
-            if a.name not in attrs:
-                raise ValueError("Cannot parse attribute: \n{}\n.".format(a))
-        return attrs
diff --git a/python/mxnet/contrib/onnx/onnx2mx/import_to_gluon.py b/python/mxnet/contrib/onnx/onnx2mx/import_to_gluon.py
deleted file mode 100644
index f6e1036..0000000
--- a/python/mxnet/contrib/onnx/onnx2mx/import_to_gluon.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# coding: utf-8
-"""Import ONNX model to gluon interface"""
-# pylint: disable=no-member
-
-from .import_onnx import GraphProto
-
-def import_to_gluon(model_file, ctx):
-    """
-    Imports the ONNX model files, passed as a parameter, into Gluon SymbolBlock object.
-
-    Parameters
-    ----------
-    model_file : str
-        ONNX model file name
-    ctx : Context or list of Context
-        Loads the model into one or many context(s).
-
-    Returns
-    -------
-    sym_block : :class:`~mxnet.gluon.SymbolBlock`
-        A SymbolBlock object representing the given model file.
-
-    Notes
-    -----
-    This method is available when you ``import mxnet.contrib.onnx``
-
-    """
-    graph = GraphProto()
-    try:
-        import onnx
-    except ImportError:
-        raise ImportError("Onnx and protobuf need to be installed. Instructions to"
-                          + " install - https://github.com/onnx/onnx#installation")
-    model_proto = onnx.load_model(model_file)
-    model_opset_version = max([x.version for x in model_proto.opset_import])
-    net = graph.graph_to_gluon(model_proto.graph, ctx, model_opset_version)
-    return net
diff --git a/python/mxnet/onnx/README.md b/python/mxnet/onnx/README.md
new file mode 100644
index 0000000..55518a7
--- /dev/null
+++ b/python/mxnet/onnx/README.md
@@ -0,0 +1,97 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# ONNX Export Support for MXNet
+
+### Overview
+[ONNX](https://onnx.ai/), or Open Neural Network Exchange, is an open source deep learning model format that acts as a framework neutral graph representation between DL frameworks or between training and inference. With the ability to export models to the ONNX format, MXNet users can enjoy faster inference and a wider range of deployment device choices, including edge and mobile devices where MXNet installation may be constrained. Popular hardware-accelerated and/or cross-platform ONNX r [...]
+
+### ONNX Versions Supported
+ONNX 1.7 & 1.8
+
+### Installation
+From the MXNet 1.9 release onward, the ONNX export module has been an official, built-in feature of MXNet. You can access the module at `mxnet.onnx`.
+
+If you are a user of earlier MXNet versions and do not want to upgrade MXNet, you can still enjoy the latest ONNX support by pulling the MXNet source code and building the wheel for only the mx2onnx module. Just do `cd python/mxnet/onnx` and then build the wheel with `python3 -m build`. You should be able to find the wheel under `python/mxnet/onnx/dist/mx2onnx-0.0.0-py3-none-any.whl` and install it with `pip install mx2onnx-0.0.0-py3-none-any.whl`. You can then access the module with `im [...]
+
+### APIs
+The main API is `export_model`, which, as the name suggests, exports an MXNet model to the ONNX format.
+
+```python
+mxnet.onnx.export_model(sym, params, in_shapes=None, in_types=np.float32,
+                 onnx_file_path='model.onnx', verbose=False, dynamic=False,
+                 dynamic_input_shapes=None, run_shape_inference=False, input_type=None,
+                 input_shape=None, large_model=False)
+```
+
+Parameters:
+
+    sym : str or symbol object
+        Path to the MXNet json file or Symbol object
+    params : str or dict or list of dict
+        str - Path to the MXNet params file
+        dict - MXNet params dictionary (Including both arg_params and aux_params)
+        list - list of length 2 that contains MXNet arg_params and aux_params
+    in_shapes : List of tuple
+        Input shape of the model e.g. [(1,3,224,224)]
+    in_types : data type or list of data types
+        Input data type e.g. np.float32, or [np.float32, np.int32]
+    onnx_file_path : str
+        Path where to save the generated onnx file
+    verbose : Boolean
+        If True will print logs of the model conversion
+    dynamic: Boolean
+        If True will allow for dynamic input shapes to the model
+    dynamic_input_shapes: list of tuple
+        Specifies the dynamic input_shapes. If None then all dimensions are set to None
+    run_shape_inference : Boolean
+        If True will run shape inference on the model
+    input_type : data type or list of data types
+        This is the old name of in_types. We keep this parameter name for backward compatibility
+    input_shape : List of tuple
+        This is the old name of in_shapes. We keep this parameter name for backward compatibility
+    large_model : Boolean
+        Whether to export a model that is larger than 2 GB. If true will save param tensors in separate
+        files along with .onnx model file. This feature is supported since onnx 1.8.0
+
+Returns:
+
+    onnx_file_path : str
+        Onnx file path
+
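+#### Basic Usage
+Below is a minimal, illustrative export call. It assumes `net-symbol.json` and `net-0000.params` exist on disk; substitute your own model files.
+
+```python
+import mxnet as mx
+import numpy as np
+
+mx.onnx.export_model('net-symbol.json', 'net-0000.params',
+                     in_shapes=[(1, 3, 224, 224)], in_types=np.float32,
+                     onnx_file_path='net.onnx')
+```
+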
+#### Model with Multiple Input
+When the model has multiple inputs, all the input shapes and dtypes must be provided with `in_shapes` and `in_types`. Note that the shapes/dtypes in `in_shapes`/`in_types` must follow the same order as in the MXNet model symbol file. If `in_types` is provided as a single data type, then that type will be applied to all input nodes.
+
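+For example (illustrative; `mx_sym`, `mx_params`, and `onnx_file` are placeholders for your own objects):
+
+```python
+# two inputs with different dtypes; order must match the symbol's inputs
+in_shapes = [(1, 3, 224, 224), (1, 10)]
+in_types = [np.float32, np.int32]
+mx.onnx.export_model(mx_sym, mx_params, in_shapes, in_types, onnx_file)
+```
+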
+#### Dynamic Shape Input
+We can set `dynamic=True` to turn on support for dynamic input shapes. Note that even with dynamic shapes, a set of static input shapes still needs to be specified in `in_shapes`; on top of that, we'll also need to specify which dimensions of the input shapes are dynamic in `dynamic_input_shapes`. We can simply set the dynamic dimensions as `None`, e.g. `(1, 3, None, None)`, or use strings in place of the `None`'s for better readability in the exported onnx graph, e.g. `(1, 3, 'Heig [...]
+
+```python
+# The batch dimension will be dynamic in this case
+in_shapes = [(1, 3, 224, 224)]
+dynamic_input_shapes = [(None, 3, 224, 224)]
+mx.onnx.export_model(mx_sym, mx_params, in_shapes, in_types, onnx_file,
+                     dynamic=True, dynamic_input_shapes=dynamic_input_shapes)
+```
+
+#### Export Large Model
+Users can set `large_model=True` to export models that are larger than 2GB. In this case, all parameter tensors will be saved into separate files along with the .onnx model file.
+
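+For example (illustrative):
+
+```python
+# saves net.onnx plus external tensor files alongside it
+mx.onnx.export_model(mx_sym, mx_params, in_shapes, in_types, 'net.onnx',
+                     large_model=True)
+```
+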
+### Operator Support Matrix
+We have implemented export logic for a wide range of MXNet operators, covering most CV and NLP use cases. Below is our most up-to-date operator support matrix.
+
+|MXNet Op|ONNX Version|
+|:-|:-:|
+|TODO|TODO|
diff --git a/python/mxnet/contrib/onnx/mx2onnx/__init__.py b/python/mxnet/onnx/__init__.py
similarity index 90%
copy from python/mxnet/contrib/onnx/mx2onnx/__init__.py
copy to python/mxnet/onnx/__init__.py
index 779ce86..85b81b3 100644
--- a/python/mxnet/contrib/onnx/mx2onnx/__init__.py
+++ b/python/mxnet/onnx/__init__.py
@@ -18,6 +18,4 @@
 # coding: utf-8
 """ONNX Export module"""
 
-from . import export_model
-from . import export_onnx
-from . import _op_translations
+from .mx2onnx import export_model, get_operator_support
diff --git a/python/mxnet/contrib/onnx/mx2onnx/LICENSE b/python/mxnet/onnx/mx2onnx/LICENSE
similarity index 100%
rename from python/mxnet/contrib/onnx/mx2onnx/LICENSE
rename to python/mxnet/onnx/mx2onnx/LICENSE
diff --git a/python/mxnet/contrib/onnx/mx2onnx/__init__.py b/python/mxnet/onnx/mx2onnx/__init__.py
similarity index 82%
copy from python/mxnet/contrib/onnx/mx2onnx/__init__.py
copy to python/mxnet/onnx/mx2onnx/__init__.py
index 779ce86..339d74d 100644
--- a/python/mxnet/contrib/onnx/mx2onnx/__init__.py
+++ b/python/mxnet/onnx/mx2onnx/__init__.py
@@ -18,6 +18,6 @@
 # coding: utf-8
 """ONNX Export module"""
 
-from . import export_model
-from . import export_onnx
-from . import _op_translations
+from ._export_model import export_model, get_operator_support
+from ._op_translations import _op_translations_opset12
+from ._op_translations import _op_translations_opset13
diff --git a/python/mxnet/contrib/onnx/mx2onnx/_export_helper.py b/python/mxnet/onnx/mx2onnx/_export_helper.py
similarity index 100%
rename from python/mxnet/contrib/onnx/mx2onnx/_export_helper.py
rename to python/mxnet/onnx/mx2onnx/_export_helper.py
diff --git a/python/mxnet/onnx/mx2onnx/_export_model.py b/python/mxnet/onnx/mx2onnx/_export_model.py
new file mode 100644
index 0000000..fbfadde
--- /dev/null
+++ b/python/mxnet/onnx/mx2onnx/_export_model.py
@@ -0,0 +1,163 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+#pylint: disable-msg=too-many-arguments
+
+"""Exports an MXNet model to the ONNX model format"""
+import logging
+import numpy as np
+
+from mxnet.base import string_types
+from mxnet import symbol
+from ._export_onnx import MXNetGraph
+from ._export_helper import load_module
+
+
+def get_operator_support(opset_version=None):
+    """Return a list of MXNet operators supported by the current/specified opset
+    """
+    try:
+        from onnx.defs import onnx_opset_version
+    except ImportError:
+        raise ImportError("Onnx and protobuf need to be installed. "
+                          + "Instructions to install - https://github.com/onnx/onnx")
+    if opset_version is None:
+        opset_version = onnx_opset_version()
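+    # opset 12 is the lowest registered opset, so walk from opset_version down to 12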
+    all_versions = range(opset_version, 11, -1)
+    ops = set()
+    for ver in all_versions:
+        if ver in MXNetGraph.registry_:
+            ops.update(MXNetGraph.registry_[ver].keys())
+    ops = list(ops)
+    ops.sort()
+    return ops
+
+
+def export_model(sym, params, in_shapes=None, in_types=np.float32,
+                 onnx_file_path='model.onnx', verbose=False, dynamic=False,
+                 dynamic_input_shapes=None, run_shape_inference=False, input_type=None,
+                 input_shape=None, large_model=False):
+    """Exports the MXNet model file, passed as a parameter, into ONNX model.
+    Accepts both symbol/parameter objects and json/params file paths as input.
+    Operator support and coverage -
+    https://github.com/apache/incubator-mxnet/tree/v1.x/python/mxnet/onnx#operator-support-matrix
+
+    Parameters
+    ----------
+    sym : str or symbol object
+        Path to the json file or Symbol object
+    params : str or dict or list of dict
+        str - Path to the params file
+        dict - params dictionary (Including both arg_params and aux_params)
+        list - list of length 2 that contains arg_params and aux_params
+    in_shapes : List of tuple
+        Input shape of the model e.g. [(1,3,224,224)]
+    in_types : data type or list of data types
+        Input data type e.g. np.float32, or [np.float32, np.int32]
+    onnx_file_path : str
+        Path where to save the generated onnx file
+    verbose : Boolean
+        If True will print logs of the model conversion
+    dynamic: Boolean
+        If True will allow for dynamic input shapes to the model
+    dynamic_input_shapes: list of tuple
+        Specifies the dynamic input_shapes. If None then all dimensions are set to None
+    run_shape_inference : Boolean
+        If True will run shape inference on the model
+    input_type : data type or list of data types
+        This is the old name of in_types. We keep this parameter name for backward compatibility
+    input_shape : List of tuple
+        This is the old name of in_shapes. We keep this parameter name for backward compatibility
+    large_model : Boolean
+        Whether to export a model that is larger than 2 GB. If true will save param tensors in separate
+        files along with .onnx model file. This feature is supported since onnx 1.8.0
+
+    Returns
+    -------
+    onnx_file_path : str
+        Onnx file path
+
+    Notes
+    -----
+    This method is available when you ``import mxnet.onnx``
+
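+    Example (a minimal sketch; assumes an in-memory ``Symbol`` and a params dict
+    using MXNet's default auto-generated parameter names)::
+
+        import mxnet as mx
+        import numpy as np
+
+        sym = mx.sym.FullyConnected(mx.sym.var('data'), num_hidden=10)
+        params = {'fullyconnected0_weight': mx.nd.ones((10, 4)),
+                  'fullyconnected0_bias': mx.nd.zeros((10,))}
+        mx.onnx.export_model(sym, params, in_shapes=[(1, 4)],
+                             in_types=np.float32, onnx_file_path='fc.onnx')
+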
+    """
+
+    try:
+        import onnx
+        from onnx import helper, mapping, shape_inference
+        from onnx.defs import onnx_opset_version
+    except ImportError:
+        raise ImportError("Onnx and protobuf need to be installed. "
+                          + "Instructions to install - https://github.com/onnx/onnx")
+
+    if input_type is not None:
+        in_types = input_type
+
+    if input_shape is not None:
+        in_shapes = input_shape
+
+    converter = MXNetGraph()
+    opset_version = onnx_opset_version()
+
+    if not isinstance(in_types, list):
+        in_types = [in_types for _ in range(len(in_shapes))]
+    in_types_t = [mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(i_t)] for i_t in in_types]
+    assert len(in_types) == len(in_shapes), "The lengths of in_types and in_shapes must be equal"
+    # if input parameters are strings(file paths), load files and create symbol parameter objects
+    if isinstance(sym, string_types) and isinstance(params, string_types):
+        logging.info("Converting json and weight file to sym and params")
+        sym_obj, params_obj = load_module(sym, params)
+        onnx_graph = converter.create_onnx_graph_proto(sym_obj, params_obj, in_shapes,
+                                                       in_types_t,
+                                                       verbose=verbose, opset_version=opset_version,
+                                                       dynamic=dynamic, dynamic_input_shapes=dynamic_input_shapes)
+    elif isinstance(sym, symbol.Symbol) and isinstance(params, dict):
+        onnx_graph = converter.create_onnx_graph_proto(sym, params, in_shapes,
+                                                       in_types_t,
+                                                       verbose=verbose, opset_version=opset_version,
+                                                       dynamic=dynamic, dynamic_input_shapes=dynamic_input_shapes)
+    elif isinstance(sym, symbol.Symbol) and isinstance(params, list) and len(params) == 2:
+        # when params contains arg_params and aux_params
+        p = {}
+        p.update(params[0])
+        p.update(params[1])
+        onnx_graph = converter.create_onnx_graph_proto(sym, p, in_shapes,
+                                                       in_types_t,
+                                                       verbose=verbose, opset_version=opset_version,
+                                                       dynamic=dynamic, dynamic_input_shapes=dynamic_input_shapes)
+    else:
+        raise ValueError("Input sym and params should either be files or objects")
+
+    # Create the model (ModelProto)
+    onnx_model = helper.make_model(onnx_graph)
+
+    # Run shape inference on the model. Due to ONNX bugs/incompatibilities this can fail,
+    # in which case the original export is kept.
+    if run_shape_inference:
+        try:
+            onnx_model = shape_inference.infer_shapes(onnx_model)
+        except: # pylint: disable=bare-except
+            logging.info("Shape inference failed, original export is kept.")
+
+    if large_model:
+        from onnx.external_data_helper import convert_model_to_external_data
+        convert_model_to_external_data(onnx_model, all_tensors_to_one_file=False, location=onnx_file_path+'.data')
+
+    onnx.save_model(onnx_model, onnx_file_path)
+    onnx.checker.check_model(onnx_file_path)
+    return onnx_file_path
diff --git a/python/mxnet/onnx/mx2onnx/_export_onnx.py b/python/mxnet/onnx/mx2onnx/_export_onnx.py
new file mode 100644
index 0000000..36ac96e
--- /dev/null
+++ b/python/mxnet/onnx/mx2onnx/_export_onnx.py
@@ -0,0 +1,455 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Based on
+# https://github.com/NVIDIA/mxnet_to_onnx/blob/master/mx2onnx_converter/mx2onnx_converter.py#
+#  Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+#  PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+#  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# coding: utf-8
+# pylint: disable=invalid-name,too-many-locals,no-self-use,too-many-arguments,
+# pylint: disable=maybe-no-member,too-many-nested-blocks,logging-not-lazy
+# pylint: disable=cell-var-from-loop
+"""MXNet to ONNX graph converter functions"""
+import logging
+import json
+
+import numpy as np
+from mxnet import ndarray as nd
+
+
+class MXNetGraph(object):
+    """Class to convert MXNet to ONNX graph"""
+    registry_ = {}
+    input_output_maps_ = {}
+
+    def __init__(self):
+        # topologically sorted nodes
+        self.nodes = []
+        self.input_tensors = []
+        self.output_tensors = []
+
+    @staticmethod
+    def register(op_name, opset_version=12):
+        """Register operators"""
+        def wrapper(func):
+            """Helper function to map functions"""
+            try:
+                import onnx as _
+                op_map = MXNetGraph.registry_.setdefault(opset_version, {})
+                op_map[op_name] = func
+            except ImportError:
+                pass
+            return func
+
+        return wrapper
+
+    @staticmethod
+    def convert_layer(node, **kwargs):
+        """Convert MXNet layer to ONNX"""
+        try:
+            from onnx.defs import onnx_opset_version
+        except ImportError:
+            raise ImportError("Onnx and protobuf need to be installed. "
+                              + "Instructions to install - https://github.com/onnx/onnx")
+
+        op = str(node["op"])
+        opset_version = kwargs.get("opset_version", onnx_opset_version())
+        if opset_version < 12:
+            logging.warning('Your ONNX opset version is %s, ' % str(opset_version) +
+                            'which is lower than the lowest tested opset (12); please consider '
+                            'updating ONNX')
+            opset_version = 12
+        # Fallback to older opset versions if op is not registered in current version
+        convert_func = None
+        for op_version in range(opset_version, 11, -1):
+            if op_version not in MXNetGraph.registry_ or op not in MXNetGraph.registry_[op_version]:
+                continue
+            convert_func = MXNetGraph.registry_[op_version][op]
+            break
+
+        # The conversion logic is not implemented
+        if convert_func is None:
+            raise AttributeError("No conversion function registered for op type %s yet." % op)
+
+        ret = convert_func(node, **kwargs)
+        # in case the conversion function does not specify the returned dtype, we just return None
+        # as the second value
+        if isinstance(ret, list):
+            return ret, None
+        else:
+            return ret
+
+    @staticmethod
+    def split_params(sym, params):
+        """Helper function to split params dictionary into args and aux params
+
+        Parameters
+        ----------
+        sym : :class:`~mxnet.symbol.Symbol`
+            MXNet symbol object
+        params : dict of ``str`` to :class:`~mxnet.ndarray.NDArray`
+            Dict of converted parameters stored in ``mxnet.ndarray.NDArray`` format
+
+        Returns
+        -------
+        arg_params : dict of ``str`` to :class:`~mxnet.ndarray.NDArray`
+            Dict of converted parameters stored in ``mxnet.ndarray.NDArray`` format
+        aux_params : dict of ``str`` to :class:`~mxnet.ndarray.NDArray`
+            Dict of converted parameters stored in ``mxnet.ndarray.NDArray`` format
+        """
+        arg_params = {}
+        aux_params = {}
+        for args in sym.list_arguments():
+            if args in params:
+                arg_params.update({args: nd.array(params[args])})
+        for aux in sym.list_auxiliary_states():
+            if aux in params:
+                aux_params.update({aux: nd.array(params[aux])})
+        return arg_params, aux_params
+
+    @staticmethod
+    def get_outputs(sym, params, in_shapes, output_label, in_types, dynamic=False,
+                    dynamic_input_shapes=None):
+        """Helper function to collect the output names, types, and shapes
+
+        Parameters
+        ----------
+        sym : :class:`~mxnet.symbol.Symbol`
+            MXNet symbol object
+        params : dict of ``str`` to :class:`~mxnet.ndarray.NDArray`
+            Dict of converted parameters stored in ``mxnet.ndarray.NDArray`` format
+        in_shapes : list of tuple
+            Input shapes
+        output_label : ``str``
+            Name of the label typically used in loss that may be left in the graph. This
+            name is removed from the list of inputs required by the symbol
+        in_types : list of Int
+            Input ONNX data types
+        dynamic : Boolean
+            If True will allow for dynamic input shapes to the model
+        dynamic_input_shapes: list of tuple
+            Specifies the dynamic input_shapes. If None then all dimensions are set to None
+
+        Returns
+        -------
+        in_shapes : list of tuple
+            Updated input shapes
+        graph_outputs : dict of ``str`` to dict
+            Maps each output name to {'shape': tuple, 'dtype': Int}
+        """
+        from onnx import mapping
+        import re
+
+        # Collect graph output names
+        out_names = list()
+        for name in sym.list_outputs():
+            if name.endswith('_state_output'): # handle special case for RNN operator
+                out_names.append(name[:-len('_state_output')]+'1')
+            elif name.endswith('_statecell_output'): # handle special case for RNN operator
+                out_names.append(name[:-len('_statecell_output')]+'2')
+            elif name.endswith('_output'):
+                out_names.append(name[:-len('_output')])
+            elif name.endswith('_out'):
+                out_names.append(name[:-len('_out')])
+            elif re.search('.*_output[0-9]$', name):
+                out_names.append(name[:-len('_output0')]+name[-1])
+            else:
+                logging.info("output '%s' does not end with '_output'", name)
+                out_names.append(name)
+
+        # Collect graph output shapes
+        # Remove any input listed in params from sym.list_inputs() and bind them to the input shapes provided
+        # by user. Also remove output_label, which is the name of the label symbol that may have been used
+        # as the label for loss during training.
+        inputs = {n: tuple(s) for n, s in
+                  zip([n for n in sym.list_inputs() if n not in params and n != output_label],
+                      in_shapes)}
+        # Add params and their shape to list of inputs
+        inputs.update({n: v.shape for n, v in params.items() if n in sym.list_inputs()})
+        # Provide input data as well as input params to infer_shape()
+        _, out_shapes, _ = sym.infer_shape(**inputs)
+        if dynamic:
+            # Keep the dimensionality of the output shapes but change the values to None
+            out_shapes = [tuple(None for _ in i_s) for i_s in out_shapes]
+
+            if dynamic_input_shapes is None:
+                # Set all dimensions to None
+                in_shapes = [tuple(None for _ in i_s) for i_s in in_shapes]
+            else:
+                assert len(in_shapes) == len(dynamic_input_shapes), "The length of " \
+                    "dynamic_input_shapes must equal the length of in_shapes."
+                for i_s, d_i_s in zip(in_shapes, dynamic_input_shapes):
+                    assert len(i_s) == len(d_i_s), "The dimensionality " \
+                        "of each shape must match."
+                in_shapes = dynamic_input_shapes
+        else:
+            assert dynamic_input_shapes is None, "dynamic_input_shapes is specified. Please " \
+                "set dynamic=True to enable dynamic input shapes"
+
+        # Collect graph output types
+        # Remove any input listed in params from sym.list_inputs() and bind them to the input types provided
+        # by user. Also remove output_label
+        in_dtypes = {n: mapping.TENSOR_TYPE_TO_NP_TYPE[t] for n, t in
+                     zip([n for n in sym.list_inputs() if n not in params and n != output_label],
+                         in_types)}
+        # Add params and their types to list of inputs
+        in_dtypes.update({n: v.dtype for n, v in params.items() if n in sym.list_inputs()})
+        _, out_type, _ = sym.infer_type(**in_dtypes)
+        out_types = [mapping.NP_TYPE_TO_TENSOR_TYPE[o(0).dtype] for o in out_type]
+
+        # Make sure the types, names, and shapes all align
+        assert len(out_types) == len(out_names) == len(out_shapes)
+
+        # Bind output shapes/types with output names
+        graph_outputs = {n: {'shape': s, 'dtype': d} for n, s, d in zip(out_names, out_shapes, out_types)}
+
+        return in_shapes, graph_outputs
+
+    @staticmethod
+    def convert_weights_to_numpy(weights_dict):
+        """Convert weights to numpy"""
+        return dict([(k.replace("arg:", "").replace("aux:", ""), v.asnumpy())
+                     for k, v in weights_dict.items()])
+
+    def create_onnx_graph_proto(self, sym, params, in_shapes, in_types, verbose=False, opset_version=None,
+                                dynamic=True, dynamic_input_shapes=None):
+        """Convert MXNet graph to ONNX graph
+
+        Parameters
+        ----------
+        sym : :class:`~mxnet.symbol.Symbol`
+            MXNet symbol object
+        params : dict of ``str`` to :class:`~mxnet.ndarray.NDArray`
+            Dict of converted parameters stored in ``mxnet.ndarray.NDArray`` format
+        in_shapes : List of tuple
+            Input shape of the model e.g. [(1,3,224,224)]
+        in_types : List of Int
+            Input ONNX data types
+        verbose : Boolean
+            If true will print logs of the model conversion
+        opset_version : Int
+            ONNX opset version to use for export, defaults to latest supported by onnx package
+        dynamic: Boolean
+            If True will allow for dynamic input shapes to the model
+        dynamic_input_shapes: list of tuple
+            Specifies the dynamic input_shapes. If None then all dimensions are set to None
+
+        Returns
+        -------
+        graph : GraphProto
+            ONNX graph
+        """
+        try:
+            from onnx import (helper, NodeProto, ValueInfoProto, TensorProto)
+            from onnx.helper import make_tensor_value_info
+            from onnx.defs import onnx_opset_version
+        except ImportError:
+            raise ImportError("Onnx and protobuf need to be installed. "
+                              + "Instructions to install - https://github.com/onnx/onnx")
+
+        if opset_version is None:
+            opset_version = onnx_opset_version()
+
+        # When an MXNet model is saved to a json file, MXNet adds a node for the label.
+        # The name of this node is the name of the last node + "_label" (i.e. if the last
+        # node is named "Softmax", this node will be named "Softmax_label"). Also, the new
+        # node will always be the second-to-last node in the json graph.
+        # Deriving the output_label name.
+        output_label = sym.get_internals()[len(sym.get_internals()) - 1].name + "_label"
+
+        weights = MXNetGraph.convert_weights_to_numpy(params)
+
+        mx_graph = json.loads(sym.tojson())["nodes"]
+
+        class NodeOutput:
+            def __init__(self, name, dtype):
+                self.name = name
+                self.dtype = np.dtype(dtype)
+
+        initializer = []
+        all_processed_nodes = []
+        onnx_processed_nodes = []
+        onnx_processed_inputs = []
+        onnx_processed_outputs = []
+        outputs_lookup = []
+
+        # Determine graph output names, shapes, and dtypes. Also update in_shapes
+        in_shapes, graph_outputs = MXNetGraph.get_outputs(sym, params, in_shapes, output_label,
+                                                          in_types, dynamic, dynamic_input_shapes)
+        appeared_names = set()
+        graph_input_idx = 0
+        for idx, node in enumerate(mx_graph):
+            op = node["op"]
+            # check if the current node has the same name as nodes before
+            if node["name"] in appeared_names:
+                node["name"] = 'idx_' + str(idx) + '_' + node["name"]
+            else:
+                appeared_names.add(node["name"])
+            name = node["name"]
+            if verbose:
+                logging.info("Converting idx: %d, op: %s, name: %s", idx, op, name)
+
+            # A node is an input node if its op_name is "null" and is not
+            # in params dict
+            if op == "null" and name not in params:
+                # Handle graph input
+
+                # Skip output_label node, as this node is not part of graph
+                # Refer to "output_label" assignment above for more details.
+                if name == output_label:
+                    continue
+
+                converted, dtypes = MXNetGraph.convert_layer(
+                    node,
+                    is_input=True,
+                    mx_graph=mx_graph,
+                    weights=weights,
+                    in_shape=in_shapes[graph_input_idx],
+                    in_type=in_types[graph_input_idx],
+                    proc_nodes=all_processed_nodes,
+                    initializer=initializer,
+                    outputs_lookup=outputs_lookup)
+                graph_input_idx += 1
+            else:
+                # Handle graph layers
+                converted, dtypes = MXNetGraph.convert_layer(
+                    node,
+                    is_input=False,
+                    mx_graph=mx_graph,
+                    weights=weights,
+                    proc_nodes=all_processed_nodes,
+                    initializer=initializer,
+                    outputs_lookup=outputs_lookup,
+                    idx=idx,
+                    opset_version=opset_version
+                )
+            if isinstance(converted, list):
+                # Collect all the node's output names
+                node_possible_names = [name] + [name + str(i) for i in range(100)]
+                node_output_names = []
+                # Collect all the graph's output names
+                graph_output_names = []
+                # Iterate for all converted nodes
+                for converted_node in converted:
+                    # If converted node is ValueInfoProto, add it in inputs
+                    if isinstance(converted_node, ValueInfoProto):
+                        onnx_processed_inputs.append(converted_node)
+                    # If converted node is NodeProto, add it in processed nodes list
+                    elif isinstance(converted_node, NodeProto):
+                        onnx_processed_nodes.append(converted_node)
+                        # some operators have multiple outputs,
+                        # therefore, check all output node names
+                        node_names = list(converted_node.output)
+                        for nodename in node_names:
+                            if nodename in node_possible_names:
+                                node_output_names.append(nodename)
+                            if nodename in graph_outputs:
+                                graph_output_names.append(nodename)
+                                if verbose:
+                                    logging.info("Output node is: %s", nodename)
+                    elif isinstance(converted_node, TensorProto):
+                        raise ValueError("Did not expect TensorProto")
+                    else:
+                        raise ValueError("node is of an unrecognized type: %s" % type(node))
+
+                    all_processed_nodes.append(converted_node)
+
+                # if node_output_names is empty then we use the last returned node as output
+                if not node_output_names:
+                    node_output_names = [converted[-1].name]
+                # process node outputs (sort by output index)
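+                # a bare `name` maps to -1 so it sorts first; `name<N>` sorts by its integer suffix N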
+                def str2int(s, l):
+                    if len(s) == l:
+                        return -1
+                    else:
+                        return int(s[l:])
+
+                node_output_names = sorted(node_output_names, key=lambda x: str2int(x, len(name)))
+
+                # match the output names to output dtypes
+                if dtypes is not None:
+                    assert len(node_output_names) == len(dtypes)
+                    node_outputs = [NodeOutput(node_output_names[i], dtypes[i])
+                                    for i in range(len(dtypes))]
+                else:
+                    # in case dtypes is None, we just default to the dtype of the first input
+                    assert len(node["inputs"]) > 0
+                    first_input = node["inputs"][0]
+                    first_input_dtype = outputs_lookup[first_input[0]][first_input[1]].dtype
+                    node_outputs = [NodeOutput(n, first_input_dtype)
+                                    for n in node_output_names]
+                outputs_lookup.append(node_outputs)
+
+                # process graph outputs (sorted alphabetically)
+                graph_output_names.sort()
+                for nodename in graph_output_names:
+                    onnx_processed_outputs.append(
+                        make_tensor_value_info(
+                            name=nodename,
+                            elem_type=graph_outputs[nodename]['dtype'],
+                            shape=graph_outputs[nodename]['shape']
+                        )
+                    )
+
+            else:
+                logging.info("Operator converter function should always return a list")
+
+        # sometimes the graph output can also be in the initializer
+        for i in initializer:
+            if i.name in graph_outputs:
+                onnx_processed_outputs.append(
+                    make_tensor_value_info(
+                        name=i.name,
+                        elem_type=graph_outputs[i.name]['dtype'],
+                        shape=graph_outputs[i.name]['shape']
+                    )
+                )
+
+        graph = helper.make_graph(
+            onnx_processed_nodes,
+            "mxnet_converted_model",
+            onnx_processed_inputs,
+            onnx_processed_outputs
+        )
+
+        graph.initializer.extend(initializer)
+
+        return graph
diff --git a/python/mxnet/contrib/onnx/onnx2mx/__init__.py b/python/mxnet/onnx/mx2onnx/_op_translations/__init__.py
similarity index 87%
rename from python/mxnet/contrib/onnx/onnx2mx/__init__.py
rename to python/mxnet/onnx/mx2onnx/_op_translations/__init__.py
index d0411df..ba26e20 100644
--- a/python/mxnet/contrib/onnx/onnx2mx/__init__.py
+++ b/python/mxnet/onnx/mx2onnx/_op_translations/__init__.py
@@ -16,7 +16,7 @@
 # under the License.
 
 # coding: utf-8
-"""ONNX Import module"""
-from . import import_model
-from . import import_onnx
-from . import import_to_gluon
+"""ONNX export op translation"""
+
+from . import _op_translations_opset12
+from . import _op_translations_opset13
diff --git a/python/mxnet/onnx/mx2onnx/_op_translations/_op_translations_opset12.py b/python/mxnet/onnx/mx2onnx/_op_translations/_op_translations_opset12.py
new file mode 100644
index 0000000..7e0cd8d
--- /dev/null
+++ b/python/mxnet/onnx/mx2onnx/_op_translations/_op_translations_opset12.py
@@ -0,0 +1,5349 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Based on
+#  https://github.com/NVIDIA/mxnet_to_onnx/blob/master/mx2onnx_converter/
+# mx2onnx_converter_functions.py
+#  Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+#  PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+#  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# coding: utf-8
+# pylint: disable=too-many-locals,no-else-return,too-many-lines
+# pylint: disable=anomalous-backslash-in-string,eval-used
+# pylint: disable=too-many-function-args
+"""
+Conversion Functions for common layers.
+Add new functions here with a decorator.
+"""
+
+import re
+import logging
+import numpy as np
+from .._export_onnx import MXNetGraph as mx_op
+try:
+    import onnx
+except ImportError:
+    onnx = None
+
+
+def parse_helper(attrs, attrs_name, alt_value=None):
+    """Helper function to parse operator attributes in required format."""
+    tuple_re = re.compile(r'\([0-9L|,| ]+\)')
+    if not attrs:
+        return alt_value
+    attrs_str = None if attrs.get(attrs_name) is None else str(attrs.get(attrs_name))
+    if attrs_str is None:
+        return alt_value
+    attrs_match = tuple_re.search(attrs_str)
+    if attrs_match is not None:
+        if attrs_match.span() == (0, len(attrs_str)):
+            dims = eval(attrs_str)
+            return dims
+        else:
+            raise AttributeError("Malformed %s dimensions: %s" % (attrs_name, str(attrs_str)))
+    return alt_value
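+
+# Illustrative usage: parse_helper({'kernel': '(3, 3)'}, 'kernel') returns (3, 3),
+# while parse_helper({}, 'kernel', alt_value=()) falls back to ().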
+
+def transform_padding(pad_width):
+    """Helper function to convert padding format for pad operator.
+    """
+    num_pad_values = len(pad_width)
+    onnx_pad_width = [0]*num_pad_values
+
+    start_index = 0
+    # num_pad_values will always be multiple of 2
+    end_index = int(num_pad_values/2)
+    for idx in range(0, num_pad_values):
+        if idx % 2 == 0:
+            onnx_pad_width[start_index] = pad_width[idx]
+            start_index += 1
+        else:
+            onnx_pad_width[end_index] = pad_width[idx]
+            end_index += 1
+
+    return onnx_pad_width
+
+
+def convert_string_to_list(string_val):
+    """Helper function to convert string to list.
+     Used to convert shape attribute string to list format.
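+
+    For example, the attribute string "(3, 4L, None)" becomes [3, 4, None].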
+    """
+    result_list = []
+
+    list_string = string_val.split(',')
+    for val in list_string:
+        val = str(val.strip())
+        val = val.replace("(", "")
+        val = val.replace(")", "")
+        val = val.replace("L", "")
+        val = val.replace("[", "")
+        val = val.replace("]", "")
+        if val == "None":
+            result_list.append(None)
+        elif val != "":
+            result_list.append(int(val))
+
+    return result_list
+
+def get_boolean_attribute_value(attrs, attr_name):
+    """ Helper function to convert a string version
+    of Boolean attributes to integer for ONNX.
+    Takes attribute dictionary and attr_name as
+    parameters.
+    """
+    return 1 if attrs.get(attr_name, 0) in ["True", "1"] else 0
+
+def get_inputs(node, kwargs):
+    """Helper function to get inputs"""
+    name = node["name"]
+    outputs_lookup = kwargs["outputs_lookup"]
+    inputs = node["inputs"]
+    attrs = node.get("attrs", {})
+
+    input_nodes = []
+    for ip in inputs:
+        input_node_name = outputs_lookup[ip[0]][ip[1]].name
+        input_nodes.append(input_node_name)
+
+    return name, input_nodes, attrs
+
+def get_input_dtypes(node, kwargs):
+    """Helper function to get the dtypes of a node's inputs"""
+    outputs_lookup = kwargs['outputs_lookup']
+    inputs = node['inputs']
+    input_dtypes = []
+    for ip in inputs:
+        input_node_dtype = outputs_lookup[ip[0]][ip[1]].dtype
+        input_dtypes.append(input_node_dtype)
+    return input_dtypes
+
+def create_basic_op_node(op_name, node, kwargs):
+    """Helper function to create a basic operator
+    node that doesn't contain op specific attrs"""
+    name, input_nodes, _ = get_inputs(node, kwargs)
+    node = onnx.helper.make_node(
+        op_name,
+        input_nodes,
+        [name],
+        name=name
+    )
+    return [node]
+
+def create_const_scalar_node(input_name, value, kwargs):
+    """Helper function to create a tensor value node and a
+    initializer tensor node with constant value."""
+    from onnx.helper import make_tensor
+    initializer = kwargs["initializer"]
+    dtype = value.dtype
+    if dtype == 'float16':
+        # when using float16, we must convert it to np.uint16 view first
+        value = np.float16(value).view(np.uint16)
+    input_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+    tensor_node = make_tensor(input_name, input_type, (), ([value]))
+    initializer.append(tensor_node)
+
+def create_const_node(input_name, value, kwargs):
+    """Helper function to create a tensor value node and a
+    initializer tensor node with constant value."""
+    from onnx.helper import make_tensor
+    initializer = kwargs["initializer"]
+    dtype = value.dtype
+    if dtype == 'float16':
+        # when using float16, we must convert it to np.uint16 view first
+        value = np.float16(value).view(np.uint16)
+    input_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+    input_shape = value.shape
+    tensor_node = make_tensor(input_name, input_type, input_shape, value)
+    initializer.append(tensor_node)
+
+def create_tensor(tensor_list, tensor_name, initializer, dtype='int64'):
+    """Helper function to create a tensor value node and a
+    initializer tensor node with constant value."""
+    tensor_np = np.array(tensor_list, dtype=dtype)
+    data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[tensor_np.dtype]
+    dims = np.shape(tensor_np)
+    if tensor_np.dtype == np.float16:
+        tensor_np = tensor_np.view(dtype=np.uint16)
+    tensor = onnx.helper.make_tensor(
+        name=tensor_name,
+        data_type=data_type,
+        dims=dims,
+        vals=tensor_np.flatten().tolist(),
+        raw=False
+    )
+    initializer.append(tensor)
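+
+# Illustrative usage (inside a converter): create_tensor([1, 2, 3], name+'_shape',
+# kwargs['initializer']) appends an int64 TensorProto holding [1, 2, 3].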
+
+
+@mx_op.register("null")
+def convert_weights_and_inputs(node, **kwargs):
+    """Helper function to convert weights and inputs.
+    """
+    name, _, _ = get_inputs(node, kwargs)
+    if kwargs["is_input"] is False:
+        weights = kwargs["weights"]
+        initializer = kwargs["initializer"]
+        np_arr = weights[name]
+        data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[np_arr.dtype]
+        dims = np.shape(np_arr)
+
+        tensor_node = onnx.helper.make_tensor_value_info(name, data_type, dims)
+
+        from onnx import numpy_helper
+        tensor = numpy_helper.from_array(np_arr, name=name)
+        initializer.append(tensor)
+
+        return [tensor_node], (np_arr.dtype,)
+    else:
+        dtype_t = kwargs["in_type"]
+        dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[dtype_t]
+        tval_node = onnx.helper.make_tensor_value_info(name, dtype_t, kwargs["in_shape"])
+        return [tval_node], (dtype,)
+
+
+@mx_op.register('Convolution')
+def convert_convolution(node, **kwargs):
+    """Map MXNet's convolution operator attributes to onnx's Conv operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    kernel = convert_string_to_list(attrs.get('kernel', '()'))
+    stride = convert_string_to_list(attrs.get('stride', '()'))
+    dilate = convert_string_to_list(attrs.get('dilate', '()'))
+    pad = convert_string_to_list(attrs.get('pad', '()'))
+    num_group = int(attrs.get('num_group', 1))
+    no_bias = attrs.get('no_bias', 'False')
+    layout = attrs.get('layout', 'NCHW')
+
+    if layout not in ['NCHW', 'NCDHW']:
+        raise NotImplementedError('Convolution currently does not support layout not in '
+                                  '[\'NCHW\', \'NCDHW\']')
+
+    if no_bias in ['True', '1']:
+        assert len(input_nodes) == 2, 'Convolution takes 2 inputs if no_bias==True'
+    else:
+        assert len(input_nodes) == 3, 'Convolution takes 3 inputs if no_bias==False'
+
+    kwargs_ = {}
+    if kernel:
+        kwargs_['kernel_shape'] = tuple(kernel)
+    if pad:
+        kwargs_['pads'] = tuple(pad) + tuple(pad)
+    if stride:
+        kwargs_['strides'] = stride
+    if dilate:
+        kwargs_['dilations'] = dilate
+
+    nodes = [
+        make_node('Conv', input_nodes, [name], group=num_group, **kwargs_)
+    ]
+
+    return nodes
+
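+# Example (illustration only): MXNet attrs kernel='(3, 3)', stride='(1, 1)',
+# pad='(1, 1)' become Conv attributes kernel_shape=(3, 3), strides=[1, 1],
+# pads=(1, 1, 1, 1): MXNet's symmetric per-axis pad is duplicated into ONNX's
+# (begin..., end...) layout.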
+
+@mx_op.register('Deconvolution')
+def convert_deconvolution(node, **kwargs):
+    """Map MXNet's deconvolution operator attributes to onnx's ConvTranspose operator
+    and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    kernel_shape = convert_string_to_list(attrs.get('kernel', '()'))
+    strides = convert_string_to_list(attrs.get('stride', '()'))
+    pads = convert_string_to_list(attrs.get('pad', '()'))
+    group = int(attrs.get("num_group", 1))
+    dilations = convert_string_to_list(attrs.get('dilate', '()'))
+    output_padding = convert_string_to_list(attrs.get('adj', '()'))
+    layout = attrs.get('layout', 'NCHW')
+    target_shape = attrs.get('target_shape', '')
+    no_bias = attrs.get('no_bias', 'False')
+
+    pads = pads + pads
+
+    if target_shape not in ['', 'None']:
+        raise NotImplementedError('Deconvolution currently does not support target_shape')
+
+    if layout not in ['NCHW', 'NCDHW', 'NCW']:
+        raise NotImplementedError('Deconvolution currently does not support layout not in '
+                                  '[\'NCHW\', \'NCDHW\', \'NCW\']')
+
+    if no_bias in ['1', 'True']:
+        assert len(input_nodes) == 2, 'Deconvolution takes 2 inputs if no_bias==True'
+    else:
+        assert len(input_nodes) == 3, 'Deconvolution takes 3 inputs if no_bias==False'
+
+    kwargs_ = {}
+    if kernel_shape:
+        kwargs_['kernel_shape'] = kernel_shape
+    if pads:
+        kwargs_['pads'] = pads
+    if strides:
+        kwargs_['strides'] = strides
+    if dilations:
+        kwargs_['dilations'] = dilations
+    if output_padding:
+        kwargs_['output_padding'] = output_padding
+
+    deconv_node = onnx.helper.make_node(
+        "ConvTranspose",
+        inputs=input_nodes,
+        outputs=[name],
+        group=group,
+        **kwargs_
+    )
+
+    return [deconv_node]
+
+
+@mx_op.register('Crop')
+def convert_crop(node, **kwargs):
+    """Map MXNet's crop operator attributes to onnx's Slice operator
+    """
+    from onnx.helper import make_node
+    name, inputs, attrs = get_inputs(node, kwargs)
+
+    num_inputs = len(inputs)
+    y, x = convert_string_to_list(attrs.get('offset', '(0, 0)')) # pylint: disable=unbalanced-tuple-unpacking
+    h, w = convert_string_to_list(attrs.get('h_w', '(0, 0)')) # pylint: disable=unbalanced-tuple-unpacking
+    center_crop = attrs.get('center_crop', 'False')
+
+    if center_crop in ['True', '1']:
+        raise NotImplementedError('Crop does not currently support center_crop==True')
+
+    nodes = []
+    create_tensor([y, x], name+'_starts', kwargs['initializer'])
+    create_tensor([2, 3], name+'_axes', kwargs['initializer'])
+    if num_inputs == 1:
+        create_tensor([y + h, x + w], name+'_ends', kwargs['initializer'])
+    else:
+        create_tensor([0], name+'_0', kwargs['initializer'])
+        create_tensor([2], name+'_2', kwargs['initializer'])
+        create_tensor([4], name+'_4', kwargs['initializer'])
+        nodes += [
+            make_node('Shape', [inputs[1]], [name+'_shape']),
+            make_node('Slice', [name+'_shape', name+'_2', name+'_4', name+'_0'], [name+'_h_w']),
+            make_node('Add', [name+'_starts', name+'_h_w'], [name+'_ends'])
+
+        ]
+    nodes += [
+        make_node('Slice', [inputs[0], name+'_starts', name+'_ends', name+'_axes'], [name])
+    ]
+
+    return nodes
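+
+# Example (illustration only): with two inputs, Crop reads the target H and W
+# from dims 2:4 of the second input's runtime shape, so the exported Slice
+# ends are computed as starts + h_w instead of being baked in as constants.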
+
+@mx_op.register("FullyConnected")
+def convert_fully_connected(node, **kwargs):
+    """Map MXNet's FullyConnected operator attributes to onnx's Gemm operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    dtype = input_dtypes[0]
+    flatten = get_boolean_attribute_value(attrs, 'flatten')
+    no_bias = get_boolean_attribute_value(attrs, 'no_bias')
+    num_hidden = int(attrs.get('num_hidden'))
+
+    nodes = []
+    if flatten:
+        nodes += [
+            make_node('Flatten', [input_nodes[0]], [name+'_data_flattened'])
+        ]
+    else:
+        nodes += [
+            make_node('Shape', [input_nodes[0]], [name+'_orig_shape']),
+            make_node('Shape', [name+'_orig_shape'], [name+'_dim']),
+            make_node('Flatten', [input_nodes[0]], [name+'_data_flattened'], axis=-1),
+        ]
+
+    in_nodes = [name+'_data_flattened', input_nodes[1]]
+
+    if no_bias:
+        create_const_scalar_node(name+'_bias', np.int32(0).astype(dtype), kwargs)
+        in_nodes.append(name+'_bias')
+    else:
+        in_nodes.append(input_nodes[2])
+
+    if flatten:
+        nodes += [
+            make_node('Gemm', in_nodes, [name], alpha=1.0, beta=1.0, transA=0, transB=1, name=name)
+        ]
+    else:
+        create_tensor([0], name+'_0', kwargs['initializer'])
+        create_tensor([1], name+'_1', kwargs['initializer'])
+        create_tensor([num_hidden], name+'_num_hidden', kwargs['initializer'])
+        nodes += [
+            make_node('Gemm', in_nodes, [name+'_gemm'], alpha=1.0, beta=1.0, transA=0, transB=1),
+            make_node('Sub', [name+'_dim', name+'_1'], [name+'_dim_minus_1']),
+            make_node('Slice', [name+'_orig_shape', name+'_0', name+'_dim_minus_1'],
+                      [name+'_shape_sliced']),
+            make_node('Concat', [name+'_shape_sliced', name+'_num_hidden'],
+                      [name+'_shape_new'], axis=0),
+            make_node('Reshape', [name+'_gemm', name+'_shape_new'], [name], name=name)
+        ]
+
+    return nodes
+
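+# Example (illustration only): with flatten=False a (b, t, c) input is
+# flattened to (b*t, c) (Flatten with axis=-1), multiplied through Gemm
+# against the (num_hidden, c) weight with transB=1, then reshaped back to
+# (b, t, num_hidden) by concatenating num_hidden onto the original leading
+# dims.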
+
+@mx_op.register("BatchNorm")
+def convert_batchnorm(node, **kwargs):
+    """Map MXNet's BatchNorm operator attributes to onnx's BatchNormalization operator
+    and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    momentum = float(attrs.get("momentum", 0.9))
+    eps = float(attrs.get("eps", 0.001))
+    axis = int(attrs.get("axis", 1))
+
+    if axis != 1:
+        raise NotImplementedError("batchnorm axis != 1 is currently not supported.")
+
+    bn_node = onnx.helper.make_node(
+        "BatchNormalization",
+        input_nodes,
+        [name],
+        name=name,
+        epsilon=eps,
+        momentum=momentum
+        # MXNet computes mean and variance per channel for batchnorm.
+        # Default for onnx is across all spatial features. Relying on default
+        # ONNX behavior of spatial=1 for ONNX opset 8 and below. As the spatial
+        # attribute is deprecated in opset 9 and above, not explicitly encoding it.
+    )
+    return [bn_node]
+
+
+@mx_op.register("tanh")
+@mx_op.register("_npi_tanh")
+def convert_tanh(node, **kwargs):
+    """Map MXNet's tanh operator attributes to onnx's Tanh operator
+    and return the created node.
+    """
+    return create_basic_op_node('Tanh', node, kwargs)
+
+@mx_op.register("cos")
+@mx_op.register("_npi_cos")
+def convert_cos(node, **kwargs):
+    """Map MXNet's cos operator attributes to onnx's Cos operator
+    and return the created node.
+    """
+    return create_basic_op_node('Cos', node, kwargs)
+
+@mx_op.register("sin")
+@mx_op.register("_npi_sin")
+def convert_sin(node, **kwargs):
+    """Map MXNet's sin operator attributes to onnx's Sin operator
+    and return the created node.
+    """
+    return create_basic_op_node('Sin', node, kwargs)
+
+@mx_op.register("tan")
+@mx_op.register("_npi_tan")
+def convert_tan(node, **kwargs):
+    """Map MXNet's tan operator attributes to onnx's tan operator
+    and return the created node.
+    """
+    return create_basic_op_node('Tan', node, kwargs)
+
+@mx_op.register("arccos")
+@mx_op.register("_npi_arccos")
+def convert_acos(node, **kwargs):
+    """Map MXNet's acos operator attributes to onnx's acos operator
+    and return the created node.
+    """
+    return create_basic_op_node('Acos', node, kwargs)
+
+@mx_op.register("arcsin")
+@mx_op.register("_npi_arcsin")
+def convert_asin(node, **kwargs):
+    """Map MXNet's asin operator attributes to onnx's asin operator
+    and return the created node.
+    """
+    return create_basic_op_node('Asin', node, kwargs)
+
+@mx_op.register("arctan")
+@mx_op.register("_npi_arctan")
+def convert_atan(node, **kwargs):
+    """Map MXNet's atan operator attributes to onnx's atan operator
+    and return the created node.
+    """
+    return create_basic_op_node('Atan', node, kwargs)
+
+#Basic neural network functions
+@mx_op.register("sigmoid")
+@mx_op.register("_npx_sigmoid")
+def convert_sigmoid(node, **kwargs):
+    """Map MXNet's sigmoid operator attributes to onnx's Sigmoid operator
+    and return the created node.
+    """
+    return create_basic_op_node('Sigmoid', node, kwargs)
+
+@mx_op.register("relu")
+@mx_op.register("_npx_relu")
+def convert_relu(node, **kwargs):
+    """Map MXNet's relu operator attributes to onnx's Relu operator
+    and return the created node.
+    """
+    return create_basic_op_node('Relu', node, kwargs)
+
+@mx_op.register("Activation")
+def convert_activation(node, **kwargs):
+    """Map MXNet's Activation operator attributes to onnx's Tanh/Relu operator
+    and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    act_type = attrs["act_type"]
+
+    # Map MXNet activation names to ONNX op names. Most follow the titlecase
+    # pattern mxnet_name.title(), but not all (e.g. softrelu -> Softplus).
+    act_types = {
+        "tanh": "Tanh",
+        "relu": "Relu",
+        "sigmoid": "Sigmoid",
+        "softrelu": "Softplus",
+        "softsign": "Softsign"
+    }
+
+    act_name = act_types.get(act_type)
+    if act_name:
+        node = onnx.helper.make_node(
+            act_name,
+            input_nodes,
+            [name],
+            name=name
+        )
+    else:
+        raise AttributeError(
+            "Activation %s not implemented or recognized in the converter" % act_type
+        )
+
+    return [node]
+
+
+@mx_op.register("Pad")
+def convert_pad(node, **kwargs):
+    """Map MXNet's pad operator attributes to onnx's Pad operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    opset_version = kwargs["opset_version"]
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    dtype = input_dtypes[0]
+
+    mxnet_pad_width = convert_string_to_list(attrs.get("pad_width"))
+    onnx_pad_width = transform_padding(mxnet_pad_width)
+
+    pad_mode = attrs.get("mode")
+    pad_value = float(attrs.get("constant_value", 0.0))
+    pad_value = dtype.type(pad_value)
+
+    if opset_version >= 11:
+        # starting with opset 11, pads and constant_value are inputs instead of attributes
+        create_const_node(name+"_pads", np.array(onnx_pad_width, dtype='int64'), kwargs)
+        nodes = []
+        if pad_mode == "constant":
+            create_const_scalar_node(name+"_const", pad_value, kwargs)
+            nodes += [
+                make_node("Pad", [input_nodes[0], name+"_pads", name+"_const"], [name], mode=pad_mode, name=name)
+            ]
+        else:
+            nodes += [
+                make_node("Pad", [input_nodes[0], name+"_pads"], [name], mode=pad_mode, name=name)
+            ]
+        return nodes
+    else:
+        if pad_mode == "constant":
+            node = onnx.helper.make_node(
+                'Pad',
+                inputs=input_nodes,
+                outputs=[name],
+                mode='constant',
+                value=pad_value,
+                pads=onnx_pad_width,
+                name=name
+            )
+        else:
+            node = onnx.helper.make_node(
+                'Pad',
+                inputs=input_nodes,
+                outputs=[name],
+                mode=pad_mode,
+                pads=onnx_pad_width,
+                name=name
+            )
+        return [node]
+
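+# Example (illustration only): for opset >= 11 the pad widths and constant
+# value travel as inputs (an int64 '_pads' initializer plus a scalar
+# '_const'); for older opsets they remain the pads/value attributes of the
+# Pad node itself.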
+
+def create_helper_trans_node(node_name, input_node):
+    """create extra transpose node for dot operator"""
+    trans_node = onnx.helper.make_node(
+        'Transpose',
+        inputs=[input_node],
+        outputs=[node_name],
+        name=node_name
+    )
+    return trans_node
+
+
+# Note that due to an ONNX limitation, the behavior for inputs with more than
+# 2 dims differs from that of MXNet
+@mx_op.register("dot")
+def convert_dot(node, **kwargs):
+    """Map MXNet's dot operator attributes to onnx's
+    MatMul and Transpose operators based on the values set for
+    transpose_a, transpose_b attributes."""
+    logging.warning('Converting dot operator... Please note that due to an ONNX limitation, '
+                    'the behavior for inputs with more than 2 dims differs from that of MXNet dot.')
+
+    name, inputs, attrs = get_inputs(node, kwargs)
+    trans_a = get_boolean_attribute_value(attrs, "transpose_a")
+    trans_b = get_boolean_attribute_value(attrs, "transpose_b")
+
+    nodes = []
+    input_nodes = []
+    if trans_a:
+        nodes.append(create_helper_trans_node(name+"_a", inputs[0]))
+        input_nodes.append(name+"_a")
+    else:
+        input_nodes.append(inputs[0])
+
+    if trans_b:
+        nodes.append(create_helper_trans_node(name+"_b", inputs[1]))
+        input_nodes.append(name+"_b")
+    else:
+        input_nodes.append(inputs[1])
+
+    nodes.append(onnx.helper.make_node('MatMul', input_nodes, [name], name=name))
+    return nodes
+
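+# Example (illustration only): dot(a, b, transpose_b=True) exports as
+# Transpose(b) followed by MatMul; the helper Transpose above carries no perm
+# attribute and therefore reverses all axes, which matches MXNet's transpose
+# only for 2-D inputs -- hence the warning emitted above.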
+
+def transpose_last_two_dim(name, kwargs):
+    """Helper function to transpose the last two dims of the input tensor
+    """
+    from onnx.helper import make_node
+    create_tensor([0], name+'_0', kwargs['initializer'])
+    create_tensor([1], name+'_1', kwargs['initializer'])
+    create_tensor([8], name+'_8', kwargs['initializer'])
+    perm = list(range(8))
+    perm[6], perm[7] = 7, 6
+    nodes = [
+        make_node('Shape', [name], [name+'_shape']),
+        make_node('Shape', [name+'_shape'], [name+'_dim']),
+        make_node('Sub', [name+'_8', name+'_dim'], [name+'_sub']),
+        make_node('Concat', [name+'_sub', name+'_0'], [name+'_concat'], axis=0),
+        make_node('Pad', [name+'_shape', name+'_concat', name+'_1'], [name+'_shape_8_dim']),
+        make_node('Reshape', [name, name+'_shape_8_dim'], [name+'_data_8_dim']),
+        make_node('Transpose', [name+'_data_8_dim'], [name+'_data_t'], perm=perm),
+        make_node('Shape', [name+'_data_t'], [name+'_new_shape_']),
+        make_node('Slice', [name+'_new_shape_', name+'_sub', name+'_8', name+'_0'],
+                  [name+'_new_shape']),
+        make_node('Reshape', [name+'_data_t', name+'_new_shape'], [name+'_transposed']),
+    ]
+
+    return nodes
+
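+# The helper above sidesteps Transpose needing a rank-static perm: the input
+# is right-aligned into a fixed 8-D shape (left-padded with 1's by running Pad
+# on the shape tensor), transposed with a perm that swaps dims 6 and 7, and
+# reshaped back to its original rank. Example (illustration only): a (2, 3, 4)
+# input becomes (1, 1, 1, 1, 1, 2, 3, 4), is transposed to
+# (1, 1, 1, 1, 1, 2, 4, 3), and is reshaped to (2, 4, 3).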
+
+@mx_op.register("_linalg_gemm2")
+def convert_linalg_gemm2(node, **kwargs):
+    """Map MXNet's _linalg_gemm2 operator attributes to onnx's
+    MatMul and Transpose operators based on the values set for
+    transpose_a, transpose_b attributes.
+    Return multiple nodes created.
+    """
+    from onnx.helper import make_node
+    name, inputs, attrs = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    dtype = input_dtypes[0]
+
+    # Getting the attributes and assigning default values.
+    alpha = float(attrs.get('alpha', 1.0))
+    axis = attrs.get('axis', 'None')
+    trans_a = get_boolean_attribute_value(attrs, 'transpose_a')
+    trans_b = get_boolean_attribute_value(attrs, 'transpose_b')
+
+    if axis != 'None':
+        raise NotImplementedError('_linalg_gemm2 does not currently support axis!=None')
+
+    nodes = []
+    input_nodes = []
+    if trans_a:
+        nodes += transpose_last_two_dim(inputs[0], kwargs)
+        input_nodes.append(inputs[0]+'_transposed')
+    else:
+        input_nodes.append(inputs[0])
+
+    if trans_b:
+        nodes += transpose_last_two_dim(inputs[1], kwargs)
+        input_nodes.append(inputs[1]+'_transposed')
+    else:
+        input_nodes.append(inputs[1])
+
+    if alpha == 1:
+        nodes += [
+            make_node('MatMul', input_nodes, [name])
+        ]
+        return nodes
+
+    create_const_scalar_node(name+"_alpha", dtype.type(alpha), kwargs)
+    nodes += [
+        make_node('MatMul', input_nodes, [name+'_matmul']),
+        make_node('Mul', [name+'_matmul', name+'_alpha'], [name])
+    ]
+    return nodes
+
+@mx_op.register('Pooling')
+def convert_pooling(node, **kwargs):
+    """Map MXNet's Pooling operator attributes to onnx's
+    MaxPool/AveragePool/GlobalMaxPool/GlobalAveragePool operators
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    kernel = convert_string_to_list(attrs.get('kernel', '()'))
+    pool_type = attrs.get('pool_type', 'max')
+    global_pool = attrs.get('global_pool', 'False')
+    global_pool = global_pool in ['True', '1']
+    _ = attrs.get('cudnn_off', 'False')
+    pooling_convention = attrs.get('pooling_convention', 'valid')
+    stride = convert_string_to_list(attrs.get('stride', '()'))
+    pad = convert_string_to_list(attrs.get('pad', '()'))
+    p_value = attrs.get('p_value', '0')
+    if p_value != 'None':
+        p_value = int(p_value)
+    count_include_pad = attrs.get('count_include_pad', 'True')
+    layout = attrs.get('layout', 'NCHW')
+
+    if pooling_convention == 'same':
+        raise NotImplementedError('Pooling currently does not support '
+                                  'pooling_convention==\'same\'')
+    if pool_type == 'sum':
+        raise NotImplementedError('Pooling currently does not support pool_type==\'sum\'')
+    if pool_type == 'lp' and not global_pool and pooling_convention != 'valid':
+        raise NotImplementedError('Pooling currently does not support '
+                                  'pooling_convention!=\'valid\' when pool_type==\'lp\' and global_pool==False')
+
+    if layout not in ['NCHW', 'NCDHW']:
+        raise NotImplementedError('Pooling currently does not support layout not in '
+                                  '[\'NCHW\', \'NCDHW\']')
+
+    kwargs_ = {}
+    if kernel:
+        kwargs_['kernel_shape'] = tuple(kernel)
+    if pad:
+        kwargs_['pads'] = tuple(pad) + tuple(pad)
+    if stride:
+        kwargs_['strides'] = stride
+
+    ceil_mode = 1 if pooling_convention == 'full' else 0
+    count_include_pad = 1 if count_include_pad == 'True' else 0
+
+    nodes = []
+    if pool_type == 'avg' and not global_pool:
+        nodes += [
+            make_node('AveragePool', [input_nodes[0]], [name], ceil_mode=ceil_mode,
+                      count_include_pad=count_include_pad, **kwargs_)
+        ]
+    elif pool_type == 'max' and not global_pool:
+        nodes += [
+            make_node('MaxPool', [input_nodes[0]], [name], ceil_mode=ceil_mode, **kwargs_)
+        ]
+    elif pool_type == 'lp' and not global_pool:
+        nodes += [
+            make_node('LpPool', [input_nodes[0]], [name], p=p_value, **kwargs_)
+        ]
+    elif pool_type == 'avg' and global_pool:
+        nodes += [
+            make_node('GlobalAveragePool', [input_nodes[0]], [name])
+        ]
+    elif pool_type == 'max' and global_pool:
+        nodes += [
+            make_node('GlobalMaxPool', [input_nodes[0]], [name])
+        ]
+    elif pool_type == 'lp' and global_pool:
+        nodes += [
+            make_node('GlobalLpPool', [input_nodes[0]], [name], p=p_value)
+        ]
+    else:
+        raise NotImplementedError('Unknown pool_type in Pooling')
+
+    return nodes
+
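+# Example (illustration only): pool_type='max', kernel='(2, 2)',
+# stride='(2, 2)', pooling_convention='full' exports as
+# MaxPool(kernel_shape=(2, 2), strides=[2, 2], ceil_mode=1), while
+# global_pool=True drops kernel/stride entirely in favor of GlobalMaxPool.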
+
+@mx_op.register("exp")
+@mx_op.register("_npi_exp")
+def convert_exp(node, **kwargs):
+    """Map MXNet's exp operator attributes to onnx's Exp operator
+    and return the created node.
+    """
+    return create_basic_op_node('Exp', node, kwargs)
+
+@mx_op.register("_copy")
+def convert_copy(node, **kwargs):
+    """Map MXNet's _copy operator attributes to onnx's Identity operator
+    and return the created node.
+    """
+    return create_basic_op_node('Identity', node, kwargs)
+
+@mx_op.register("identity")
+def convert_identity(node, **kwargs):
+    """Map MXNet's identity operator attributes to onnx's Identity operator
+    and return the created node.
+    """
+    return create_basic_op_node('Identity', node, kwargs)
+
+@mx_op.register("InstanceNorm")
+def convert_instancenorm(node, **kwargs):
+    """Map MXNet's InstanceNorm operator attributes to onnx's InstanceNormalization operator
+    based on the input node's attributes and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    eps = float(attrs.get("eps", 0.001))
+
+    node = onnx.helper.make_node(
+        'InstanceNormalization',
+        inputs=input_nodes,
+        outputs=[name],
+        name=name,
+        epsilon=eps)
+
+    return [node]
+
+@mx_op.register("LeakyReLU")
+def convert_leakyrelu(node, **kwargs):
+    """Map MXNet's LeakyReLU operator attributes to onnx's Elu/LeakyRelu/PRelu operators
+    based on the input node's attributes and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    act_type = attrs.get("act_type", "leaky")
+    alpha = float(attrs.get("slope", 0.25))
+
+    act_name = {"elu": "Elu", "leaky": "LeakyRelu", "prelu": "PRelu",
+                "selu": "Selu"}
+
+    if act_type in ("prelu", "selu"):
+        node = onnx.helper.make_node(
+            act_name[act_type],
+            inputs=input_nodes,
+            outputs=[name],
+            name=name)
+    elif act_type in ('gelu',):
+        sqrt2 = np.float32(1.4142135623730951)
+        create_const_scalar_node(name+"_sqrt2", sqrt2, kwargs)
+        create_const_scalar_node(name+"_one", np.float32(1.0), kwargs)
+        create_const_scalar_node(name+"_half", np.float32(0.5), kwargs)
+        nodes = [
+            make_node("Div", [input_nodes[0], name+"_sqrt2"], [name+"_div0_out"]),
+            make_node("Erf", [name+"_div0_out"], [name+"_erf0_out"]),
+            make_node("Add", [name+"_erf0_out", name+"_one"], [name+"_add0_out"]),
+            make_node("Mul", [input_nodes[0], name+"_add0_out"], [name+"_mul0_out"]),
+            make_node("Mul", [name+"_mul0_out", name+"_half"], [name], name=name)
+        ]
+        return nodes
+    else:
+        node = onnx.helper.make_node(
+            act_name[act_type],
+            inputs=input_nodes,
+            outputs=[name],
+            name=name,
+            alpha=alpha)
+
+    return [node]
+
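+# The gelu branch above builds the exact (erf-based) GELU,
+# 0.5 * x * (1 + erf(x / sqrt(2))), from Div/Erf/Add/Mul primitives, since
+# ONNX has no Gelu operator at this opset.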
+
+@mx_op.register("softmax")
+def convert_softmax(node, **kwargs):
+    """Map MXNet's softmax operator attributes to onnx's Softmax operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    from onnx import TensorProto
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    axis = int(attrs.get("axis", -1))
+    temperature = str(attrs.get("temperature", 'None'))
+    if temperature == 'None':
+        temperature = 1.
+    else:
+        temperature = float(temperature)
+
+    use_length = str(attrs.get("use_length", 'None'))
+    use_length = use_length in ['1', 'True']
+    dtype = input_dtypes[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+    data = input_nodes[0]
+
+    # use opset 11 ONNX Softmax
+    if axis == -1 and temperature == 1.:
+        nodes = []
+        if use_length:
+            # magic number: close to the most negative representable fp16
+            # value (-65504), used as the mask fill so that masked positions
+            # contribute ~0 after the softmax
+            create_tensor([-65500.0], name+"_mask_val", kwargs["initializer"], dtype=dtype)
+            create_tensor([1], name+"_1", kwargs["initializer"])
+            create_const_scalar_node(name+"_0_s", np.int64(0), kwargs)
+            create_const_scalar_node(name+"_1_s", np.int64(1), kwargs)
+            nodes += [
+                make_node("Shape", [data], [name+"_shape"]),
+                make_node("Shape", [name+"_shape"], [name+"_dim"]),
+                make_node("Sub", [name+"_dim", name+"_1"], [name+"_dim_m1"]),
+                make_node("Slice", [name+"_shape", name+"_dim_m1", name+"_dim"],
+                          [name+"_dim_last_"]),
+                make_node("Squeeze", [name+"_dim_last_"], [name+"_dim_last"], axes=[0]),
+                make_node("Range", [name+"_0_s", name+"_dim_last", name+"_1_s"], [name+"_range"]),
+                make_node("Cast", [input_nodes[1]], [name+"_len"], to=int(TensorProto.INT64)),
+                make_node("Unsqueeze", [name+"_len"], [name+"_len_unsqueezed"], axes=[-1]),
+                make_node("Less", [name+"_range", name+"_len_unsqueezed"], [name+"_less"]),
+                make_node("Where", [name+'_less', data, name+"_mask_val"], [name+"_data_masked"])
+            ]
+            data = name+"_data_masked"
+
+        nodes += [
+            make_node("Softmax", [data], [name], axis=-1)
+        ]
+
+        return nodes
+
+    create_tensor([temperature], name+"_tmp", kwargs["initializer"], dtype=dtype)
+    nodes = [
+        make_node("Div", [data, name+"_tmp"], [name+'_data']),
+        make_node("Exp", [name+'_data'], [name+"_exp_out"]),
+        make_node("ReduceSum", [name+"_exp_out"], [name+"_rsum_out"], axes=[axis], keepdims=1),
+    ]
+    if len(input_nodes) == 1:
+        nodes += [
+            make_node("Div", [name+"_exp_out", name+"_rsum_out"], [name], name=name),
+        ]
+        return nodes
+    elif use_length:
+        length = input_nodes[1]
+
+        create_tensor([axis], name+"_axis", kwargs["initializer"])
+        create_tensor([0], name+"_0", kwargs["initializer"])
+        create_tensor([1], name+"_1", kwargs["initializer"])
+        create_const_scalar_node(name+'_-1_s', np.int64(-1), kwargs)
+        create_const_scalar_node(name+'_0_s', np.int64(0), kwargs)
+        create_const_scalar_node(name+'_1_s', np.int64(1), kwargs)
+        nodes += [
+            # cast data type
+            make_node("Cast", [length], [name+"_length"], to=int(TensorProto.INT64)),
+            make_node("Cast", [name+"_0"], [name+"_0_itype"], to=dtype_t),
+            make_node("Cast", [name+"_1"], [name+"_1_itype"], to=dtype_t),
+            # softmax output
+            make_node("Div", [name+"_exp_out", name+"_rsum_out"], [name+"_div1_out"]),
+            # update axis
+            make_node("Shape", [data], [name+"_shape0_out"]),
+            make_node("Shape", [name+"_shape0_out"], [name+"_in_dim"]),
+            make_node("Add", [name+"_in_dim", name+"_axis"], [name+"_dim+axis"]),
+            make_node("Less", [name+"_axis", name+"_0_s"], [name+"_less0_out"]),
+            make_node("Where", [name+"_less0_out", name+"_dim+axis", name+"_axis"], [name+"_final_axis"]),
+            # data mask
+            make_node("Add", [name+"_final_axis", name+"_1_s"], [name+"_final_axis+1"]),
+            make_node("Slice", [name+"_shape0_out", name+"_final_axis", name+"_final_axis+1"], [name+"_axis_dim"]),
+            make_node("Squeeze", [name+"_axis_dim"], [name+"_axis_dim_s"], axes=[0]),
+            make_node("Range", [name+"_0_s", name+"_axis_dim_s", name+"_1_s"], [name+"_range0_out"]),
+            # one hot for axis
+            make_node("Squeeze", [name+"_in_dim"], [name+"_in_dim_s"], axes=[0]),
+            make_node("Range", [name+"_0_s", name+"_in_dim_s", name+"_1_s"], [name+"_range1_out"]),
+            make_node("Equal", [name+"_range1_out", name+"_final_axis"], [name+"_equal_out"]),
+            make_node("Cast", [name+"_equal_out"], [name+"_one_hot"], to=int(TensorProto.INT64)),
+            # reshape data mask for less
+            make_node("Sub", [name+"_axis_dim_s", name+"_1_s"], [name+"_sub0_out"]),
+            make_node("Mul", [name+"_one_hot", name+"_sub0_out"], [name+"_mul0_out"]),
+            make_node("Add", [name+"_mul0_out", name+"_1_s"], [name+"_add0_out"]),
+            make_node('Reshape', [name+"_range0_out", name+"_add0_out"], [name+"_reshape0_out"]),
+            # reshape length for less
+            make_node("Mul", [name+"_one_hot", name+"_-1_s"], [name+"_mul1_out"]),
+            make_node("Add", [name+"_mul1_out", name+"_1_s"], [name+"_add1_out"]),
+            make_node("Sub", [name+"_shape0_out", name+"_1_s"], [name+"_sub1_out"]),
+            make_node("Mul", [name+"_add1_out", name+"_sub1_out"], [name+"_mul2_out"]),
+            make_node("Add", [name+"_mul2_out", name+"_1_s"], [name+"_add2_out"]),
+            make_node('Reshape', [name+"_length", name+"_add2_out"], [name+"_reshape1_out"]),
+            # mask output
+            make_node("Less", [name+"_reshape0_out", name+"_reshape1_out"], [name+"_less_out"]),
+            make_node("Cast", [name+"_less_out"], [name+"_mask"], to=dtype_t),
+            make_node("Mul", [name+"_div1_out", name+"_mask"], [name+"_mul3_out"]),
+            make_node("ReduceSum", [name+"_mul3_out"], [name+"_rsum1_out"], axes=[axis], keepdims=1),
+            make_node("Equal", [name+"_rsum1_out", name+"_0_itype"], [name+"_equal1_out"]),
+            make_node("Where", [name+"_equal1_out", name+"_1_itype", name+"_rsum1_out"], [name+"_where_out"]),
+            make_node("Div", [name+"_mul3_out", name+"_where_out"], [name], name=name)
+        ]
+        return nodes
+
+    else:
+        raise NotImplementedError("use_length must be true when both data and length are paased in.")
+
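+# In the use_length branches above, padded positions are masked out before the
+# softmax on the fast path (Where against a large negative fill) and after it
+# on the general path (multiply by a 0/1 mask built via Range/Less, then
+# renormalize), mirroring MXNet's length-masked softmax.
+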
+# SoftmaxOutput applies a cross-entropy loss only during training; at
+# inference time it is plain softmax, so it maps to ONNX Softmax (the
+# standalone mx.sym.softmax is handled by convert_softmax above) - hence the
+# name convert_softmax_output.
+@mx_op.register("SoftmaxOutput")
+def convert_softmax_output(node, **kwargs):
+    """Map MXNet's SoftmaxOutput operator attributes to onnx's Softmax operator
+    and return the created node.
+    """
+    name = node["name"]
+
+    input1 = kwargs["outputs_lookup"][node["inputs"][0][0]][node["inputs"][0][1]].name
+
+    softmax_node = onnx.helper.make_node(
+        "Softmax",
+        [input1],
+        [name],
+        axis=1,
+        name=name
+    )
+
+    return [softmax_node]
+
+@mx_op.register("LogisticRegressionOutput")
+def convert_logistic_regression_output(node, **kwargs):
+    """Map MXNet's SoftmaxOutput operator attributes to onnx's Softmax operator
+    and return the created node.
+    """
+    name = node["name"]
+    input1 = kwargs["outputs_lookup"][node["inputs"][0][0]][node["inputs"][0][1]].name
+
+    sigmoid_node = onnx.helper.make_node(
+        "Sigmoid",
+        [input1],
+        [name],
+        name=name
+    )
+    return [sigmoid_node]
+
+@mx_op.register("BlockGrad")
+def convert_blockgrad(node, **kwargs):
+    """ Skip operator  """
+    return create_basic_op_node('Identity', node, kwargs)
+
+@mx_op.register("MakeLoss")
+def convert_makeloss(node, **kwargs):
+    """ Skip operator  """
+    return create_basic_op_node('Identity', node, kwargs)
+
+@mx_op.register('Concat')
+@mx_op.register('_npi_concatenate')
+def convert_concat(node, **kwargs):
+    """Map MXNet's Concat operator attributes to onnx's Concat operator
+    and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    if 'dim' in attrs:
+        axis = int(attrs.get('dim', 1))
+    else:
+        axis = int(attrs.get('axis', 1))
+    concat_node = onnx.helper.make_node(
+        'Concat',
+        input_nodes,
+        [name],
+        axis=axis,
+        name=name
+    )
+    return [concat_node]
+
+
+@mx_op.register("transpose")
+@mx_op.register('_npi_transpose')
+def convert_transpose(node, **kwargs):
+    """Map MXNet's transpose operator attributes to onnx's Transpose operator
+    and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    axes = attrs.get("axes", ())
+    if axes == 'None':
+        axes = ()
+    if axes:
+        axes = tuple(map(int, re.findall(r'\d+', axes)))
+
+        transpose_node = onnx.helper.make_node(
+            "Transpose",
+            input_nodes,
+            [name],
+            perm=axes,
+            name=name
+        )
+    else:
+        transpose_node = onnx.helper.make_node(
+            "Transpose",
+            input_nodes,
+            [name],
+            name=name
+        )
+
+    return [transpose_node]
+
+
+@mx_op.register("LRN")
+def convert_lrn(node, **kwargs):
+    """Map MXNet's LRN operator attributes to onnx's LRN operator
+    and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    alpha = float(attrs.get("alpha", 0.0001))
+    beta = float(attrs.get("beta", 0.75))
+    bias = float(attrs.get("knorm", 1.0))
+    size = int(attrs.get("nsize"))
+
+    lrn_node = onnx.helper.make_node(
+        "LRN",
+        inputs=input_nodes,
+        outputs=[name],
+        name=name,
+        alpha=alpha,
+        beta=beta,
+        bias=bias,
+        size=size
+    )
+
+    return [lrn_node]
+
+
+@mx_op.register("L2Normalization")
+def convert_l2normalization(node, **kwargs):
+    """Map MXNet's L2Normalization operator attributes to onnx's LpNormalization operator
+    and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    mode = attrs.get("mode", "instance")
+
+    if mode != "channel":
+        raise AttributeError("L2Normalization: ONNX currently supports channel mode only")
+
+    l2norm_node = onnx.helper.make_node(
+        "LpNormalization",
+        input_nodes,
+        [name],
+        axis=1,  # channel only
+        name=name
+    )
+    return [l2norm_node]
+
+
+@mx_op.register("Dropout")
+def convert_dropout(node, **kwargs):
+    """Map MXNet's Dropout operator attributes to onnx's Dropout operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    _ = float(attrs.get("p", 0.5))
+    _ = convert_string_to_list(attrs.get("axes", "None"))
+    mode = attrs.get('mode', 'training')
+
+    if mode != 'training':
+        raise NotImplementedError("Dropout does not currently support mode!=\'training\'")
+
+    nodes = [
+        make_node('Identity', [input_nodes[0]], [name])
+    ]
+
+    return nodes
+
+
+@mx_op.register("Flatten")
+def convert_flatten(node, **kwargs):
+    """Map MXNet's Flatten operator attributes to onnx's Flatten operator
+    and return the created node.
+    """
+    return create_basic_op_node('Flatten', node, kwargs)
+
+@mx_op.register("clip")
+def convert_clip(node, **kwargs):
+    """Map MXNet's Clip operator attributes to onnx's Clip operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    opset_version = kwargs["opset_version"]
+
+    a_min = float(attrs.get('a_min', -np.inf))
+    a_max = float(attrs.get('a_max', np.inf))
+
+    if opset_version >= 11:
+        # opset >= 11 requires min/max to be inputs
+        input_dtype = get_input_dtypes(node, kwargs)[0]
+        create_const_scalar_node(name+"_min", np.float32(a_min).astype(input_dtype), kwargs)
+        create_const_scalar_node(name+"_max", np.float32(a_max).astype(input_dtype), kwargs)
+        nodes = [
+            make_node("Clip", [input_nodes[0], name+"_min", name+"_max"], [name], name=name)
+        ]
+    else:
+        nodes = [
+            make_node("Clip", input_nodes, [name], name=name, min=a_min, max=a_max)
+        ]
+    return nodes
+
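+# Example (illustration only): with opset >= 11, clip(x, 0, 6) exports as
+# Clip(x, name_min, name_max) with the bounds as scalar initializers cast to
+# the input dtype; with older opsets the bounds stay as min/max attributes.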
+
+def scalar_op_helper(node, op_name, reverse=False, **kwargs):
+    """Helper function for scalar arithmetic operations"""
+    from onnx import numpy_helper
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    dtype = input_dtypes[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+    scalar_value = float(attrs.get('scalar', '1'))
+    if str(dtype).startswith('int'):
+        # This irregular dtype promotion intentionally matches MXNet 2.0 behavior
+        is_int = attrs.get('is_int', '1')
+        if is_int in ['0', 'False']:
+            if op_name == 'Div':
+                dtype = np.dtype('float32')
+            else:
+                dtype = np.dtype('float64')
+            dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+        else:
+            scalar_value = int(scalar_value)
+    else:
+        if dtype == 'float16':
+            # when using float16, we must convert it to np.uint16 view first
+            scalar_value = np.float16(scalar_value).view(np.uint16)
+    scalar_value = [scalar_value]
+
+    initializer = kwargs["initializer"]
+    flag = True
+    # If the input is an initializer (a constant), fold the scalar op into it
+    # at export time and create a new initializer instead of a runtime node
+    for i in initializer:
+        if i.name == input_nodes[0]:
+            if op_name == 'Mul':
+                new_initializer = numpy_helper.to_array(i) * scalar_value[0]
+            elif op_name == 'Sub':
+                if reverse:
+                    new_initializer = scalar_value[0] - numpy_helper.to_array(i)
+                else:
+                    new_initializer = numpy_helper.to_array(i) - scalar_value[0]
+            elif op_name == 'Add':
+                new_initializer = numpy_helper.to_array(i) + scalar_value[0]
+            elif op_name == 'Div':
+                if reverse:
+                    new_initializer = scalar_value[0] / numpy_helper.to_array(i)
+                else:
+                    new_initializer = numpy_helper.to_array(i) / scalar_value[0]
+            elif op_name == 'Pow':
+                new_initializer = numpy_helper.to_array(i) ** scalar_value[0]
+            flag = False
+            break
+
+    # else create a new tensor of the scalar value, add it in initializer
+    if flag is True:
+        nodes = []
+        if input_dtypes[0] != dtype:
+            nodes += [
+                make_node('Cast', [input_nodes[0]], [name+'_cast'], to=dtype_t)
+            ]
+            input_nodes[0] = name+'_cast'
+
+        dims = np.shape(scalar_value)
+        scalar_op_name = "scalar_op" + str(kwargs["idx"])
+        initializer.append(
+            onnx.helper.make_tensor(
+                name=scalar_op_name,
+                data_type=dtype_t,
+                dims=dims,
+                vals=scalar_value,
+                raw=False,
+            )
+        )
+        # reverse op
+        if reverse:
+            nodes += [
+                make_node(op_name, [scalar_op_name, input_nodes[0]], [name])
+            ]
+        else:
+            nodes += [
+                make_node(op_name, [input_nodes[0], scalar_op_name], [name])
+            ]
+        return nodes, (dtype,)
+    else:
+        dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[new_initializer.dtype]
+        dims = np.shape(new_initializer)
+
+        tensor_node = onnx.helper.make_tensor_value_info(name, dtype_t, dims)
+
+        initializer.append(
+            onnx.helper.make_tensor(
+                name=name,
+                data_type=dtype_t,
+                dims=dims,
+                vals=new_initializer.flatten(),
+                raw=False,
+            )
+        )
+        return [tensor_node], (dtype,)
+
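+# Example (illustration only): _mul_scalar with scalar=2.0 on a graph input
+# emits a one-element 'scalar_op<idx>' initializer plus a Mul node; if the
+# input is itself an initializer (e.g. a weight), the operation is instead
+# folded at export time into a new initializer and no runtime node is created.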
+
+# Convert scalar value into node and pass it as input to mul_node
+@mx_op.register("_mul_scalar")
+@mx_op.register("_npi_multiply_scalar")
+def convert_mul_scalar(node, **kwargs):
+    """Map MXNet's _mul_scalar operator attributes to onnx's Mul operator.
+    Creates a new node for the input scalar value, adds it to the initializer
+    and return multiple created nodes.
+    """
+    return scalar_op_helper(node, 'Mul', **kwargs)
+
+
+# Convert scalar value into node and pass it as input to mul_node
+@mx_op.register("_minus_scalar")
+@mx_op.register("_npi_subtract_scalar")
+def convert_minus_scalar(node, **kwargs):
+    """Map MXNet's _minus_scalar operator attributes to onnx's Minus operator.
+    Creates a new node for the input scalar value, adds it to the initializer
+    and return multiple created nodes.
+    """
+    return scalar_op_helper(node, 'Sub', **kwargs)
+
+@mx_op.register("_rminus_scalar")
+@mx_op.register("_npi_rsubtract_scalar")
+def convert_rminus_scalar(node, **kwargs):
+    """Map MXNet's _rminus_scalar operator attributes to onnx's Sub operator.
+    Creates a new node for the input scalar value, adds it to the initializer
+    and return multiple created nodes.
+    """
+    return scalar_op_helper(node, 'Sub', reverse=True, **kwargs)
+
+# Convert scalar value into node and pass it as input to mul_node
+@mx_op.register("_plus_scalar")
+@mx_op.register("_npi_add_scalar")
+def convert_add_scalar(node, **kwargs):
+    """Map MXNet's _plus_scalar operator attributes to onnx's Add operator.
+    Creates a new node for the input scalar value, adds it to the initializer
+    and return multiple created nodes.
+    """
+    return scalar_op_helper(node, 'Add', **kwargs)
+
+# Convert scalar value into node and pass it as input to mul_node
+@mx_op.register("_div_scalar")
+@mx_op.register("_npi_true_divide_scalar")
+def convert_div_scalar(node, **kwargs):
+    """Map MXNet's _div_scalar operator attributes to onnx's Div operator.
+    Creates a new node for the input scalar value, adds it to the initializer
+    and return multiple created nodes.
+    """
+    return scalar_op_helper(node, 'Div', **kwargs)
+
+@mx_op.register("_rdiv_scalar")
+@mx_op.register("_npi_rtrue_divide_scalar")
+def convert_rdiv_scalar(node, **kwargs):
+    """Map MXNet's _rdiv_scalar operator attributes to onnx's Div operator.
+    Creates a new node for the input scalar value, adds it to the initializer
+    and return multiple created nodes.
+    """
+    return scalar_op_helper(node, 'Div', reverse=True, **kwargs)
+
+@mx_op.register("_power_scalar")
+@mx_op.register("_npi_power_scalar")
+def convert_pow_scalar(node, **kwargs):
+    """Map MXNet's _pow_scalar operator attributes to onnx's Pow operator.
+    Creates a new node for the input scalar value, adds it to the initializer
+    and return multiple created nodes.
+    """
+    return scalar_op_helper(node, 'Pow', **kwargs)
+
+# Sorting and Searching
+@mx_op.register("argmax")
+def convert_argmax(node, **kwargs):
+    """Map MXNet's argmax operator attributes to onnx's ArgMax operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    axis = str(attrs.get('axis', 'None'))
+    keepdims = get_boolean_attribute_value(attrs, 'keepdims')
+
+    input_dtype = get_input_dtypes(node, kwargs)[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[input_dtype]
+
+    if axis == 'None':
+        create_tensor([-1], name+'_-1', kwargs['initializer'])
+        if keepdims:
+            create_tensor([1], name+'_1', kwargs['initializer'])
+            nodes = [
+                make_node('Shape', [input_nodes[0]], [name+'_shape']),
+                make_node('Shape', [name+'_shape'], [name+'_dim']),
+                make_node('Tile', [name+'_1', name+'_dim'], [name+'_tile']),
+                make_node('Reshape', [input_nodes[0], name+'_-1'], [name+'_reshape']),
+                make_node('ArgMax', [name+'_reshape'], [name+'_argmax'], axis=0, keepdims=True,),
+                make_node('Reshape', [name+'_argmax', name+'_tile'], [name+'_ret']),
+                make_node('Cast', [name+'_ret'], [name], to=dtype_t, name=name)
+            ]
+        else:
+            nodes = [
+                make_node('Reshape', [input_nodes[0], name+'_-1'], [name+'_reshape']),
+                make_node('ArgMax', [name+'_reshape'], [name+'_argmax'], axis=0, keepdims=True,),
+                make_node('Cast', [name+'_argmax'], [name], to=dtype_t, name=name)
+            ]
+    else:
+        axis = int(axis)
+        nodes = [
+            make_node('ArgMax', [input_nodes[0]], [name+'_argmax'], axis=axis, keepdims=keepdims,),
+            make_node('Cast', [name+'_argmax'], [name], to=dtype_t, name=name)
+        ]
+    return nodes
+
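+# Example (illustration only): argmax with axis=None on a (2, 3) input is
+# flattened to (6,) and reduced with ArgMax over axis 0; with keepdims the
+# result is reshaped to (1, 1) by tiling [1] to the input rank. The final
+# Cast restores the input dtype, since MXNet's argmax keeps its input's dtype
+# while ONNX's ArgMax always yields int64. convert_argmin below is symmetric.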
+
+@mx_op.register("argmin")
+def convert_argmin(node, **kwargs):
+    """Map MXNet's argmin operator attributes to onnx's ArgMin operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    axis = str(attrs.get('axis', 'None'))
+    keepdims = get_boolean_attribute_value(attrs, 'keepdims')
+
+    input_dtype = get_input_dtypes(node, kwargs)[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[input_dtype]
+
+    if axis == 'None':
+        create_tensor([-1], name+'_-1', kwargs['initializer'])
+        if keepdims:
+            create_tensor([1], name+'_1', kwargs['initializer'])
+            nodes = [
+                make_node('Shape', [input_nodes[0]], [name+'_shape']),
+                make_node('Shape', [name+'_shape'], [name+'_dim']),
+                make_node('Tile', [name+'_1', name+'_dim'], [name+'_tile']),
+                make_node('Reshape', [input_nodes[0], name+'_-1'], [name+'_reshape']),
+                make_node('ArgMin', [name+'_reshape'], [name+'_argmin'], axis=0, keepdims=True,),
+                make_node('Reshape', [name+'_argmin', name+'_tile'], [name+'_ret']),
+                make_node('Cast', [name+'_ret'], [name], to=dtype_t, name=name)
+            ]
+        else:
+            nodes = [
+                make_node('Reshape', [input_nodes[0], name+'_-1'], [name+'_reshape']),
+                make_node('ArgMin', [name+'_reshape'], [name+'_argmin'], axis=0, keepdims=True,),
+                make_node('Cast', [name+'_argmin'], [name], to=dtype_t, name=name)
+            ]
+    else:
+        axis = int(axis)
+        nodes = [
+            make_node('ArgMin', [input_nodes[0]], [name+'_argmin'], axis=axis, keepdims=keepdims,),
+            make_node('Cast', [name+'_argmin'], [name], to=dtype_t, name=name)
+        ]
+    return nodes
+
+@mx_op.register("_maximum")
+def convert_maximum(node, **kwargs):
+    """Map MXNet's _maximum operator attributes to onnx's Max operator
+    and return the created node.
+    """
+    return create_basic_op_node('Max', node, kwargs)
+
+
+@mx_op.register("_minimum")
+def convert_minimum(node, **kwargs):
+    """Map MXNet's _minimum operator attributes to onnx's Min operator
+    and return the created node.
+    """
+    return create_basic_op_node('Min', node, kwargs)
+
+@mx_op.register("min")
+@mx_op.register("_npi_min")
+def convert_min(node, **kwargs):
+    """Map MXNet's min operator attributes to onnx's ReduceMin operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    mx_axis = str(attrs.get("axis", 'None'))
+    axes = convert_string_to_list(mx_axis) if mx_axis != 'None' else None
+
+    keepdims = get_boolean_attribute_value(attrs, "keepdims")
+
+    if axes is not None:
+        if keepdims:
+            node = make_node('ReduceMin', input_nodes, [name], axes=axes, keepdims=keepdims)
+            return [node]
+        else:
+            create_tensor([1], name+'_1', kwargs['initializer'])
+            nodes = [
+                make_node('ReduceMin', input_nodes, [name+'_reduce'], axes=axes, keepdims=keepdims),
+                make_node('Shape', [name+'_reduce'], [name+'_reduce_shape']),
+                make_node('Concat', [name+'_1', name+'_reduce_shape'], [name+'_concat'], axis=0),
+                make_node('Reshape', [name+'_reduce', name+'_concat'], [name+'_reshape']),
+                make_node('Squeeze', [name+'_reshape'], [name], axes=[0]),
+            ]
+            return nodes
+    else:
+        if keepdims:
+            node = make_node('ReduceMin', input_nodes, [name], keepdims=keepdims)
+            return [node]
+
+        else:
+            create_tensor([1], name+'_1', kwargs['initializer'])
+            nodes = [
+                make_node('ReduceMin', input_nodes, [name+'_rmin'], keepdims=keepdims),
+                make_node('Reshape', [name+'_rmin', name+'_1'], [name])
+            ]
+            return nodes
+
+
+@mx_op.register("max")
+@mx_op.register("_npi_max")
+def convert_max(node, **kwargs):
+    """Map MXNet's max operator attributes to onnx's ReduceMax operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    mx_axis = str(attrs.get("axis", 'None'))
+    axes = convert_string_to_list(mx_axis) if mx_axis != 'None' else None
+
+    keepdims = get_boolean_attribute_value(attrs, "keepdims")
+
+    if axes is not None:
+        if keepdims:
+            node = make_node('ReduceMax', input_nodes, [name], axes=axes, keepdims=keepdims)
+            return [node]
+        else:
+            create_tensor([1], name+'_1', kwargs['initializer'])
+            nodes = [
+                make_node('ReduceMax', input_nodes, [name+'_reduce'], axes=axes, keepdims=keepdims),
+                make_node('Shape', [name+'_reduce'], [name+'_reduce_shape']),
+                make_node('Concat', [name+'_1', name+'_reduce_shape'], [name+'_concat'], axis=0),
+                make_node('Reshape', [name+'_reduce', name+'_concat'], [name+'_reshape']),
+                make_node('Squeeze', [name+'_reshape'], [name], axes=[0]),
+            ]
+            return nodes
+    else:
+        if keepdims:
+            node = make_node('ReduceMax', input_nodes, [name], keepdims=keepdims)
+            return [node]
+
+        else:
+            create_tensor([1], name+'_1', kwargs['initializer'])
+            nodes = [
+                make_node('ReduceMax', input_nodes, [name+'_rmax'], keepdims=keepdims),
+                make_node('Reshape', [name+'_rmax', name+'_1'], [name])
+            ]
+            return nodes
+
+
+@mx_op.register("mean")
+def convert_mean(node, **kwargs):
+    """Map MXNet's mean operator attributes to onnx's ReduceMean operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    mx_axis = str(attrs.get("axis", 'None'))
+    axes = convert_string_to_list(mx_axis) if mx_axis != 'None' else None
+
+    keepdims = get_boolean_attribute_value(attrs, "keepdims")
+
+    if axes is not None:
+        if keepdims:
+            node = make_node('ReduceMean', input_nodes, [name], axes=axes, keepdims=keepdims)
+            return [node]
+        else:
+            create_tensor([1], name+'_1', kwargs['initializer'])
+            nodes = [
+                make_node('ReduceMean', input_nodes, [name+'_reduce'], axes=axes, keepdims=keepdims),
+                make_node('Shape', [name+'_reduce'], [name+'_reduce_shape']),
+                make_node('Concat', [name+'_1', name+'_reduce_shape'], [name+'_concat'], axis=0),
+                make_node('Reshape', [name+'_reduce', name+'_concat'], [name+'_reshape']),
+                make_node('Squeeze', [name+'_reshape'], [name], axes=[0]),
+            ]
+            return nodes
+    else:
+        if keepdims:
+            node = make_node('ReduceMean', input_nodes, [name], keepdims=keepdims)
+            return [node]
+
+        else:
+            create_tensor([1], name+'_1', kwargs['initializer'])
+            nodes = [
+                make_node('ReduceMean', input_nodes, [name+'_reduce'], keepdims=keepdims),
+                make_node('Reshape', [name+'_reduce', name+'_1'], [name])
+            ]
+            return nodes
+
+
+@mx_op.register("prod")
+@mx_op.register("_npi_prod")
+def convert_prod(node, **kwargs):
+    """Map MXNet's prod operator attributes to onnx's ReduceProd operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    mx_axis = str(attrs.get("axis", 'None'))
+    axes = convert_string_to_list(mx_axis) if mx_axis != 'None' else None
+
+    keepdims = get_boolean_attribute_value(attrs, "keepdims")
+
+    if axes is not None:
+        if keepdims:
+            node = make_node('ReduceProd', input_nodes, [name], axes=axes, keepdims=keepdims)
+            return [node]
+        else:
+            create_tensor([1], name+'_1', kwargs['initializer'])
+            nodes = [
+                make_node('ReduceProd', input_nodes, [name+'_reduce'], axes=axes, keepdims=keepdims),
+                make_node('Shape', [name+'_reduce'], [name+'_reduce_shape']),
+                make_node('Concat', [name+'_1', name+'_reduce_shape'], [name+'_concat'], axis=0),
+                make_node('Reshape', [name+'_reduce', name+'_concat'], [name+'_reshape']),
+                make_node('Squeeze', [name+'_reshape'], [name], axes=[0]),
+            ]
+            return nodes
+    else:
+        if keepdims:
+            node = make_node('ReduceProd', input_nodes, [name], keepdims=keepdims)
+            return [node]
+
+        else:
+            create_tensor([1], name+'_1', kwargs['initializer'])
+            nodes = [
+                make_node('ReduceProd', input_nodes, [name+'_reduce'], keepdims=keepdims),
+                make_node('Reshape', [name+'_reduce', name+'_1'], [name])
+            ]
+            return nodes
+
+
+# Arithmetic Operations
+@mx_op.register("elemwise_add")
+def convert_elementwise_add(node, **kwargs):
+    """Map MXNet's elemwise_add operator attributes to onnx's Add operator
+    and return the created node.
+    """
+    return create_basic_op_node('Add', node, kwargs)
+
+
+@mx_op.register("broadcast_add")
+@mx_op.register("_npi_add")
+def convert_broadcast_add(node, **kwargs):
+    """Map MXNet's broadcast_add operator attributes to onnx's Add operator
+    and return the created node.
+    """
+    return create_basic_op_node('Add', node, kwargs)
+
+
+@mx_op.register("elemwise_sub")
+@mx_op.register("_npi_subtract")
+def convert_elementwise_sub(node, **kwargs):
+    """Map MXNet's elemwise_sub operator attributes to onnx's Sub operator
+    and return the created node.
+    """
+    return create_basic_op_node('Sub', node, kwargs)
+
+@mx_op.register("broadcast_sub")
+def convert_broadcast_sub(node, **kwargs):
+    """Map MXNet's broadcast_sub operator attributes to onnx's Sub operator
+    and return the created node.
+    """
+    return create_basic_op_node('Sub', node, kwargs)
+
+@mx_op.register("elemwise_mul")
+@mx_op.register("_npi_multiply")
+def convert_elemwise_mul(node, **kwargs):
+    """Map MXNet's elemwise_mul operator attributes to onnx's Mul operator
+    and return the created node.
+    """
+    return create_basic_op_node('Mul', node, kwargs)
+
+@mx_op.register("broadcast_mul")
+def convert_broadcast_mul(node, **kwargs):
+    """Map MXNet's broadcast_mul operator attributes to onnx's Mul operator
+    and return the created node.
+    """
+    return create_basic_op_node('Mul', node, kwargs)
+
+@mx_op.register("broadcast_minimum")
+def convert_broadcast_min(node, **kwargs):
+    """Map MXNet's broadcast_minimum operator attributes to onnx's Min operator
+    and return the created node.
+    """
+    return create_basic_op_node('Min', node, kwargs)
+
+
+@mx_op.register("broadcast_maximum")
+def convert_broadcast_max(node, **kwargs):
+    """Map MXNet's broadcast_maximum operator attributes to onnx's Min operator
+    and return the created node.
+    """
+    return create_basic_op_node('Max', node, kwargs)
+
+
+@mx_op.register("elemwise_div")
+def convert_elemwise_div(node, **kwargs):
+    """Map MXNet's elemwise_div operator attributes to onnx's Div operator
+    and return the created node.
+    """
+    return create_basic_op_node('Div', node, kwargs)
+
+@mx_op.register("broadcast_div")
+def convert_broadcast_div(node, **kwargs):
+    """Map MXNet's broadcast_div operator attributes to onnx's Div operator
+    and return the created node.
+    """
+    return create_basic_op_node('Div', node, kwargs)
+
+@mx_op.register("negative")
+@mx_op.register("_npi_negative")
+def convert_negative(node, **kwargs):
+    """Map MXNet's negative operator attributes to onnx's Neg operator
+    and return the created node.
+    """
+    return create_basic_op_node('Neg', node, kwargs)
+
+@mx_op.register("abs")
+@mx_op.register("_npi_absolute")
+def convert_abs(node, **kwargs):
+    """Map MXNet's abs operator attributes to onnx's Abs operator
+    and return the created node.
+    """
+    return create_basic_op_node('Abs', node, kwargs)
+
+@mx_op.register("add_n")
+def convert_addn(node, **kwargs):
+    """Map MXNet's add_n operator attributes to onnx's Sum operator
+    and return the created node.
+    """
+    return create_basic_op_node('Sum', node, kwargs)
+
+# Rounding
+@mx_op.register("ceil")
+@mx_op.register("_npi_ceil")
+def convert_ceil(node, **kwargs):
+    """Map MXNet's ceil operator attributes to onnx's Ceil operator
+    and return the created node.
+    """
+    return create_basic_op_node('Ceil', node, kwargs)
+
+@mx_op.register("floor")
+@mx_op.register("_npi_floor")
+def convert_floor(node, **kwargs):
+    """Map MXNet's floor operator attributes to onnx's Floor operator
+    and return the created node.
+    """
+    return create_basic_op_node('Floor', node, kwargs)
+
+
+@mx_op.register("_npx_reshape")
+def convert_npx_reshape(node, **kwargs):
+    """ reshape
+    """
+    from onnx.helper import make_node
+
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    reverse = attrs.get('reverse', 'False')
+    targ_shape = convert_string_to_list(attrs['newshape'])
+
+    if reverse in ['True', '1']:
+        raise NotImplementedError('conversion of _npx_reshape with reverse==True is not '\
+                                  'implemented yet')
+
+    if any(x in [0, -2, -3, -4, -5, -6] for x in targ_shape):
+        raise NotImplementedError('conversion of _npx_reshape with 0, -2, -3, -4, -5, -6 is not '\
+                                  'implemented yet')
+
+    create_tensor(targ_shape, name+'_targ_shape', kwargs['initializer'])
+
+    nodes = []
+    nodes += [
+        make_node('Reshape', [input_nodes[0], name+'_targ_shape'], [name])
+    ]
+
+    return nodes
+
+
+# Legacy Reshape
+@mx_op.register("Reshape")
+def convert_reshape(node, **kwargs):
+    """Map MXNet's Reshape operator attributes to onnx's Reshape operator.
+    Converts output shape attribute to output shape tensor
+    and return multiple created nodes.
+    """
+    from onnx.helper import make_node
+
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    reverse = attrs.get('reverse', 'False')
+    targ_shape = convert_string_to_list(attrs["shape"])
+    # In general -2, -3, -4 in the target shape are not supported, but there are
+    # a few special cases that we can convert to supported scenarios
+
+    # If -2 and -3 are not used and there is no 0 to the right of -4, then we can just remove -4
+    if -4 in targ_shape and -3 not in targ_shape and -2 not in targ_shape and reverse != 'True':
+        if 0 not in targ_shape:
+            targ_shape = [i for i in targ_shape if i != -4]
+        else:
+            # index of first -4
+            ind_4 = targ_shape.index(-4)
+            # index of last 0
+            ind0 = len(targ_shape) - 1 - targ_shape[::-1].index(0)
+            if ind_4 > ind0:
+                targ_shape = [i for i in targ_shape if i != -4]
+
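+    # MXNet's -3 merges two adjacent dims, so for a 3-D input [-3, 0] equals
+    # [-1, 0] matched from the right (reverse=True): keep the trailing dim and
+    # infer the merged leading one.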
+    if targ_shape == [-3, 0] and reverse != 'True':
+        targ_shape = [-1, 0]
+        reverse = 'True'
+
+    special_case = False
+    if targ_shape == [0, 0, -3, -3] and reverse != 'True':
+        special_case = True
+        nodes = [
+            make_node('Shape', [input_nodes[0]], [name+'_shape']),
+            make_node('Split', [name+'_shape'], [name+'_dim0', name+'_dim1', name+'_dim2',
+                                                 name+'_dim3', name+'_dim4', name+'_dim5'],
+                      axis=0),
+            make_node('Mul', [name+'_dim2', name+'_dim3'], [name+'_mul_1']),
+            make_node('Mul', [name+'_dim4', name+'_dim5'], [name+'_mul_2']),
+            make_node('Concat', [name+'_dim0', name+'_dim1', name+'_mul_1', name+'_mul_2'],
+                      [name+'_shape_new'], axis=0),
+            make_node('Reshape', [input_nodes[0], name+'_shape_new'], [name], name=name)
+        ]
+
+    if targ_shape == [0, -4, -1, 4, 0, 0] and reverse != 'True':
+        special_case = True
+        create_tensor([4], name+'_4', kwargs['initializer'])
+        nodes = [
+            make_node('Shape', [input_nodes[0]], [name+'_shape']),
+            make_node('Split', [name+'_shape'], [name+'_dim0', name+'_dim1', name+'_dim2',
+                                                 name+'_dim3'], axis=0),
+            make_node('Div', [name+'_dim1', name+'_4'], [name+'_div']),
+            make_node('Concat', [name+'_dim0', name+'_div', name+'_4', name+'_dim2', name+'_dim3'],
+                      [name+'_shape_new'], axis=0),
+            make_node('Reshape', [input_nodes[0], name+'_shape_new'], [name], name=name)
+        ]
+
+    if targ_shape == [0, 0, -4, 2, 2, 0, 0] and reverse != 'True':
+        special_case = True
+        create_tensor([2], name+'_2', kwargs['initializer'])
+        nodes = [
+            make_node('Shape', [input_nodes[0]], [name+'_shape']),
+            make_node('Split', [name+'_shape'], [name+'_dim0', name+'_dim1', name+'_dim2',
+                                                 name+'_dim3', name+'_dim4'], axis=0),
+            make_node('Concat', [name+'_dim0', name+'_dim1', name+'_2', name+'_2',
+                                 name+'_dim3', name+'_dim4'], [name+'_shape_new'], axis=0),
+            make_node('Reshape', [input_nodes[0], name+'_shape_new'], [name], name=name)
+        ]
+
+    if targ_shape == [-4, 1, -1, 0, 0, 0] and reverse != 'True':
+        special_case = True
+        create_tensor([1], name+'_1', kwargs['initializer'])
+        create_tensor([-1], name+'_m1', kwargs['initializer'])
+        nodes = [
+            make_node('Shape', [input_nodes[0]], [name+'_shape']),
+            make_node('Split', [name+'_shape'], [name+'_dim0', name+'_dim1', name+'_dim2',
+                                                 name+'_dim3'], axis=0),
+            make_node('Concat', [name+'_1', name+'_m1', name+'_dim1', name+'_dim2', name+'_dim3'],
+                      [name+'_shape_new'], axis=0),
+            make_node('Reshape', [input_nodes[0], name+'_shape_new'], [name], name=name)
+        ]
+
+    if targ_shape == [-4, 1, 1000, 0, 0] and reverse != 'True':
+        special_case = True
+        create_tensor([1], name+'_1', kwargs['initializer'])
+        create_tensor([1000], name+'_1000', kwargs['initializer'])
+        nodes = [
+            make_node('Shape', [input_nodes[0]], [name+'_shape']),
+            make_node('Split', [name+'_shape'], [name+'_dim0', name+'_dim1', name+'_dim2'], axis=0),
+            make_node('Concat', [name+'_1', name+'_1000', name+'_dim1', name+'_dim2'],
+                      [name+'_shape_new'], axis=0),
+            make_node('Reshape', [input_nodes[0], name+'_shape_new'], [name], name=name)
+        ]
+
+    if targ_shape == [0, -4, 12, -1, 0] and reverse != 'True':
+        special_case = True
+        create_tensor([-1], name+'_m1', kwargs['initializer'])
+        create_tensor([12], name+'_12', kwargs['initializer'])
+        nodes = [
+            make_node('Shape', [input_nodes[0]], [name+'_shape']),
+            make_node('Split', [name+'_shape'], [name+'_dim0', name+'_dim1', name+'_dim2'], axis=0),
+            make_node('Concat', [name+'_dim0', name+'_12', name+'_m1', name+'_dim2'],
+                      [name+'_shape_new'], axis=0),
+            make_node('Reshape', [input_nodes[0], name+'_shape_new'], [name], name=name)
+        ]
+
+    if targ_shape == [0, -4, 16, -1, 0] and reverse != 'True':
+        special_case = True
+        create_tensor([-1], name+'_m1', kwargs['initializer'])
+        create_tensor([16], name+'_16', kwargs['initializer'])
+        nodes = [
+            make_node('Shape', [input_nodes[0]], [name+'_shape']),
+            make_node('Split', [name+'_shape'], [name+'_dim0', name+'_dim1', name+'_dim2'], axis=0),
+            make_node('Concat', [name+'_dim0', name+'_16', name+'_m1', name+'_dim2'],
+                      [name+'_shape_new'], axis=0),
+            make_node('Reshape', [input_nodes[0], name+'_shape_new'], [name], name=name)
+        ]
+
+    if targ_shape == [-3, -1] and reverse != 'True':
+        special_case = True
+        create_tensor([0], name+'_0', kwargs['initializer'])
+        create_tensor([1], name+'_1', kwargs['initializer'])
+        create_tensor([2], name+'_2', kwargs['initializer'])
+        create_tensor([-1], name+'_-1', kwargs['initializer'])
+        nodes = [
+            make_node('Shape', [input_nodes[0]], [name+'_shape']),
+            make_node('Slice', [name+'_shape', name+'_0',
+                                name+'_1'], [name+'_1st_dim']),
+            make_node('Slice', [name+'_shape', name+'_1',
+                                name+'_2'], [name+'_2nd_dim']),
+            make_node('Mul', [name+'_1st_dim', name+'_2nd_dim'], [name+'_mul']),
+            make_node('Concat', [name+'_mul', name+'_-1'], [name+'_shape_new'], axis=0),
+            make_node('Reshape', [input_nodes[0], name+'_shape_new'], [name], name=name),
+        ]
+
+    if special_case:
+        return nodes
+
+    not_supported_shape = [-2, -3, -4]
+    for val in targ_shape:
+        if val in not_supported_shape:
+            raise AttributeError("Reshape: Shape value not supported in ONNX", val)
+
+    create_tensor(targ_shape, name+'_targ_shape', kwargs['initializer'])
+
+    nodes = []
+    if reverse == 'False':
+        nodes += [
+            make_node('Reshape', [input_nodes[0], name+'_targ_shape'], [name], name=name)
+            ]
+    else:
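+        # reverse=True matches dims from the right: right-align the two shapes
+        # by left-padding the shorter with 1s, substitute the original dims for
+        # any 0s in the target, then slice off the padding before the Reshape.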
+        create_tensor([0], name+'_0', kwargs['initializer'])
+        create_tensor([1], name+'_1', kwargs['initializer'])
+        nodes += [
+            make_node('Shape', [name+'_targ_shape'], [name+'_targ_dim']),
+            make_node('Shape', [input_nodes[0]], [name+'_orig_shape']),
+            make_node('Shape', [name+'_orig_shape'], [name+'_orig_dim']),
+            make_node('Sub', [name+'_targ_dim', name+'_orig_dim'], [name+'_dim_diff']),
+            make_node('Abs', [name+'_dim_diff'], [name+'_pad_len']),
+            make_node('Less', [name+'_targ_dim', name+'_orig_dim'], [name+'_targ_less_orig']),
+            make_node('Less', [name+'_orig_dim', name+'_targ_dim'], [name+'_orig_less_targ']),
+            make_node('Where', [name+'_targ_less_orig', name+'_pad_len', name+'_0'],
+                      [name+'_targ_pad_len']),
+            make_node('Where', [name+'_orig_less_targ', name+'_pad_len', name+'_0'],
+                      [name+'_orig_pad_len']),
+            make_node('Concat', [name+'_targ_pad_len', name+'_0'], [name+'_targ_pads'], axis=0),
+            make_node('Concat', [name+'_orig_pad_len', name+'_0'], [name+'_orig_pads'], axis=0),
+            make_node('Pad', [name+'_targ_shape', name+'_targ_pads', name+'_1'],
+                      [name+'_targ_shape_padded'], mode='constant'),
+            make_node('Pad', [name+'_orig_shape', name+'_orig_pads', name+'_1'],
+                      [name+'_orig_shape_padded'], mode='constant'),
+            make_node('Equal', [name+'_targ_shape_padded', name+'_0'],
+                      [name+'_targ_shape_0_mask']),
+            make_node('Where', [name+'_targ_shape_0_mask', name+'_orig_shape_padded',
+                                name+'_targ_shape_padded'], [name+'_targ_shape_new']),
+            make_node('Shape', [name+'_targ_shape_new'], [name+'_targ_new_dim']),
+            make_node('Slice', [name+'_targ_shape_new', name+'_targ_pad_len',
+                                name+'_targ_new_dim'], [name+'_targ_shape_final']),
+            make_node('Reshape', [input_nodes[0], name+'_targ_shape_final'], [name], name=name)
+            ]
+
+    return nodes
+
+@mx_op.register("Cast")
+def convert_cast(node, **kwargs):
+    """Map MXNet's Cast operator attributes to onnx's Cast operator
+    and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    dtype = np.dtype(attrs.get('dtype'))
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+    nodes = [
+        onnx.helper.make_node("Cast", input_nodes, [name], to=dtype_t, name=name)
+    ]
+    return nodes, (dtype,)
+
+
+@mx_op.register("slice_axis")
+def convert_slice_axis(node, **kwargs):
+    """Map MXNet's slice_axis operator attributes to onnx's Slice operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    axis = int(attrs.get("axis"))
+    begin = int(attrs.get("begin"))
+    end = attrs.get("end", None)
+
+    nodes = []
+    create_tensor([axis], name+'_axis', kwargs["initializer"])
+    create_tensor([begin], name+'_begin', kwargs["initializer"])
+    if not end or end == 'None':
+        # ONNX doesn't support None for ends. Since end=None means slicing to
+        # the end of the dimension, pass the dimension size instead.
+        nodes += [
+            make_node('Shape', [input_nodes[0]], [name+"_data_shape"])
+        ]
+        # corner case when end = None and axis = -1
+        if axis == -1:
+            create_tensor([-1], name+'_-1', kwargs["initializer"])
+            nodes += [
+                make_node('Shape', [name+'_data_shape'], [name+'_data_dim']),
+                make_node('Add', [name+'_data_dim', name+'_-1'], [name+'_axis_max']),
+                make_node('Slice', [name+'_data_shape', name+'_axis_max', name+'_data_dim'], [name+'_end']),
+            ]
+        else:
+            create_tensor([axis+1], name+"_axis_plus_1", kwargs["initializer"])
+            nodes += [
+                make_node('Slice', [name+'_data_shape', name+'_axis', name+'_axis_plus_1'],
+                          [name+"_end"])
+            ]
+    else:
+        create_tensor([int(end)], name+'_end', kwargs["initializer"])
+
+    nodes += [
+        make_node('Slice', [input_nodes[0], name+'_begin', name+'_end', name+'_axis'],
+                  [name], name=name)
+        ]
+
+    return nodes
+
+
+@mx_op.register('SliceChannel')
+def convert_slice_channel(node, **kwargs):
+    """Map MXNet's SliceChannel operator attributes to onnx's Squeeze or Split
+    operator based on squeeze_axis attribute
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    num_outputs = int(attrs.get('num_outputs'))
+    axis = int(attrs.get('axis', 1))
+    squeeze_axis = attrs.get('squeeze_axis', 'False')
+
+    nodes = []
+    if squeeze_axis in ['True', '1']:
+        nodes += [
+            make_node('Split', [input_nodes[0]], [name+str(i)+'_' for i in range(num_outputs)],
+                      axis=axis)
+        ]
+        for i in range(num_outputs):
+            nodes += [
+                make_node('Squeeze', [name+str(i)+'_'], [name+str(i)], axes=[axis])
+            ]
+    else:
+        nodes += [
+            make_node('Split', [input_nodes[0]], [name+str(i) for i in range(num_outputs)],
+                      axis=axis)
+        ]
+
+    return nodes
+
+@mx_op.register("expand_dims")
+def convert_expand_dims(node, **kwargs):
+    """Map MXNet's expand_dims operator attributes to onnx's Unsqueeze operator
+    and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    axis = int(attrs.get("axis"))
+
+    node = onnx.helper.make_node(
+        "Unsqueeze",
+        input_nodes,
+        [name],
+        axes=[axis],
+        name=name,
+    )
+    return [node]
+
+@mx_op.register("squeeze")
+@mx_op.register("_npi_squeeze")
+def convert_squeeze(node, **kwargs):
+    """Map MXNet's squeeze operator attributes to onnx's squeeze operator
+    and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    mx_axis = str(attrs.get("axis", 'None'))
+    axes = convert_string_to_list(mx_axis) if mx_axis != 'None' else None
+
+    if not axes:
+        node = onnx.helper.make_node(
+            "Squeeze",
+            input_nodes,
+            [name],
+            name=name
+        )
+    else:
+        node = onnx.helper.make_node(
+            "Squeeze",
+            input_nodes,
+            [name],
+            axes=axes,
+            name=name,
+        )
+    return [node]
+
+
+@mx_op.register("log")
+@mx_op.register("_npi_log")
+def convert_log(node, **kwargs):
+    """Map MXNet's log operator attributes to onnx's Log operator
+    and return the created node.
+    """
+    return create_basic_op_node('Log', node, kwargs)
+
+@mx_op.register("reciprocal")
+@mx_op.register("_npi_reciprocal")
+def convert_reciprocal(node, **kwargs):
+    """Map MXNet's reciprocal operator attributes to onnx's Reciprocal operator
+    and return the created node.
+    """
+    return create_basic_op_node('Reciprocal', node, kwargs)
+
+@mx_op.register("_power")
+@mx_op.register("_npi_power")
+def convert_power(node, **kwargs):
+    """Map MXNet's _power operator attributes to onnx's Pow operator
+    and return the created node.
+    """
+    return create_basic_op_node('Pow', node, kwargs)
+
+@mx_op.register("broadcast_power")
+def convert_broadcast_power(node, **kwargs):
+    """Map MXNet's _power operator attributes to onnx's Pow operator
+    and return the created node.
+    """
+    return create_basic_op_node('Pow', node, kwargs)
+
+@mx_op.register("sqrt")
+@mx_op.register("_npi_sqrt")
+def convert_sqrt(node, **kwargs):
+    """Map MXNet's sqrt operator attributes to onnx's Sqrt operator
+    and return the created node.
+    """
+    return create_basic_op_node('Sqrt', node, kwargs)
+
+@mx_op.register("depth_to_space")
+def convert_depthtospace(node, **kwargs):
+    """Map MXNet's depth_to_space operator attributes to onnx's
+    DepthToSpace operator and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    blksize = int(attrs.get("block_size", 0))
+
+    node = onnx.helper.make_node(
+        "DepthToSpace",
+        input_nodes,
+        [name],
+        blocksize=blksize,
+        name=name,
+    )
+    return [node]
+
+@mx_op.register("space_to_depth")
+def convert_spacetodepth(node, **kwargs):
+    """Map MXNet's space_to_depth operator attributes to onnx's
+    SpaceToDepth operator and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    blksize = int(attrs.get("block_size", 0))
+
+    node = onnx.helper.make_node(
+        "SpaceToDepth",
+        input_nodes,
+        [name],
+        blocksize=blksize,
+        name=name,
+    )
+    return [node]
+
+@mx_op.register("square")
+@mx_op.register("_npi_square")
+def convert_square(node, **kwargs):
+    """Map MXNet's square operator attributes to onnx's Pow operator
+    and return the created node.
+    """
+    name, input_nodes, _ = get_inputs(node, kwargs)
+
+    initializer = kwargs["initializer"]
+    data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype('int64')]
+
+    power2_name = "square_tensor" + str(kwargs["idx"])
+    tensor_node = onnx.helper.make_tensor_value_info(power2_name, data_type, (1,))
+    initializer.append(
+        onnx.helper.make_tensor(
+            name=power2_name,
+            data_type=data_type,
+            dims=(1,),
+            vals=[2],
+            raw=False,
+        )
+    )
+
+    input_nodes.append(power2_name)
+
+    node = onnx.helper.make_node(
+        "Pow",
+        input_nodes,
+        [name],
+        name=name
+    )
+    return [tensor_node, node]
+
+# sum_axis is equivalent to sum in MXNet
+@mx_op.register("sum")
+@mx_op.register("sum_axis")
+@mx_op.register("_npi_sum")
+def convert_sum(node, **kwargs):
+    """Map MXNet's sum operator attributes to onnx's ReduceSum operator
+    and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    mx_axis = attrs.get("axis", None)
+    axes = convert_string_to_list(str(mx_axis)) if mx_axis is not None else None
+
+    keepdims = get_boolean_attribute_value(attrs, "keepdims")
+    if axes is not None and axes != [None]:
+        node = onnx.helper.make_node(
+            'ReduceSum',
+            inputs=input_nodes,
+            outputs=[name],
+            axes=axes,
+            keepdims=keepdims,
+            name=name
+        )
+    else:
+        node = onnx.helper.make_node(
+            'ReduceSum',
+            inputs=input_nodes,
+            outputs=[name],
+            keepdims=keepdims,
+            name=name
+        )
+    return [node]
+
+
+@mx_op.register("shape_array")
+def convert_shape(node, **kwargs):
+    """Map MXNet's shape_array operator attributes to onnx's Shape operator
+    and return the created node.
+    """
+    return create_basic_op_node('Shape', node, kwargs)
+
+
+@mx_op.register("hard_sigmoid")
+def convert_hardsigmoid(node, **kwargs):
+    """Map MXNet's hard_sigmoid operator attributes to onnx's HardSigmoid operator
+    and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    # Converting to float32
+    alpha = float(attrs.get("alpha", 0.2))
+    beta = float(attrs.get("beta", 0.5))
+
+    node = onnx.helper.make_node(
+        'HardSigmoid',
+        input_nodes,
+        [name],
+        alpha=alpha,
+        beta=beta,
+        name=name
+    )
+    return [node]
+
+@mx_op.register("broadcast_lesser")
+def convert_broadcast_lesser(node, **kwargs):
+    """Map MXNet's broadcast_lesser operator attributes to onnx's Less operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, _ = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    dtype = input_dtypes[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+
+    nodes = [
+        make_node('Less', [input_nodes[0], input_nodes[1]], [name+'_lt']),
+        make_node('Cast', [name+'_lt'], [name], to=dtype_t)
+    ]
+
+    return nodes
+
+
+@mx_op.register("broadcast_lesser_equal")
+def convert_broadcast_lesser_equal(node, **kwargs):
+    """Map MXNet's broadcast_lesser_equal operator
+    """
+    from onnx.helper import make_node
+    name, input_nodes, _ = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    dtype = input_dtypes[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+
+    nodes = [
+        make_node('LessOrEqual', [input_nodes[0], input_nodes[1]], [name+'_lt']),
+        make_node('Cast', [name+'_lt'], [name], to=dtype_t)
+    ]
+
+    return nodes
+
+
+@mx_op.register("broadcast_greater_equal")
+def convert_broadcast_greater_equal(node, **kwargs):
+    """Map MXNet's broadcast_greater_equal operator
+    """
+    from onnx.helper import make_node
+    name, input_nodes, _ = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    dtype = input_dtypes[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+
+    nodes = [
+        make_node('GreaterOrEqual', [input_nodes[0], input_nodes[1]], [name+'_gt']),
+        make_node('Cast', [name+'_gt'], [name], to=dtype_t)
+    ]
+
+    return nodes
+
+
+@mx_op.register("broadcast_greater")
+def convert_broadcast_greater(node, **kwargs):
+    """Map MXNet's broadcast_greater operator attributes to onnx's Greater operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, _ = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    dtype = input_dtypes[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+
+    nodes = [
+        make_node('Greater', [input_nodes[0], input_nodes[1]], [name+'_gt']),
+        make_node('Cast', [name+'_gt'], [name], to=dtype_t)
+    ]
+
+    return nodes
+
+
+@mx_op.register("broadcast_equal")
+def convert_broadcast_equal(node, **kwargs):
+    """Map MXNet's broadcast_equal operator attributes to onnx's Equal operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, _ = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    dtype = input_dtypes[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+
+    nodes = [
+        make_node("Equal", input_nodes, [name+"_equal"]),
+        make_node("Cast", [name+"_equal"], [name], name=name, to=int(dtype_t))
+    ]
+    return nodes
+
+
+@mx_op.register("broadcast_not_equal")
+def convert_broadcast_not_equal(node, **kwargs):
+    """Map MXNet's broadcast_not_equal operator attributes to onnx's Equal operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, _ = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    dtype = input_dtypes[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+
+    nodes = [
+        make_node("Equal", input_nodes, [name+"_equal"]),
+        make_node("Not", [name+"_equal"], [name+"_not"]),
+        make_node("Cast", [name+"_not"], [name], name=name, to=int(dtype_t))
+    ]
+    return nodes
+
+
+@mx_op.register("broadcast_logical_and")
+def convert_broadcast_logical_and(node, **kwargs):
+    """Map MXNet's broadcast logical and operator attributes to onnx's And operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    from onnx import TensorProto
+    name, input_nodes, _ = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+    dtype = input_dtypes[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+    nodes = [
+        make_node("Cast", [input_nodes[0]], [name+"_cast0"], to=int(TensorProto.BOOL)),
+        make_node("Cast", [input_nodes[1]], [name+"_cast1"], to=int(TensorProto.BOOL)),
+        make_node("And", [name+"_cast0", name+"_cast1"], [name+"_and"]),
+        make_node("Cast", [name+"_and"], [name], name=name, to=int(dtype_t))
+    ]
+    return nodes
+
+
+@mx_op.register("broadcast_logical_or")
+def convert_broadcast_logical_or(node, **kwargs):
+    """Map MXNet's broadcast logical or operator attributes to onnx's Or operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    from onnx import TensorProto
+    name, input_nodes, _ = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+    dtype = input_dtypes[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+    nodes = [
+        make_node("Cast", [input_nodes[0]], [name+"_cast0"], to=int(TensorProto.BOOL)),
+        make_node("Cast", [input_nodes[1]], [name+"_cast1"], to=int(TensorProto.BOOL)),
+        make_node("Or", [name+"_cast0", name+"_cast1"], [name+"_or"]),
+        make_node("Cast", [name+"_or"], [name], name=name, to=int(dtype_t))
+    ]
+    return nodes
+
+
+@mx_op.register("broadcast_logical_xor")
+def convert_broadcast_logical_xor(node, **kwargs):
+    """Map MXNet's broadcast logical xor operator attributes to onnx's Xor operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    from onnx import TensorProto
+    name, input_nodes, _ = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+    dtype = input_dtypes[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+    nodes = [
+        make_node("Cast", [input_nodes[0]], [name+"_cast0"], to=int(TensorProto.BOOL)),
+        make_node("Cast", [input_nodes[1]], [name+"_cast1"], to=int(TensorProto.BOOL)),
+        make_node("Xor", [name+"_cast0", name+"_cast1"], [name+"_xor"]),
+        make_node("Cast", [name+"_xor"], [name], name=name, to=int(dtype_t))
+    ]
+    return nodes
+
+
+@mx_op.register("logical_not")
+def convert_logical_not(node, **kwargs):
+    """Map MXNet's logical not operator attributes to onnx's Not operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    from onnx import TensorProto
+    name, input_nodes, _ = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+    dtype = input_dtypes[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+    nodes = [
+        make_node("Cast", [input_nodes[0]], [name+"_cast"], to=int(TensorProto.BOOL)),
+        make_node("Not", [name+"_cast"], [name+"_not"]),
+        make_node("Cast", [name+"_not"], [name], name=name, to=int(dtype_t))
+    ]
+    return nodes
+
+
+@mx_op.register("size_array")
+def convert_size(node, **kwargs):
+    """Map MXNet's size_array operator attributes to onnx's Size operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, _ = get_inputs(node, kwargs)
+
+    create_tensor([1], name+'_1', kwargs['initializer'])
+    nodes = [
+        make_node('Size', [input_nodes[0]], [name+'_size']),
+        make_node('Reshape', [name+'_size', name+'_1'], [name], name=name)
+    ]
+    return nodes
+
+
+@mx_op.register("log_softmax")
+def convert_logsoftmax(node, **kwargs):
+    """Map MXNet's log_softmax operator attributes to onnx's LogSoftMax operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    # Converting to int
+    axis = int(attrs.get("axis", -1))
+    temp = attrs.get('temperature', 'None')
+    use_length = attrs.get('use_length', 'False')
+
+    if temp != 'None':
+        raise AttributeError('LogSoftMax currently does not support temperature!=None')
+
+    if use_length in ['1', 'True']:
+        raise AttributeError('LogSoftMax currently does not support use_length==True')
+
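+    # computed as log(exp(x) / sum(exp(x))); note this skips the usual
+    # max-subtraction stabilization, so very large logits may overflow Exp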
+    nodes = [
+        make_node('Exp', [input_nodes[0]], [name+'_exp']),
+        make_node('ReduceSum', [name+'_exp'], [name+'_rsum'], axes=[axis], keepdims=1),
+        make_node('Div', [name+'_exp', name+'_rsum'], [name+'_div']),
+        make_node('Log', [name+'_div'], [name])
+    ]
+
+    return nodes
+
+@mx_op.register("norm")
+def convert_norm(node, **kwargs):
+    """Map MXNet's norm operator attributes to onnx's ReduceL1 and ReduceL2 operators
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    mx_axis = attrs.get("axis", None)
+    axes = convert_string_to_list(str(mx_axis)) if mx_axis else None
+
+    keepdims = get_boolean_attribute_value(attrs, "keepdims")
+    ord = int(attrs.get("ord", 2))
+
+    if ord not in [1, 2]:
+        raise AttributeError("norm export operator only supports ord=1 or ord=2.")
+
+    onnx_op_name = "ReduceL1" if ord == 1 else "ReduceL2"
+
+    if axes:
+        if keepdims:
+            reduce_node = make_node(onnx_op_name, input_nodes, [name], axes=axes, keepdims=keepdims)
+            return [reduce_node]
+        else:
+            create_tensor([1], name+'_1', kwargs['initializer'])
+            nodes = [
+                make_node(onnx_op_name, input_nodes, [name+'_norm'], axes=axes, keepdims=keepdims),
+                make_node('Shape', [name+'_norm'], [name+'_norm_shape']),
+                make_node('Concat', [name+'_1', name+'_norm_shape'], [name+'_concat'], axis=0),
+                make_node('Reshape', [name+'_norm', name+'_concat'], [name+'_reshape']),
+                make_node('Squeeze', [name+'_reshape'], [name], axes=[0]),
+            ]
+            return nodes
+    else:
+
+        if keepdims:
+            reduce_node = make_node(onnx_op_name, input_nodes, [name], keepdims=keepdims)
+            return [reduce_node]
+        else:
+            create_tensor([1], name+'_1', kwargs['initializer'])
+            nodes = [
+                make_node(onnx_op_name, input_nodes, [name+'_norm'], keepdims=keepdims),
+                make_node('Reshape', [name+'_norm', name+'_1'], [name])
+            ]
+            return nodes
+
+
+@mx_op.register("_sample_multinomial")
+def convert_multinomial(node, **kwargs):
+    """Map MXNet's multinomial operator attributes to onnx's
+    Multinomial operator and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    dtype = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(attrs.get("dtype", 'int32'))]
+    sample_size = convert_string_to_list(attrs.get("shape", '1'))
+    if len(sample_size) == 1:
+        sample_size = sample_size[-1]
+    else:
+        raise AttributeError("ONNX currently supports integer sample_size only")
+    node = onnx.helper.make_node(
+        "Multinomial",
+        input_nodes,
+        [name],
+        dtype=dtype,
+        sample_size=sample_size,
+        name=name,
+    )
+    return [node]
+
+
+@mx_op.register("_random_uniform")
+def convert_random_uniform(node, **kwargs):
+    """Map MXNet's random_uniform operator attributes to onnx's RandomUniform
+    operator and return the created node.
+    """
+    name, _, attrs = get_inputs(node, kwargs)
+
+    # Converting to float32
+    low = float(attrs.get("low", 0))
+    high = float(attrs.get("high", 1.0))
+    shape = convert_string_to_list(attrs.get('shape', '[]'))
+    dtype = np.dtype(attrs.get('dtype', 'float32'))
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+
+    node = onnx.helper.make_node(
+        'RandomUniform',
+        [],
+        [name],
+        low=low,
+        high=high,
+        dtype=dtype_t,
+        shape=shape,
+        name=name
+    )
+    return [node], (dtype,)
+
+
+@mx_op.register("_random_normal")
+def convert_random_normal(node, **kwargs):
+    """Map MXNet's random_normal operator attributes to onnx's RandomNormal
+    operator and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    # Converting to float32
+    mean = float(attrs.get("loc", 0))
+    scale = float(attrs.get("scale", 1.0))
+    shape = convert_string_to_list(attrs.get('shape', '[]'))
+    dtype = np.dtype(attrs.get('dtype', 'float32'))
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+
+    node = onnx.helper.make_node(
+        'RandomNormal',
+        input_nodes,
+        [name],
+        mean=mean,
+        scale=scale,
+        dtype=dtype_t,
+        shape=shape,
+        name=name
+    )
+    return [node], (dtype,)
+
+
+@mx_op.register("ROIPooling")
+def convert_roipooling(node, **kwargs):
+    """Map MXNet's ROIPooling operator attributes to onnx's MaxRoiPool
+    operator and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    pooled_shape = convert_string_to_list(attrs.get('pooled_size'))
+    scale = float(attrs.get("spatial_scale"))
+
+    node = onnx.helper.make_node(
+        'MaxRoiPool',
+        input_nodes,
+        [name],
+        pooled_shape=pooled_shape,
+        spatial_scale=scale,
+        name=name
+    )
+    return [node]
+
+
+@mx_op.register("tile")
+def convert_tile(node, **kwargs):
+    """Map MXNet's Tile operator attributes to onnx's Tile
+    operator and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    data = input_nodes[0]
+    reps = convert_string_to_list(attrs["reps"])
+
+    create_tensor([0], name+'_0', kwargs['initializer'])
+    create_tensor([1], name+'_1', kwargs['initializer'])
+    create_tensor(reps, name+'_reps', kwargs['initializer'], dtype='int64')
+    create_tensor([len(reps)], name+'_reps_len', kwargs['initializer'])
+
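+    # ONNX Tile wants one repeat per input dim: left-pad both the data shape
+    # and reps with 1s to a common rank before tiling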
+    nodes = [
+        make_node('Shape', [data], [name+'_data_shape']),
+        make_node('Shape', [name+'_data_shape'], [name+'_data_dim']),
+        make_node('Max', [name+'_data_dim', name+'_reps_len'], [name+'_max']),
+        make_node('Sub', [name+'_max', name+'_data_dim'], [name+'_data_diff']),
+        make_node('Concat', [name+'_data_diff', name+'_0'], [name+'_concat0_out'], axis=0),
+        make_node('Pad', [name+'_data_shape', name+'_concat0_out', name+'_1'], [name+'_data_shape_pad']),
+        make_node('Reshape', [data, name+'_data_shape_pad'], [name+'_data']),
+        make_node('Sub', [name+'_max', name+'_reps_len'], [name+'_reps_diff']),
+        make_node('Concat', [name+'_reps_diff', name+'_0'], [name+'_concat1_out'], axis=0),
+        make_node('Pad', [name+'_reps', name+'_concat1_out', name+'_1'], [name+'_reps_pad']),
+        make_node('Tile', [name+'_data', name+'_reps_pad'], [name], name=name),
+    ]
+
+    return nodes
+
+
+@mx_op.register("broadcast_to")
+@mx_op.register("_npi_broadcast_to")
+def convert_broadcast_to(node, **kwargs):
+    """Map MXNet's broadcast_to operator attributes to onnx's Expand
+    operator and return the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    shape_list = convert_string_to_list(attrs["shape"])
+
+    initializer = kwargs["initializer"]
+    output_shape_np = np.array(shape_list, dtype='int64')
+    data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[output_shape_np.dtype]
+    dims = np.shape(output_shape_np)
+
+    output_shape_name = "expand_attr_tensor" + str(kwargs["idx"])
+    tensor_node = onnx.helper.make_tensor_value_info(output_shape_name, data_type, dims)
+
+    initializer.append(
+        onnx.helper.make_tensor(
+            name=output_shape_name,
+            data_type=data_type,
+            dims=dims,
+            vals=shape_list,
+            raw=False,
+        )
+    )
+
+    input_nodes.append(output_shape_name)
+    expand_node = onnx.helper.make_node(
+        "Expand",
+        input_nodes,
+        [name],
+        name=name
+    )
+
+    return [tensor_node, expand_node]
+
+
+@mx_op.register('topk')
+def convert_topk(node, **kwargs):
+    """Map MXNet's topk operator attributes to onnx's TopK operator
+    and return the created node.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    opset_version = kwargs['opset_version']
+    if opset_version < 11:
+        raise AttributeError('ONNX opset 11 or greater is required to export this operator')
+
+    axis = int(attrs.get('axis', '-1'))
+    k = int(attrs.get('k', '1'))
+    ret_type = attrs.get('ret_typ', 'indices')
+    is_ascend = attrs.get('is_ascend', 'False')
+    is_ascend = is_ascend in ['1', 'True']
+    dtype = attrs.get('dtype', 'float32')
+
+    if ret_type == 'mask':
+        raise NotImplementedError('topk does not currently support ret_type==\'mask\'')
+
+    create_tensor([k], name+'_k', kwargs['initializer'])
+
+    nodes = []
+
+    if ret_type == 'both':
+        if dtype == 'int64':
+            nodes += [
+                make_node('TopK', [input_nodes[0], name+'_k'], [name+'0', name+'1'], axis=axis,
+                          largest=(not is_ascend), sorted=1),
+            ]
+        else:
+            nodes += [
+                make_node('TopK', [input_nodes[0], name+'_k'], [name+'0', name+'_1_i'], axis=axis,
+                          largest=(not is_ascend), sorted=1),
+                make_node('Cast', [name+'_1_i'], [name+'1'],
+                          to=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)])
+            ]
+    elif ret_type == 'value':
+        nodes += [
+            make_node('TopK', [input_nodes[0], name+'_k'], [name+'0', name+'_'], axis=axis,
+                      largest=(not is_ascend), sorted=1),
+        ]
+    else:
+        if dtype == 'int64':
+            nodes += [
+                make_node('TopK', [input_nodes[0], name+'_k'], [name+'_', name], axis=axis,
+                          largest=(not is_ascend), sorted=1),
+            ]
+        else:
+            nodes += [
+                make_node('TopK', [input_nodes[0], name+'_k'], [name+'__', name+'_tmp'], axis=axis,
+                          largest=(not is_ascend), sorted=1),
+                make_node('Cast', [name+'_tmp'], [name],
+                          to=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)])
+            ]
+
+    return nodes
+
+
+@mx_op.register("take")
+def convert_take(node, **kwargs):
+    """Map MXNet's Take operator attributes to onnx's Gather operator.
+    """
+    from onnx.helper import make_node
+    from onnx import TensorProto
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    axis = int(attrs.get('axis', 0))
+    mode = str(attrs.get('mode', 'clip'))
+
+    data = input_nodes[0]
+    indices = input_nodes[1]
+
+    nodes = [
+        make_node('Cast', [indices], [name+'_indices'], to=int(TensorProto.INT64)),
+    ]
+
+    if mode == 'raise':
+        nodes += [
+            make_node('Gather', [data, name+'_indices'], [name], axis=axis, name=name)
+        ]
+
+        return nodes
+
+    create_tensor([-1], name+'_-1', kwargs["initializer"])
+    nodes += [
+        make_node('Shape', [data], [name+'_data_shape']),
+    ]
+
+    # corner case
+    if axis == -1:
+        nodes += [
+            make_node('Shape', [name+'_data_shape'], [name+'_data_dim']),
+            make_node('Add', [name+'_data_dim', name+'_-1'], [name+'_axis_max']),
+            make_node('Slice', [name+'_data_shape', name+'_axis_max', name+'_data_dim'], [name+'_slice0_out']),
+        ]
+
+    else:
+        create_tensor([axis], name+'_axis', kwargs["initializer"])
+        create_tensor([axis+1], name+'_axis+1', kwargs["initializer"])
+        nodes += [
+            make_node('Slice', [name+'_data_shape', name+'_axis', name+'_axis+1'], [name+'_slice0_out']),
+        ]
+
+    if mode == 'clip':
+        create_tensor([0], name+'_0', kwargs["initializer"])
+        nodes += [
+            make_node('Add', [name+'_slice0_out', name+'_-1'], [name+'_max']),
+            make_node('Greater', [name+'_indices', name+'_max'], [name+'_max_mask']),
+            make_node('Where', [name+'_max_mask', name+'_max', name+'_indices'], [name+'_where0_out']),
+            make_node('Less', [name+'_indices', name+'_0'], [name+'_min_mask']),
+            make_node('Where', [name+'_min_mask', name+'_0', name+'_where0_out'], [name+'_where1_out']),
+            make_node('Gather', [data, name+'_where1_out'], [name], axis=axis, name=name)
+        ]
+
+    elif mode == 'wrap':
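+        # integer Mod (fmod=0) takes the sign of the divisor, so negative
+        # indices wrap back into range as MXNet's 'wrap' mode expects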
+        nodes += [
+            make_node('Mod', [name+'_indices', name+'_slice0_out'], [name+'_mod0_out']),
+            make_node('Gather', [data, name+'_mod0_out'], [name], axis=axis, name=name)
+        ]
+
+    else:
+        raise NotImplementedError("mode must be clip, wrap or raise.")
+
+    return nodes
+
+
+@mx_op.register("LayerNorm")
+def convert_layer_norm(node, **kwargs):
+    """Map MXNet's LayerNorm operator attributes to onnx operators.
+    """
+    from onnx.helper import make_node
+    from onnx import TensorProto
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    dtype = input_dtypes[0]
+
+    axes = int(attrs.get('axis', -1))
+    eps = attrs.get('eps', 9.99999975e-06)
+
+    create_tensor([axes], name+"_axes", kwargs["initializer"])
+    create_tensor([axes+1], name+"_axes+1", kwargs["initializer"])
+    create_const_scalar_node(name+'_0_s', np.int64(0), kwargs)
+    create_const_scalar_node(name+'_1_s', np.int64(1), kwargs)
+    create_const_scalar_node(name+"_2_s", np.int64(2).astype(dtype), kwargs)
+    create_const_scalar_node(name+"_eps", np.float32(eps), kwargs)
+
+    nodes = [
+        make_node("ReduceMean", [input_nodes[0]], [name+"_rm0_out"], axes=[axes]),
+        make_node("Sub", [input_nodes[0], name+"_rm0_out"], [name+"_sub0_out"]),
+        make_node("Pow", [name+"_sub0_out", name+"_2_s"], [name+"_pow0_out"]),
+        make_node("ReduceMean", [name+"_pow0_out"], [name+"_rm1_out"], axes=[axes]),
+        make_node("Add", [name+"_rm1_out", name+"_eps"], [name+"_add0_out"]),
+        make_node("Sqrt", [name+"_add0_out"], [name+"_sqrt0_out"]),
+        make_node("Div", [name+"_sub0_out", name+"_sqrt0_out"], [name+"_div0_out"]),
+    ]
+
+    if axes == -1:
+        nodes += [
+            make_node("Mul", [name+"_div0_out", input_nodes[1]], [name+"_mul0_out"]),
+            # make_node("Add", [name+"_mul0_out", input_nodes[2]], [name])
+            # the Add operator triggers a weird NaN issue in onnxruntime
+            # a workaround is to use Neg + Sub
+            make_node('Neg', [input_nodes[2]], [name+'_neg']),
+            make_node("Sub", [name+"_mul0_out", name+'_neg'], [name])
+        ]
+    else:
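+        # build a broadcast shape that is 1 everywhere except at `axes`, where
+        # it keeps the original dim, so gamma/beta can be reshaped and expanded
+        # to match the input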
+        nodes += [
+            make_node("Shape", [input_nodes[0]], [name+"_shape0_out"]),
+            make_node("Shape", [name+"_shape0_out"], [name+"_in_dim"]),
+            make_node("Squeeze", [name+"_in_dim"], [name+"_in_dim_s"], axes=[0]),
+            make_node("Range", [name+"_0_s", name+"_in_dim_s", name+"_1_s"], [name+"_range"]),
+            make_node("Equal", [name+"_range", name+"_axes"], [name+"_equal"]),
+            make_node("Cast", [name+"_equal"], [name+"_one_hot"], to=int(TensorProto.INT64)),
+            make_node("Slice", [name+"_shape0_out", name+"_axes", name+"_axes+1"], [name+"_slice_out"]),
+            make_node("Squeeze", [name+"_slice_out"], [name+"_slice_out_s"], axes=[0]),
+            make_node("Sub", [name+"_slice_out_s", name+"_1_s"], [name+"_sub1_out"]),
+            make_node("Mul", [name+"_one_hot", name+"_sub1_out"], [name+"_mul0_out"]),
+            make_node("Add", [name+"_mul0_out", name+"_1_s"], [name+"_add1_out"]),
+            make_node('Reshape', [input_nodes[1], name+"_add1_out"], [name+"gamma_exp"]),
+            make_node('Reshape', [input_nodes[2], name+"_add1_out"], [name+"beta_exp"]),
+            make_node('Expand', [name+"gamma_exp", name+"_shape0_out"], [name+"gamma_exp1"]),
+            make_node('Expand', [name+"beta_exp", name+"_shape0_out"], [name+"beta_exp1"]),
+            make_node("Mul", [name+"_div0_out", name+"gamma_exp1"], [name+"_mul1_out"]),
+            make_node("Add", [name+"_mul1_out", name+"beta_exp1"], [name], name=name)
+        ]
+
+    return nodes
+
+
+@mx_op.register("_contrib_interleaved_matmul_selfatt_qk")
+def convert_matmul_selfatt_qk(node, **kwargs):
+    """Map MXNet's _contrib_interleaved_matmul_selfatt_qk operator
+    """
+    from onnx.helper import make_node
+    from onnx import TensorProto
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    heads = int(attrs.get('heads'))
+
+    # a, b, c, d, e are seq_len, batch_size, num_heads, 3, head_dim respectively
+    create_tensor([0], name+"_0", kwargs["initializer"])
+    create_tensor([1], name+"_1", kwargs["initializer"])
+    create_tensor([1], name+"_1_f", kwargs["initializer"], dtype='float32')
+    create_tensor([2], name+"_2", kwargs["initializer"])
+    create_tensor([3], name+"_3", kwargs["initializer"])
+    create_tensor([heads], name+"_c", kwargs["initializer"])
+    create_tensor([3], name+"_d", kwargs["initializer"])
+    nodes = [
+        make_node('Shape', [input_nodes[0]], [name+"_data_shape"]),
+        make_node('Slice', [name+'_data_shape', name+'_0', name+'_1'], [name+"_a"]),
+        make_node('Slice', [name+'_data_shape', name+'_1', name+'_2'], [name+"_b"]),
+        make_node('Slice', [name+'_data_shape', name+'_2', name+'_3'], [name+"_cde"]),
+        make_node('Div', [name+'_cde', name+'_c'], [name+'_de']),
+        make_node('Div', [name+'_de', name+'_d'], [name+'_e']),
+        make_node('Cast', [name+'_e'], [name+'_e_f'], to=int(TensorProto.FLOAT)),
+        make_node('Sqrt', [name+'_e_f'], [name+'_sqrt_e']),
+        make_node('Div', [name+'_1_f', name+'_sqrt_e'], [name+'_1_over_sqrt_e']),
+        make_node('Mul', [name+'_b', name+'_c'], [name+'_bc']),
+
+        make_node("Concat", [name+'_a', name+'_b', name+'_c', name+'_d', name+'_e'], \
+            [name+'_shape0'], axis=0),
+        make_node("Concat", [name+'_0', name+'_0', name+'_0', name+'_0', name+'_0'], \
+            [name+'_slice_start0'], axis=0),
+        make_node("Concat", [name+'_a', name+'_b', name+'_c', name+'_1', name+'_e'], \
+            [name+'_slice_end0'], axis=0),
+        make_node("Concat", [name+'_a', name+'_b', name+'_c', name+'_e'], \
+            [name+'_shape1'], axis=0),
+        make_node("Concat", [name+'_bc', name+'_a', name+'_e'], \
+            [name+'_shape2'], axis=0),
+        make_node("Concat", [name+'_0', name+'_0', name+'_0', name+'_1', name+'_0'], \
+            [name+'_slice_start1'], axis=0),
+        make_node("Concat", [name+'_a', name+'_b', name+'_c', name+'_2', name+'_e'], \
+            [name+'_slice_end1'], axis=0),
+
+        make_node('Reshape', [input_nodes[0], name+'_shape0'], [name+'_reshape0_out']),
+        make_node('Slice', [name+'_reshape0_out', name+'_slice_start0', name+'_slice_end0'], \
+            [name+'_slice0_out']),
+        make_node('Reshape', [name+'_slice0_out', name+'_shape1'], [name+'_reshape1_out']),
+        make_node('Transpose', [name+'_reshape1_out'], [name+'_transpose0_out'], \
+            perm=(1, 2, 0, 3)),
+        make_node('Reshape', [name+'_transpose0_out', name+'_shape2'], [name+'_reshape2_out']),
+        make_node('Mul', [name+'_reshape2_out', name+'_1_over_sqrt_e'], [name+'_mul0_out']),
+        make_node('Slice', [name+'_reshape0_out', name+'_slice_start1', name+'_slice_end1'], \
+            [name+'_slice1_out']),
+        make_node('Reshape', [name+'_slice1_out', name+'_shape1'], [name+'_reshape3_out']),
+        make_node('Transpose', [name+'_reshape3_out'], [name+'_transpose1_out'], \
+            perm=(1, 2, 0, 3)),
+        make_node('Reshape', [name+'_transpose1_out', name+'_shape2'], [name+'_reshape4_out']),
+        make_node('Transpose', [name+'_reshape4_out'], [name+'_transpose2_out'], \
+            perm=(0, 2, 1)),
+        make_node('MatMul', [name+'_mul0_out', name+'_transpose2_out'], [name], name=name)
+    ]
+
+    return nodes
+
+@mx_op.register("_contrib_interleaved_matmul_selfatt_valatt")
+def convert_contrib_interleaved_matmul_selfatt_valatt(node, **kwargs):
+    """Map MXNet's _contrib_interleaved_matmul_selfatt_valatt operator attributes to onnx's operator.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    qkv = input_nodes[0]
+    att = input_nodes[1]
+    num_heads = int(attrs.get('heads'))
+
+    create_tensor([num_heads], name+"_const_num_heads", kwargs["initializer"])
+    create_tensor([0], name+"_const_0", kwargs["initializer"])
+    create_tensor([1], name+"_const_1", kwargs["initializer"])
+    create_tensor([2], name+"_const_2", kwargs["initializer"])
+    create_tensor([3], name+"_const_3", kwargs["initializer"])
+    create_tensor([4], name+"_const_4", kwargs["initializer"])
+    create_tensor([5], name+"_const_5", kwargs["initializer"])
+    create_tensor([0, 0, num_heads, 3, -1], name+"_reshape0_shape", kwargs["initializer"])
+    create_tensor([0, 0, 0, 2, 0], name+"_slice_start", kwargs["initializer"])
+    create_tensor([0, 0, 0, -1], name+"_reshape1_shape", kwargs["initializer"])
+    create_tensor([0, 0, -1], name+"_reshape4_shape", kwargs["initializer"])
+
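+    # extract V from the interleaved QKV tensor, apply the attention weights,
+    # and restore the (seq_len, batch, hidden) layout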
+    nodes = [
+        make_node("Shape", [qkv], [name+"_shape_qkv"]),
+        make_node("Slice", [name+"_shape_qkv", name+"_const_0", name+"_const_1"], [name+"_qkv_d0"]),
+        make_node("Slice", [name+"_shape_qkv", name+"_const_1", name+"_const_2"], [name+"_qkv_d1"]),
+        make_node("Slice", [name+"_shape_qkv", name+"_const_2", name+"_const_3"], [name+"_qkv_d2"]),
+        make_node('Mul', [name+"_qkv_d1", name+'_const_num_heads'], [name+'_mul_out']),
+        make_node("Reshape", [qkv, name+"_reshape0_shape"], [name+"_reshape0_output"]),
+        make_node("Shape", [name+"_reshape0_output"], [name+"_shape_reshape0"]),
+        make_node("Slice", [name+"_shape_reshape0", name+"_const_4", name+"_const_5"], [name+"_d4"]),
+        make_node("Concat", [name+"_mul_out", name+"_qkv_d0", name+"_d4"], [name+"_reshape2_shape"], axis=0),
+        make_node("Concat", [name+"_qkv_d1", name+"_const_num_heads", name+"_qkv_d0", name+"_d4"], \
+            [name+"_reshape3_shape"], axis=0),
+        make_node("Concat", [name+"_qkv_d0", name+"_qkv_d1", name+"_qkv_d2", name+"_const_3", name+"_d4"], \
+            [name+"_slice_end"], axis=0),
+        make_node("Slice", [name+"_reshape0_output", name+"_slice_start", name+"_slice_end"], [name+"_slice_output"]),
+        make_node("Reshape", [name+"_slice_output", name+"_reshape1_shape"], [name+"_reshape1_output"]),
+        make_node("Transpose", [name+"_reshape1_output"], [name+"_transpose0_output"], perm=[1, 2, 0, 3]),
+        make_node("Reshape", [name+"_transpose0_output", name+"_reshape2_shape"], [name+"_reshape2_output"]),
+        make_node("MatMul", [att, name+"_reshape2_output"], [name+"_matmul_output"]),
+        make_node("Reshape", [name+"_matmul_output", name+"_reshape3_shape"], [name+"_reshape3_output"]),
+        make_node("Transpose", [name+"_reshape3_output"], [name+"_transpose2_output"], perm=[2, 0, 1, 3]),
+        make_node("Reshape", [name+"_transpose2_output", name+"_reshape4_shape"], [name], name=name)
+    ]
+    return nodes
+
+
+@mx_op.register("broadcast_axis")
+def convert_broadcast_axis(node, **kwargs):
+    """Map MXNet's broadcast_axis
+    """
+    from onnx.helper import make_node
+    from onnx import TensorProto
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    axis = convert_string_to_list(attrs.get('axis', '()'))
+    size = convert_string_to_list(attrs.get('size', '()'))
+    assert len(axis) == len(size)
+
+    shape_name = name+'_shape_0'
+
+    create_tensor([0], name+'_0', kwargs["initializer"])
+    create_tensor([1], name+'_1', kwargs["initializer"])
+    create_const_scalar_node(name+'_0_s', np.int64(0), kwargs)
+    create_const_scalar_node(name+'_1_s', np.int64(1), kwargs)
+
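+    # each (axis, size) pair scales that entry of the input shape:
+    # new_shape = shape * ((size - 1) * one_hot(axis) + 1); the broadcast dim
+    # is 1 in the input, so Expand does the rest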
+    nodes = [
+        make_node('Shape', [input_nodes[0]], [shape_name]),
+        make_node('Shape', [shape_name], [name+'_in_dim']),
+        make_node('Squeeze', [name+'_in_dim'], [name+'_in_dim_s'], axes=[0]),
+        make_node('Range', [name+'_0_s', name+'_in_dim_s', name+'_1_s'], [name+'_range']),
+    ]
+
+    for i, ax in enumerate(axis):
+        if ax not in (0, 1):
+            create_tensor([ax], name+'_'+str(ax), kwargs["initializer"])
+        create_tensor([size[i]-1], name+'_size_'+str(i), kwargs["initializer"])
+        nodes += [
+            make_node('Equal', [name+'_range', name+'_'+str(ax)], [name+'_equal_'+str(i)]),
+            make_node('Cast', [name+'_equal_'+str(i)], [name+'_cast_'+str(i)], to=int(TensorProto.INT64)),
+            make_node('Mul', [name+'_size_'+str(i), name+'_cast_'+str(i)], [name+'_mul_'+str(i)]),
+            make_node('Add', [name+'_mul_'+str(i), name+'_1'], [name+'_add_'+str(i)]),
+            make_node('Mul', [name+'_add_'+str(i), shape_name], [name+'_shape_'+str(i+1)])
+        ]
+        shape_name = name+'_shape_'+str(i+1)
+
+    nodes += [
+        make_node('Expand', [input_nodes[0], shape_name], [name], name=name)
+    ]
+
+    return nodes
+
+
+@mx_op.register("SequenceMask")
+def convert_sequencemask(node, **kwargs):
+    """Map MXNet's SequenceMask operator
+    """
+    from onnx.helper import make_node
+    from onnx import TensorProto
+
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    use_sequence_length = attrs.get('use_sequence_length', 'False')
+    mask_val = float(attrs.get('value', '0'))
+    axis = int(attrs.get('axis', '0'))
+
+    if use_sequence_length == 'False':
+        return [make_node('Identity', [input_nodes[0]], [name], name=name)]
+
+    create_tensor([0], name+'_0', kwargs["initializer"])
+    create_tensor([1], name+'_1', kwargs["initializer"])
+    create_tensor([2], name+'_2', kwargs["initializer"])
+    create_const_scalar_node(name+'_0_s', np.int64(0), kwargs)
+    create_const_scalar_node(name+'_1_s', np.int64(1), kwargs)
+    create_const_scalar_node(name+'_2_s', np.int64(2), kwargs)
+    create_tensor([mask_val], name+'_mask_val', kwargs["initializer"], dtype='float32')
+
+    nodes = [
+        make_node('Shape', [input_nodes[0]], [name+'_in_shape']),
+        make_node('Slice', [name+'_in_shape', name+'_0', name+'_1'], [name+'_slice_0']),
+        make_node('Slice', [name+'_in_shape', name+'_1', name+'_2'], [name+'_slice_1']),
+        make_node('Concat', [name+'_slice_0', name+'_1'], [name+'_shape_0'], axis=0),
+        make_node('Shape', [name+'_in_shape'], [name+'_in_dim']),
+        make_node('Squeeze', [name+'_in_dim'], [name+'_in_dim_s'], axes=[0]),
+        make_node('Range', [name+'_0_s', name+'_in_dim_s', name+'_1_s'], [name+'_range_0']),
+        make_node('Less', [name+'_range_0', name+'_2'], [name+'_less_0']),
+        make_node('Where', [name+'_less_0', name+'_in_shape', name+'_1'], [name+'_shape_1'])
+    ]
+
+    if axis == 0:
+        nodes += [
+            make_node('Squeeze', [name+'_slice_0'], [name+'_max_len'], axes=[0]),
+            make_node('Range', [name+'_0_s', name+'_max_len', name+'_1_s'], [name+'_range_1']),
+            make_node('Reshape', [name+'_range_1', name+'_shape_0'], [name+"_reshape_0"]),
+            make_node('Cast', [input_nodes[1]], [name+'_cast'], to=int(TensorProto.INT64)),
+            make_node('Less', [name+'_reshape_0', name+'_cast'], [name+'_less_1']),
+            make_node('Reshape', [name+'_less_1', name+'_shape_1'], [name+"_reshape_1"]),
+            make_node('Where', [name+'_reshape_1', input_nodes[0], name+'_mask_val'], [name], name=name),
+        ]
+    else:
+        nodes += [
+            make_node('Squeeze', [name+'_slice_1'], [name+'_max_len'], axes=[0]),
+            make_node('Range', [name+'_0_s', name+'_max_len', name+'_1_s'], [name+'_range_1']),
+            make_node('Reshape', [input_nodes[1], name+'_shape_0'], [name+"_reshape_0"]),
+            make_node('Cast', [name+"_reshape_0"], [name+'_cast'], to=int(TensorProto.INT64)),
+            make_node('Less', [name+'_range_1', name+'_cast'], [name+'_less_1']),
+            make_node('Reshape', [name+'_less_1', name+'_shape_1'], [name+"_reshape_1"]),
+            make_node('Where', [name+'_reshape_1', input_nodes[0], name+'_mask_val'], [name], name=name),
+        ]
+    return nodes
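+
+# A minimal numpy sketch of the axis=0 branch above; `sequence_mask_axis0` is a
+# hypothetical illustration, not part of this module. Positions at or beyond
+# each batch element's sequence length are replaced by the mask value, which is
+# what the Range/Less/Where chain computes.
+import numpy as np
+
+def sequence_mask_axis0(data, seq_len, value=0.0):
+    steps = np.arange(data.shape[0]).reshape(-1, 1)           # Range + Reshape
+    keep = steps < seq_len.astype(np.int64)                   # Cast + Less
+    keep = keep.reshape(keep.shape + (1,) * (data.ndim - 2))  # Reshape to _shape_1
+    return np.where(keep, data, value)                        # Where
+
+x = np.arange(12, dtype=np.float32).reshape(3, 2, 2)
+masked = sequence_mask_axis0(x, np.array([1, 3]), value=-1.0)
+assert (masked[1:, 0] == -1.0).all() and (masked[:, 1] == x[:, 1]).all()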
+
+
+@mx_op.register("Embedding")
+def convert_embedding(node, **kwargs):
+    """Map MXNet's Embedding operator attributes to onnx's
+    Gather operator."""
+    from onnx.helper import make_node
+    from onnx import TensorProto
+
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    axis = int(attrs.get('axis', 0))
+    dtype = str(attrs.get('dtype', 'float32'))
+
+    nodes = [
+        make_node('Cast', [input_nodes[0]], [name+'_indices_casted'], to=int(TensorProto.INT64)),
+        make_node('Gather', [input_nodes[1], name+'_indices_casted'], [name], axis=axis, name=name)
+    ]
+
+    return nodes, (dtype, )
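+
+# Illustrative numpy check (not part of this module): MXNet Embedding is a pure
+# table lookup, i.e. ONNX Gather along axis 0 once the (typically float)
+# indices are cast to int64.
+import numpy as np
+
+weight = np.arange(12, dtype=np.float32).reshape(4, 3)    # (vocab, dim)
+indices = np.array([[0, 2], [3, 1]], dtype=np.float32)    # float indices
+out = np.take(weight, indices.astype(np.int64), axis=0)   # Cast + Gather
+assert out.shape == (2, 2, 3) and (out[0, 0] == weight[0]).all()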
+
+
+@mx_op.register("stack")
+@mx_op.register("_npi_stack")
+def convert_stack(node, **kwargs):
+    """Map MXNet's stack operator to onnx operators.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    axis = int(attrs.get('axis', 0))
+    nodes = []
+    for idx, input_node in enumerate(input_nodes):
+        nodes.append(onnx.helper.make_node(
+            "Unsqueeze",
+            inputs=[input_node],
+            outputs=[name+"_unsqueeze"+str(idx)],
+            axes=[axis]
+        ))
+
+    nodes.append(onnx.helper.make_node(
+        "Concat",
+        inputs=[name+"_unsqueeze"+str(i) for i in range(len(input_nodes))],
+        outputs=[name],
+        name=name,
+        axis=axis
+    ))
+    return nodes
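+
+# Minimal numpy analogue (illustrative only) of the Unsqueeze + Concat pattern
+# above: stacking n tensors along a new axis.
+import numpy as np
+
+a, b = np.zeros((2, 3)), np.ones((2, 3))
+out = np.concatenate([np.expand_dims(t, axis=1) for t in (a, b)], axis=1)
+assert (out == np.stack([a, b], axis=1)).all()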
+
+
+@mx_op.register("slice")
+def convert_slice(node, **kwargs):
+    """Map MXNet's slice operator to onnx Slice operator."""
+    from onnx.helper import make_node
+
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    starts = convert_string_to_list(attrs.get('begin'))
+    ends = convert_string_to_list(attrs.get('end'))
+    steps = convert_string_to_list(attrs.get('step', '[]'))
+
+    assert len(starts) == len(ends)
+    if len(steps) == 0 or (len(steps) == 1 and steps[0] is None):
+        steps = [1] * len(starts)
+    else:
+        assert len(steps) == len(starts)
+    steps = [1 if x is None else x for x in steps]
+    for i, s in enumerate(steps):
+        if s < 0:
+            raise NotImplementedError('slice operator does not support negative steps yet')
+        if starts[i] is None:
+            starts[i] = 0
+        if ends[i] is None:
+            ends[i] = 2**63-1
+
+    axes = list(range(len(starts)))
+
+    create_tensor(axes, name+'_axes', kwargs['initializer'])
+    create_tensor(starts, name+'_starts', kwargs['initializer'])
+    create_tensor(ends, name+'_ends', kwargs['initializer'])
+    create_tensor(steps, name+'_steps', kwargs['initializer'])
+
+    nodes = [
+        make_node("Slice", [input_nodes[0], name+'_starts', name+'_ends', name+'_axes',
+                            name+'_steps'], [name], name=name)
+    ]
+
+    return nodes
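+
+# Hedged numpy sketch (illustrative only) of how begin/end/step map onto the
+# five-input ONNX Slice: a None begin becomes 0, a None end becomes 2**63-1
+# (clamped by ONNX to the axis length), and a None step becomes 1.
+import numpy as np
+
+x = np.arange(24).reshape(4, 6)
+starts, ends, steps = [1, 0], [3, 2**63 - 1], [1, 2]
+out = x[tuple(slice(b, e, s) for b, e, s in zip(starts, ends, steps))]
+assert out.shape == (2, 3)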
+
+
+@mx_op.register("_zeros")
+@mx_op.register("_npi_zeros")
+def convert_zeros(node, **kwargs):
+    """Map MXNet's zeros operator attributes to onnx's ConstantOfShape operator.
+    """
+    from onnx.helper import make_node, make_tensor
+    name, _, attrs = get_inputs(node, kwargs)
+    dtype = attrs.get('dtype')
+    data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)]
+    shape = convert_string_to_list(attrs.get('shape'))
+    # replace 0 with 1
+    shape = [x if x else 1 for x in shape]
+    create_tensor(shape, name+'_shape', kwargs['initializer'])
+    tensor_value = make_tensor(name+'_zero', data_type, [1], [0])
+    nodes = [
+        make_node('ConstantOfShape', [name+'_shape'], [name], name=name, value=tensor_value)
+    ]
+    return nodes, (dtype,)
+
+
+@mx_op.register("_ones")
+@mx_op.register("_npi_ones")
+def convert_ones(node, **kwargs):
+    """Map MXNet's ones operator attributes to onnx's ConstantOfShape operator.
+    """
+    from onnx.helper import make_node, make_tensor
+    name, _, attrs = get_inputs(node, kwargs)
+    dtype = attrs.get('dtype')
+    data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)]
+    shape = convert_string_to_list(attrs.get('shape'))
+    # replace 0 with 1
+    shape = [x if x else 1 for x in shape]
+    create_tensor(shape, name+'_shape', kwargs['initializer'])
+    tensor_value = make_tensor(name+'_one', data_type, [1], [1])
+    nodes = [
+        make_node('ConstantOfShape', [name+'_shape'], [name], name=name, value=tensor_value)
+    ]
+    return nodes, (dtype,)
+
+
+@mx_op.register("zeros_like")
+def convert_zeros_like(node, **kwargs):
+    """Map MXNet's zeros_like operator attributes to onnx's ConstantOfShape operator.
+    """
+    from onnx.helper import make_node, make_tensor
+    name, input_nodes, _ = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+    dtype = np.dtype(input_dtypes[0])
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+
+    # create tensor with shape of input
+    tensor_value = make_tensor(name+"_zero", dtype_t, [1], [0])
+    nodes = [
+        make_node("Shape", [input_nodes[0]], [name+"_shape"]),
+        make_node("ConstantOfShape", [name+"_shape"], [name], name=name, value=tensor_value)
+    ]
+    return nodes
+
+
+@mx_op.register("ones_like")
+def convert_ones_like(node, **kwargs):
+    """Map MXNet's ones_like operator attributes to onnx's ConstantOfShape operator.
+    """
+    from onnx.helper import make_node, make_tensor
+    name, input_nodes, _ = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+    dtype = np.dtype(input_dtypes[0])
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+
+    # create tensor with shape of input
+    tensor_value = make_tensor(name+"_one", dtype_t, [1], [1])
+    nodes = [
+        make_node("Shape", [input_nodes[0]], [name+"_shape"]),
+        make_node("ConstantOfShape", [name+"_shape"], [name], name=name, value=tensor_value)
+    ]
+    return nodes
+
+
+@mx_op.register("_contrib_arange_like")
+def convert_arange_like(node, **kwargs):
+    """Map MXNet's arange_like operator attributes to onnx's Range and Reshape operators.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    opset_version = kwargs['opset_version']
+    if opset_version < 11:
+        raise AttributeError("ONNX opset 11 or greater is required to export this operator")
+
+    # use the same dtype as that of the input node
+    dtype = input_dtypes[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+    axis = attrs.get('axis', 'None')
+    start = attrs.get('start', 0.)
+    step = attrs.get('step', 1.)
+    repeat = int(attrs.get('repeat', 1))
+    if repeat != 1:
+        raise NotImplementedError("arange_like operator with repeat != 1 not yet implemented.")
+
+    create_const_scalar_node(name+"_start", np.dtype(dtype).type(start), kwargs)
+    create_const_scalar_node(name+"_step", np.dtype(dtype).type(step), kwargs)
+    create_const_scalar_node(name+"_half_step", np.dtype(dtype).type(float(step)*0.5), kwargs)
+
+    nodes = []
+    if axis == 'None':
+        # output will be same shape as input
+        nodes += [
+            make_node('Shape', [input_nodes[0]], [name+"_shape0_out"]),
+            make_node("ReduceProd", [name+"_shape0_out"], [name+"_redprod0_out"]),
+            make_node('Squeeze', [name+'_redprod0_out'], [name+'_reshape0_out'], axes=[0]),
+            make_node("Cast", [name+"_reshape0_out"], [name+"_cast0_out"], to=dtype_t),
+            make_node("Mul", [name+"_cast0_out", name+"_step"], [name+"_mul0_out"]),
+            make_node("Add", [name+"_mul0_out", name+"_start"], [name+"_add1_out"]),
+            make_node("Sub", [name+"_add1_out", name+"_half_step"], [name+"_sub0_out"]),
+            make_node("Range", [name+"_start", name+"_sub0_out", name+"_step"], [name+"_range0_out"]),
+            make_node("Reshape", [name+"_range0_out", name+"_shape0_out"], [name], name=name)
+        ]
+    else:
+        # determine shape of axis
+        create_tensor([int(axis)], name+"_axis_start", kwargs["initializer"], dtype='int64')
+        create_tensor([int(axis)+1], name+"_axis_end", kwargs["initializer"], dtype='int64')
+        nodes += [
+            make_node('Shape', [input_nodes[0]], [name+"_shape0_out"]),
+            make_node('Slice', [name+"_shape0_out", name+"_axis_start", name+"_axis_end"], [name+"_slice0_out"]),
+            make_node("ReduceProd", [name+"_slice0_out"], [name+"_reprod0_out"]),
+            make_node('Squeeze', [name+'_reprod0_out'], [name+'_reshape0_out'], axes=[0]),
+            make_node("Cast", [name+"_reshape0_out"], [name+"_cast0_out"], to=dtype_t),
+            make_node("Mul", [name+"_cast0_out", name+"_step"], [name+"_mul0_out"]),
+            make_node("Add", [name+"_mul0_out", name+"_start"], [name+"_add1_out"]),
+            make_node("Sub", [name+"_add1_out", name+"_half_step"], [name+"_sub0_out"]),
+            make_node("Range", [name+"_start", name+"_sub0_out", name+"_step"], [name], name=name)
+        ]
+
+    return nodes
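+
+# Numpy sketch (illustrative only) of the stop computation above: subtracting
+# half a step from start + n*step guards against floating-point rounding
+# yielding one element too many or too few in Range.
+import numpy as np
+
+start, step, n = 0.0, 0.3, 7               # n = element count from the shape
+stop = start + n * step - step * 0.5       # Mul + Add + Sub
+assert len(np.arange(start, stop, step)) == n   # Range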
+
+
+@mx_op.register("_contrib_BilinearResize2D")
+def convert_contrib_BilinearResize2D(node, **kwargs):
+    """Map MXNet's contrib_BilinearResize2D operator attributes to onnx.
+    """
+    from onnx.helper import make_node
+    from onnx import TensorProto
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    opset_version = kwargs['opset_version']
+    if opset_version < 11:
+        raise AttributeError("ONNX opset 11 or greater is required to export this operator")
+
+    height = int(attrs.get('height', 0))
+    width = int(attrs.get('width', 0))
+
+    scale_height = float(attrs.get('scale_height', 0))
+    scale_width = float(attrs.get('scale_width', 0))
+
+    if height * width == 0 and scale_height * scale_width == 0:
+        raise AttributeError('either height/width or scale_height/scale_width must be set')
+
+    mode = attrs.get('mode', 'size')
+    if mode != 'size':
+        raise NotImplementedError('contrib_BilinearResize2D with mode other than "size" is '
+                                  'not supported')
+
+    create_tensor([], name+'_roi', kwargs['initializer'], dtype='float32')
+    create_tensor([], name+'_scales_empty', kwargs['initializer'],
+                  dtype='float32')
+
+    nodes = []
+    if scale_height == 0:
+        create_tensor([0], name+'_0', kwargs['initializer'])
+        create_tensor([2], name+'_2', kwargs['initializer'])
+        create_tensor([height, width], name+'_h_w', kwargs['initializer'], dtype='int64')
+        nodes += [
+            make_node('Shape', [input_nodes[0]], [name+'_shape']),
+            make_node('Slice', [name+'_shape', name+'_0', name+'_2'], [name+'_shape_01']),
+            make_node('Concat', [name+'_shape_01', name+'_h_w'], [name+'_sizes'], axis=0),
+        ]
+    else:
+        create_tensor([1, 1, scale_height, scale_width], name+'_scales', kwargs['initializer'],
+                      dtype='float32')
+        nodes += [
+            make_node('Shape', [input_nodes[0]], [name+'_shape']),
+            make_node('Cast', [name+'_shape'], [name+'_shape_f'], to=int(TensorProto.FLOAT)),
+            make_node('Mul', [name+'_shape_f', name+'_scales'], [name+'_sizes_']),
+            make_node('Cast', [name+'_sizes_'], [name+'_sizes'], to=int(TensorProto.INT64)),
+        ]
+    nodes += [
+        make_node('Resize', [input_nodes[0], name+'_roi', name+'_scales_empty', name+'_sizes'], [name],
+                  mode='linear', coordinate_transformation_mode='align_corners', name=name)
+    ]
+
+    return nodes
+
+
+@mx_op.register("_arange")
+@mx_op.register("_npi_arange")
+def convert_arange(node, **kwargs):
+    """Map MXNet's arange operator attributes to onnx's Range operator.
+    """
+    from onnx.helper import make_node
+    name, _, attrs = get_inputs(node, kwargs)
+
+    opset_version = kwargs['opset_version']
+    if opset_version < 11:
+        raise AttributeError("ONNX opset 11 or greater is required to export this operator")
+
+    start = attrs.get('start', 0.)
+    stop = attrs.get('stop')
+    step = attrs.get('step', 1.)
+    dtype = attrs.get('dtype', 'float32')
+    repeat = int(attrs.get('repeat', 1))
+
+    if stop == 'None':
+        stop = start
+        start = 0
+
+    if repeat != 1:
+        raise NotImplementedError("arange operator with repeat != 1 not yet implemented.")
+
+    create_const_scalar_node(name+"_start", np.dtype(dtype).type(start), kwargs)
+    create_const_scalar_node(name+"_stop", np.dtype(dtype).type(stop), kwargs)
+    create_const_scalar_node(name+"_step", np.dtype(dtype).type(step), kwargs)
+
+    nodes = [
+        make_node("Range", [name+"_start", name+"_stop", name+"_step"], [name], name=name)
+    ]
+
+    return nodes, (dtype,)
+
+
+@mx_op.register("reverse")
+def convert_reverse(node, **kwargs):
+    """Map MXNet's reverse operator attributes to ONNX
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    axis = int(attrs.get('axis', 0))
+
+    # Transpose takes perm as a parameter, so we must 'pad' the input to a known dim (8 here)
+    perm = list(range(8))
+    perm[0], perm[axis] = axis, 0
+
+    create_tensor([8], name+'_8', kwargs['initializer'])
+    create_tensor([0], name+'_0', kwargs['initializer'])
+    create_tensor([1], name+'_1', kwargs['initializer'])
+    create_tensor([-1], name+'_m1', kwargs['initializer'])
+    create_tensor([axis], name+'_axis', kwargs['initializer'])
+    create_tensor([axis+1], name+'_axis_p1', kwargs['initializer'])
+    create_const_scalar_node(name+'_m1_s', np.int64(-1), kwargs)
+
+    nodes = [
+        make_node('Shape', [input_nodes[0]], [name+'_shape']),
+        make_node('Shape', [name+'_shape'], [name+'_dim']),
+        make_node('Sub', [name+'_8', name+'_dim'], [name+'_sub']),
+        make_node('Concat', [name+'_0', name+'_sub'], [name+'_concat'], axis=0),
+        make_node('Pad', [name+'_shape', name+'_concat', name+'_1'], [name+'_shape_8_dim']),
+        make_node('Reshape', [input_nodes[0], name+'_shape_8_dim'], [name+'_data_8_dim']),
+        make_node('Transpose', [name+'_data_8_dim'], [name+'_data_t'], perm=perm),
+        make_node('Slice', [name+'_shape', name+'_axis', name+'_axis_p1'], [name+'_axis_len']),
+        make_node('Sub', [name+'_axis_len', name+'_1'], [name+'_axis_len_m1']),
+        make_node('Squeeze', [name+'_axis_len_m1'], [name+'_axis_len_m1_s'], axes=[0]),
+        make_node('Range', [name+'_axis_len_m1_s', name+'_m1_s', name+'_m1_s'], [name+'_indices']),
+        make_node('Gather', [name+'_data_t', name+'_indices'], [name+'_gather']),
+        make_node('Transpose', [name+'_gather'], [name+'_data_reversed'], perm=perm),
+        make_node('Reshape', [name+'_data_reversed', name+'_shape'], [name], name=name)
+    ]
+
+    return nodes
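+
+# Numpy sketch (illustrative only) of the Range/Gather trick above: reversing
+# along an axis is a gather of indices n-1, n-2, ..., 0 on that axis. The
+# pad-to-8-dims Transpose merely moves the target axis to the front first.
+import numpy as np
+
+x, axis = np.arange(6).reshape(2, 3), 1
+idx = np.arange(x.shape[axis] - 1, -1, -1)   # Range(len-1, -1, -1)
+assert (np.take(x, idx, axis=axis) == x[:, ::-1]).all()   # Gather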
+
+
+@mx_op.register('repeat')
+def convert_repeat(node, **kwargs):
+    """Map MXNet's repeat operator attributes to onnx's Tile operator.
+    """
+    from onnx.helper import make_node
+    from onnx import TensorProto
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    opset_version = kwargs['opset_version']
+    if opset_version < 11:
+        raise AttributeError('ONNX opset 11 or greater is required to export this operator')
+
+    repeats = int(attrs.get('repeats', 1))
+    axis = attrs.get('axis', 'None')
+
+    if repeats <= 0:
+        raise NotImplementedError('repeat operator does not support parameter repeats <= 0')
+
+    nodes = []
+    if axis == 'None':
+        create_tensor([repeats], name+'_rep', kwargs['initializer'])
+        create_tensor([1, repeats], name+'_repeats', kwargs['initializer'])
+        nodes += [
+            make_node('Shape', [input_nodes[0]], [name+'_shape']),
+            make_node('ReduceProd', [name+'_shape'], [name+'_size']),
+            make_node('Reshape', [input_nodes[0], name+'_size'], [name+'_flat']),
+            make_node('Unsqueeze', [name+'_flat'], [name+'_unsqueeze'], axes=[-1]),
+            make_node('Tile', [name+'_unsqueeze', name+'_repeats'], [name+'_tile']),
+            make_node('Mul', [name+'_size', name+'_rep'], [name+'_new_size']),
+            make_node('Reshape', [name+'_tile', name+'_new_size'], [name], name=name)
+        ]
+    else:
+        axis = int(axis)
+        repeats -= 1
+        create_tensor([repeats], name+'_repeats', kwargs['initializer'])
+        create_tensor([1], name+'_1', kwargs['initializer'])
+        create_tensor([0], name+'_0', kwargs['initializer'])
+        create_tensor([axis], name+'_axis', kwargs['initializer'])
+        create_const_scalar_node(name+"_0_s", np.int64(0), kwargs)
+        create_const_scalar_node(name+"_1_s", np.int64(1), kwargs)
+        nodes += [
+            make_node('Shape', [input_nodes[0]], [name+'_shape']),
+            make_node('Shape', [name+'_shape'], [name+'_dim']),
+            make_node('Squeeze', [name+'_dim'], [name+'_dim_s'], axes=[0]),
+            make_node('Range', [name+'_0_s', name+'_dim_s', name+'_1_s'], [name+'_range'])
+        ]
+        if axis < 0:
+            nodes += [
+                make_node('Add', [name+'_axis', name+'_dim'], [name+'_true_axis']),
+                make_node('Equal', [name+'_range', name+'_true_axis'], [name+'_one_hot'])
+                ]
+        else:
+            nodes += [
+                make_node('Equal', [name+'_range', name+'_axis'], [name+'_one_hot'])
+                ]
+        nodes += [
+            make_node('Cast', [name+'_one_hot'], [name+'_one_hot_int'], to=int(TensorProto.INT64)),
+            make_node('Mul', [name+'_repeats', name+'_one_hot_int'], [name+'_mul']),
+            make_node('Add', [name+'_mul', name+'_1'], [name+'_add']),
+            make_node('Concat', [name+'_1', name+'_add'], [name+'_repeats_tensor'], axis=0)
+            ]
+        if axis == -1:
+            nodes += [
+                make_node('Concat', [name+'_shape', name+'_1'], [name+'_unsqueeze_shape'], axis=0),
+                make_node('Reshape', [input_nodes[0], name+'_unsqueeze_shape'],
+                          [name+'_unsqueeze'])
+                ]
+        else:
+            nodes += [
+                make_node('Unsqueeze', [input_nodes[0]], [name+'_unsqueeze'], axes=[axis+1])
+                ]
+        nodes += [
+            make_node('Tile', [name+'_unsqueeze', name+'_repeats_tensor'], [name+'_tile']),
+            make_node('Mul', [name+'_shape', name+'_add'], [name+'_new_shape']),
+            make_node('Reshape', [name+'_tile', name+'_new_shape'], [name], name=name)
+            ]
+
+    return nodes
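+
+# Minimal numpy analogue (illustrative only) of the axis branch above: repeat
+# along an axis == unsqueeze one axis to its right, tile that axis, reshape back.
+import numpy as np
+
+x, repeats, axis = np.arange(6).reshape(2, 3), 2, 1
+tiled = np.tile(np.expand_dims(x, axis + 1), [1, 1, repeats])   # Unsqueeze + Tile
+out = tiled.reshape(2, 3 * repeats)                             # Reshape
+assert (out == np.repeat(x, repeats, axis=axis)).all()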
+
+
+@mx_op.register('_contrib_box_nms')
+def convert_contrib_box_nms(node, **kwargs):
+    """Map MXNet's _contrib_box_nms operator to ONNX
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    dtype = input_dtypes[0]
+    #dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+
+    opset_version = kwargs['opset_version']
+    if opset_version < 11:
+        raise AttributeError('ONNX opset 11 or greater is required to export this operator')
+
+    overlap_thresh = float(attrs.get('overlap_thresh', '0.5'))
+    valid_thresh = float(attrs.get('valid_thresh', '0'))
+    topk = int(attrs.get('topk', '-1'))
+    coord_start = int(attrs.get('coord_start', '2'))
+    score_index = int(attrs.get('score_index', '1'))
+    id_index = int(attrs.get('id_index', '-1'))
+    force_suppress = attrs.get('force_suppress', 'True')
+    background_id = int(attrs.get('background_id', '-1'))
+    in_format = attrs.get('in_format', 'corner')
+    out_format = attrs.get('out_format', 'corner')
+
+    center_point_box = 0 if in_format == 'corner' else 1
+
+    if topk == -1:
+        topk = 2**31-1
+
+    if in_format != out_format:
+        raise NotImplementedError('box_nms does not currently support in_format != out_format')
+
+    if background_id != -1:
+        raise NotImplementedError('box_nms does not currently support background_id != -1')
+
+    if id_index != -1 or force_suppress == 'False':
+        logging.warning('box_nms: id_index != -1 and/or force_suppress == False detected. '
+                        'However, due to ONNX limitations, boxes of different categories will NOT '
+                        'be exempted from suppression. This might lead to different behavior than '
+                        'native MXNet')
+
+    create_tensor([coord_start], name+'_cs', kwargs['initializer'])
+    create_tensor([coord_start+4], name+'_cs_p4', kwargs['initializer'])
+    create_tensor([score_index], name+'_si', kwargs['initializer'])
+    create_tensor([score_index+1], name+'_si_p1', kwargs['initializer'])
+    create_tensor([topk], name+'_topk', kwargs['initializer'])
+    create_tensor([overlap_thresh], name+'_ot', kwargs['initializer'], dtype=np.float32)
+    create_tensor([valid_thresh], name+'_vt', kwargs['initializer'], dtype=np.float32)
+    create_tensor([-1], name+'_m1', kwargs['initializer'])
+    create_tensor([-1], name+'_m1_f', kwargs['initializer'], dtype=dtype)
+    create_tensor([0], name+'_0', kwargs['initializer'])
+    create_tensor([1], name+'_1', kwargs['initializer'])
+    create_tensor([2], name+'_2', kwargs['initializer'])
+    create_tensor([3], name+'_3', kwargs['initializer'])
+    create_tensor([0, 1, -1], name+'_scores_shape', kwargs['initializer'])
+    create_tensor([0, 0, 1, 0], name+'_pad', kwargs['initializer'])
+    create_tensor([0, -1], name+'_bat_spat_helper', kwargs['initializer'])
+    create_const_scalar_node(name+"_0_s", np.int64(0), kwargs)
+    create_const_scalar_node(name+"_1_s", np.int64(1), kwargs)
+
+    nodes = [
+        make_node('Shape', [input_nodes[0]], [name+'_shape']),
+        make_node('Shape', [name+'_shape'], [name+'_dim']),
+        make_node('Sub', [name+'_dim', name+'_2'], [name+'_dim_m2']),
+        make_node('Slice', [name+'_shape', name+'_dim_m2', name+'_dim'], [name+'_shape_last2']),
+        make_node('Concat', [name+'_m1', name+'_shape_last2'], [name+'_shape_3d'], axis=0),
+        make_node('Reshape', [input_nodes[0], name+'_shape_3d'], [name+'_data_3d']),
+        make_node('Slice', [name+'_data_3d', name+'_cs', name+'_cs_p4', name+'_m1'],
+                  [name+'_boxes']),
+        make_node('Slice', [name+'_data_3d', name+'_si', name+'_si_p1', name+'_m1'],
+                  [name+'_scores_raw']),
+        make_node('Reshape', [name+'_scores_raw', name+'_scores_shape'], [name+'_scores']),
+        make_node('Shape', [name+'_scores'], [name+'_scores_shape_actual']),
+        make_node('NonMaxSuppression',
+                  [name+'_boxes', name+'_scores', name+'_topk', name+'_ot', name+'_vt'],
+                  [name+'_nms'], center_point_box=center_point_box),
+        make_node('Slice', [name+'_nms', name+'_0', name+'_3', name+'_m1', name+'_2'],
+                  [name+'_nms_sliced']),
+        make_node('GatherND', [name+'_data_3d', name+'_nms_sliced'], [name+'_candidates']),
+        make_node('Pad', [name+'_candidates', name+'_pad', name+'_m1_f'], [name+'_cand_padded']),
+        make_node('Shape', [name+'_nms'], [name+'_nms_shape']),
+        make_node('Slice', [name+'_nms_shape', name+'_0', name+'_1'], [name+'_cand_cnt']),
+        make_node('Squeeze', [name+'_cand_cnt'], [name+'_cc_s'], axes=[0]),
+        make_node('Range', [name+'_0_s', name+'_cc_s', name+'_1_s'], [name+'_cand_indices']),
+        make_node('Slice', [name+'_scores_shape_actual', name+'_0', name+'_3', name+'_m1',
+                            name+'_2'], [name+'_shape_bat_spat']),
+        make_node('Slice', [name+'_shape_bat_spat', name+'_1', name+'_2'], [name+'_spat_dim']),
+        make_node('Expand', [name+'_cand_cnt', name+'_shape_bat_spat'], [name+'_base_indices']),
+        make_node('ScatterND', [name+'_base_indices', name+'_nms_sliced', name+'_cand_indices'],
+                  [name+'_indices']),
+        make_node('TopK', [name+'_indices', name+'_spat_dim'], [name+'_indices_sorted', name+'__'],
+                  largest=0, axis=-1, sorted=1),
+        make_node('Gather', [name+'_cand_padded', name+'_indices_sorted'], [name+'_gather']),
+        make_node('Reshape', [name+'_gather', name+'_shape'], [name+'0'])
+    ]
+
+    return nodes
+
+
+@mx_op.register("_greater_scalar")
+def convert_greater_scalar(node, **kwargs):
+    """Map MXNet's greater_scalar operator attributes to onnx's Greater
+    operator and return the created node.
+    """
+    from onnx.helper import make_node, make_tensor
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    scalar = float(attrs.get('scalar'))
+    dtype = input_dtypes[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+
+    if str(dtype).startswith('int'):
+        scalar = int(scalar)
+    elif dtype == 'float16':
+        # when using float16, we must convert it to np.uint16 view first
+        scalar = np.float16(scalar).view(np.uint16)
+    tensor_value = make_tensor(name+"_scalar", dtype_t, [1], [scalar])
+    nodes = [
+        make_node("Constant", [], [name+"_rhs"], value=tensor_value),
+        make_node("Greater", [input_nodes[0], name+"_rhs"], [name+"_gt"]),
+        make_node("Cast", [name+"_gt"], [name], to=dtype_t, name=name)
+    ]
+    return nodes
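+
+# Numpy sketch (illustrative only) of the float16 detail above: ONNX tensor
+# protos carry float16 payloads as raw uint16 words, hence the .view round-trip.
+import numpy as np
+
+bits = np.float16(1.5).view(np.uint16)     # raw IEEE 754 half-precision bits
+assert bits.view(np.float16) == np.float16(1.5)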
+
+
+@mx_op.register("_lesser_scalar")
+def convert_lesser_scalar(node, **kwargs):
+    """Map MXNet's lesser_scalar operator attributes to onnx's Less
+    operator and return the created node.
+    """
+    from onnx.helper import make_node, make_tensor
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    scalar = float(attrs.get('scalar'))
+    dtype = input_dtypes[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+
+    if str(dtype).startswith('int'):
+        scalar = int(scalar)
+    elif dtype == 'float16':
+        # when using float16, we must convert it to np.uint16 view first
+        scalar = np.float16(scalar).view(np.uint16)
+
+    tensor_value = make_tensor(name+"_scalar", dtype_t, [1], [scalar])
+    nodes = [
+        make_node("Constant", [], [name+"_rhs"], value=tensor_value),
+        make_node("Less", [input_nodes[0], name+"_rhs"], [name+"_lt"]),
+        make_node("Cast", [name+"_lt"], [name], to=dtype_t, name=name)
+    ]
+    return nodes
+
+
+@mx_op.register("_equal_scalar")
+def convert_equal_scalar(node, **kwargs):
+    """Map MXNet's equal_scalar operator attributes to onnx.
+    """
+    from onnx.helper import make_node, make_tensor
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    scalar = float(attrs.get('scalar'))
+    dtype = input_dtypes[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+
+    if str(dtype).startswith('int'):
+        scalar = int(scalar)
+    elif dtype == 'float16':
+        # when using float16, we must convert it to np.uint16 view first
+        scalar = np.float16(scalar).view(np.uint16)
+
+    tensor_value = make_tensor(name+"_scalar", dtype_t, [1], [scalar])
+    nodes = [
+        make_node("Constant", [], [name+"_rhs"], value=tensor_value),
+        make_node("Equal", [input_nodes[0], name+"_rhs"], [name+"_eq"]),
+        make_node("Cast", [name+"_eq"], [name], to=dtype_t, name=name)
+    ]
+    return nodes
+
+
+@mx_op.register('where')
+@mx_op.register('_npi_where')
+def convert_where(node, **kwargs):
+    """Map MXNet's where operator attributes to onnx's Where
+    operator and return the created node.
+    """
+    from onnx.helper import make_node
+    from onnx import TensorProto
+    name, input_nodes, _ = get_inputs(node, kwargs)
+    # note that in mxnet the condition tensor can either have the same shape as x and y OR
+    # have shape (first dim of x,)
+    create_tensor([0], name+'_0', kwargs['initializer'])
+    create_tensor([1], name+'_1', kwargs['initializer'])
+    nodes = [
+        make_node('Shape', [input_nodes[0]], [name+'_cond_shape']),
+        make_node('Shape', [name+'_cond_shape'], [name+'_cond_dim']),
+        make_node('Shape', [input_nodes[1]], [name+'_x_shape']),
+        make_node('Shape', [name+'_x_shape'], [name+'_x_dim']),
+        make_node('Sub', [name+'_x_dim', name+'_cond_dim'], [name+'_sub']),
+        make_node('Concat', [name+'_0', name+'_sub'], [name+'_concat'], axis=0),
+        make_node('Pad', [name+'_cond_shape', name+'_concat', name+'_1'], [name+'_cond_new_shape']),
+        make_node('Reshape', [input_nodes[0], name+'_cond_new_shape'], [name+'_cond']),
+        make_node('Cast', [name+'_cond'], [name+'_bool'], to=int(TensorProto.BOOL)),
+        make_node('Where', [name+'_bool', input_nodes[1], input_nodes[2]], [name], name=name)
+    ]
+    return nodes
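+
+# Numpy sketch (illustrative only) of the condition handling above: a 1-D
+# condition of length x.shape[0] has trailing 1s appended to its shape so it
+# broadcasts like the full-shape case before Where.
+import numpy as np
+
+cond = np.array([1, 0])
+x, y = np.zeros((2, 3)), np.ones((2, 3))
+cond_r = cond.reshape(cond.shape + (1,) * (x.ndim - cond.ndim))   # Pad + Reshape
+out = np.where(cond_r.astype(bool), x, y)                         # Cast + Where
+assert (out == np.array([[0., 0., 0.], [1., 1., 1.]])).all()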
+
+
+@mx_op.register('_maximum_scalar')
+def convert_maximum_scalar(node, **kwargs):
+    """Map MXNet's _maximum_scalar
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    input_dtypes = get_input_dtypes(node, kwargs)
+    dtype = input_dtypes[0]
+
+    if 'float' in str(dtype):
+        scalar = float(attrs.get('scalar', '0'))
+    else:
+        scalar = int(attrs.get('scalar', '0'))
+
+    create_tensor([scalar], name+'_scalar', kwargs['initializer'], dtype=dtype)
+    nodes = [
+        make_node('Max', [input_nodes[0], name+'_scalar'], [name], name=name)
+    ]
+
+    return nodes
+
+
+@mx_op.register('_minimum_scalar')
+def convert_minimum_scalar(node, **kwargs):
+    """Map MXNet's _minimum_scalar
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    input_dtypes = get_input_dtypes(node, kwargs)
+    dtype = input_dtypes[0]
+
+    if 'float' in str(dtype):
+        scalar = float(attrs.get('scalar', '0'))
+    else:
+        scalar = int(attrs.get('scalar', '0'))
+
+    create_tensor([scalar], name+'_scalar', kwargs['initializer'], dtype=dtype)
+    nodes = [
+        make_node('Min', [input_nodes[0], name+'_scalar'], [name], name=name)
+    ]
+
+    return nodes
+
+
+@mx_op.register("_contrib_box_decode")
+def convert_contrib_box_decode(node, **kwargs):
+    """Map MXNet's _contrib_box_decode operator attributes to onnx's operator.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    input_dtypes = get_input_dtypes(node, kwargs)
+
+    dtype = input_dtypes[0]
+    dtype_t = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[dtype]
+
+    data = input_nodes[0]
+    anchors = input_nodes[1]
+    fmt = attrs.get('format', 'center')
+    std0 = float(attrs.get('std0', '1.'))
+    std1 = float(attrs.get('std1', '1.'))
+    std2 = float(attrs.get('std2', '1.'))
+    std3 = float(attrs.get('std3', '1.'))
+    clip = float(attrs.get('clip', '-1.'))
+
+    if fmt not in ['center', 'corner']:
+        raise NotImplementedError("format must be either corner or center.")
+
+    create_tensor([0], name+'_0', kwargs["initializer"])
+    create_tensor([2], name+'_2', kwargs["initializer"])
+    create_tensor([4], name+'_4', kwargs["initializer"])
+    create_tensor([2], name+'_2f', kwargs["initializer"], dtype='float32')
+    create_tensor([clip], name+'_clip', kwargs["initializer"], dtype='float32')
+    create_tensor([std0, std1, std2, std3], name+'_std_1d', kwargs["initializer"], dtype='float32')
+    create_tensor([1, 4], name+'_std_shape', kwargs["initializer"])
+
+    nodes = [
+        make_node("Cast", [data], [name+'_data'], to=int(onnx.TensorProto.FLOAT)),
+        make_node("Cast", [anchors], [name+'_anchors'], to=int(onnx.TensorProto.FLOAT)),
+        make_node('Reshape', [name+'_std_1d', name+'_std_shape'], [name+'_std']),
+        make_node("Mul", [name+'_data', name+'_std'], [name+'_mul0_out']),
+        make_node('Slice', [name+'_mul0_out', name+'_0', name+'_2', name+'_2'], [name+'_data_xy']),
+        make_node('Slice', [name+'_mul0_out', name+'_2', name+'_4', name+'_2'], [name+'_data_wh']),
+    ]
+
+    if fmt == 'corner':
+        nodes += [
+            make_node('Slice', [name+'_anchors', name+'_0', name+'_2', name+'_2'], [name+'_slice0_out']),
+            make_node('Slice', [name+'_anchors', name+'_2', name+'_4', name+'_2'], [name+'_slice1_out']),
+            make_node('Sub', [name+'_slice1_out', name+'_slice0_out'], [name+'_anchor_wh']),
+            make_node('Div', [name+'_anchor_wh', name+'_2f'], [name+'_div0_out']),
+            make_node("Add", [name+'_slice0_out', name+'_div0_out'], [name+'_anchor_xy']),
+        ]
+    else:
+        nodes += [
+            make_node('Slice', [name+'_anchors', name+'_0', name+'_2', name+'_2'], [name+'_anchor_xy']),
+            make_node('Slice', [name+'_anchors', name+'_2', name+'_4', name+'_2'], [name+'_anchor_wh']),
+        ]
+
+    nodes += [
+        make_node("Mul", [name+'_data_xy', name+'_anchor_wh'], [name+'_mul1_out']),
+        make_node("Add", [name+'_mul1_out', name+'_anchor_xy'], [name+'_add0_out']),
+    ]
+
+    if clip > 0.:
+        nodes += [
+            make_node("Less", [name+"_data_wh", name+"_clip"], [name+"_less0_out"]),
+            make_node('Where', [name+'_less0_out', name+'_data_wh', name+'_clip'], [name+'_where0_out']),
+            make_node("Exp", [name+'_where0_out'], [name+'_exp0_out']),
+        ]
+    else:
+        nodes += [
+            make_node("Exp", [name+'_data_wh'], [name+'_exp0_out']),
+        ]
+
+    nodes += [
+        make_node("Mul", [name+'_exp0_out', name+'_anchor_wh'], [name+'_mul2_out']),
+        make_node('Div', [name+'_mul2_out', name+'_2f'], [name+'_div1_out']),
+        make_node('Sub', [name+'_add0_out', name+'_div1_out'], [name+'_sub0_out']),
+        make_node('Add', [name+'_add0_out', name+'_div1_out'], [name+'_add1_out']),
+        make_node('Concat', [name+'_sub0_out', name+'_add1_out'], [name+'_concat0_out'], axis=2),
+        make_node("Cast", [name+'_concat0_out'], [name], to=dtype_t, name=name)
+    ]
+
+    return nodes
+
+
+@mx_op.register("_contrib_AdaptiveAvgPooling2D")
+def convert_contrib_AdaptiveAvgPooling2D(node, **kwargs):
+    """Map MXNet's _contrib_AdaptiveAvgPooling2D operator
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    output_size = attrs.get('output_size', '1')
+    output_size = convert_string_to_list(output_size)
+
+    if len(output_size) <= 2:
+        if output_size[0] != 1 or (len(output_size) == 2 and output_size[1] != 1):
+            raise NotImplementedError("_contrib_AdaptiveAvgPooling2D operator with output_size != 1 "
+                                      "not yet implemented.")
+    nodes = [
+        make_node("GlobalAveragePool", [input_nodes[0]], [name], name=name)
+    ]
+
+    return nodes
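+
+# Numpy check (illustrative only): adaptive average pooling with output_size 1
+# is a global mean over the spatial axes, i.e. exactly GlobalAveragePool.
+import numpy as np
+
+x = np.arange(2 * 3 * 4 * 4, dtype=np.float32).reshape(2, 3, 4, 4)
+out = x.mean(axis=(2, 3), keepdims=True)   # GlobalAveragePool
+assert out.shape == (2, 3, 1, 1)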
+
+
+@mx_op.register('broadcast_mod')
+@mx_op.register('_npi_mod')
+def convert_broadcast_mod(node, **kwargs):
+    """Map MXNet's broadcast_mod operator
+    """
+    from onnx.helper import make_node
+    name, input_nodes, _ = get_inputs(node, kwargs)
+
+    # The behavior of MXNet mod is a mixture of np.mod and np.fmod
+    # note: the behavior when dividing by 0 is supposed to be platform dependent,
+    #       but here we set the result to 0 to be consistent with MXNet
+    nodes = [
+        make_node('Sub', [input_nodes[1], input_nodes[1]], [name+'_zero']),
+        make_node('Mod', [input_nodes[0], input_nodes[1]], [name+'_mod'], fmod=1),
+        make_node('Less', [input_nodes[0], name+'_zero'], [name+'_mask_0']),
+        make_node('Less', [input_nodes[1], name+'_zero'], [name+'_mask_1']),
+        make_node('Equal', [name+'_mod', name+'_zero'], [name+'_mask_2_']),
+        make_node('Not', [name+'_mask_2_'], [name+'_mask_2']),
+        make_node('Xor', [name+'_mask_0', name+'_mask_1'], [name+'_mask_']),
+        make_node('And', [name+'_mask_', name+'_mask_2'], [name+'_mask']),
+        make_node('Where', [name+'_mask', input_nodes[1], name+'_zero'], [name+'_adjustment']),
+        make_node('Add', [name+'_mod', name+'_adjustment'], [name+'_adjusted']),
+        make_node('Equal', [input_nodes[1], name+'_zero'], [name+'_mask_div_0']),
+        make_node('Where', [name+'_mask_div_0', name+'_zero', name+'_adjusted'], [name], name=name)
+        ]
+
+    return nodes
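+
+# Numpy sketch of the sign adjustment above; `mx_mod` is a hypothetical
+# illustration, not part of this module. Start from C-style fmod, add the
+# divisor where the operand signs differ and the remainder is nonzero, and
+# force division-by-zero results to 0.
+import numpy as np
+
+def mx_mod(a, b):
+    m = np.fmod(a, b)                                  # Mod(fmod=1)
+    fix = ((a < 0) ^ (b < 0)) & (m != 0)               # Less/Xor/Not/And
+    m = np.where(fix, m + b, m)                        # Where + Add
+    return np.where(b == 0, np.zeros_like(m), m)       # division by 0 -> 0
+
+assert (mx_mod(np.array([5., -5.]), np.array([3., 3.])) == [2., 1.]).all()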
+
+
+@mx_op.register("reshape_like")
+def convert_reshape_like(node, **kwargs):
+    """Map MXNet's reshape_like operator attributes to onnx's operator.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    lhs = input_nodes[0]
+    rhs = input_nodes[1]
+
+    lhs_begin = str(attrs.get('lhs_begin', '0'))
+    rhs_begin = str(attrs.get('rhs_begin', '0'))
+    lhs_end = str(attrs.get('lhs_end', 'None'))
+    rhs_end = str(attrs.get('rhs_end', 'None'))
+
+    if lhs_begin == 'None' or rhs_begin == 'None':
+        raise NotImplementedError("lhs_begin and rhs_begin should not be None.")
+
+    lhs_begin = int(lhs_begin)
+    rhs_begin = int(rhs_begin)
+
+    # basic case
+    if lhs_begin == 0 and lhs_end == 'None' and rhs_begin == 0 and rhs_end == 'None':
+        nodes = [
+            make_node('Shape', [rhs], [name+'_shape_rhs']),
+            make_node('Reshape', [lhs, name+'_shape_rhs'], [name], name=name)
+        ]
+        return nodes
+
+    create_tensor([0], name+'_0', kwargs["initializer"])
+    nodes = [
+        make_node('Shape', [lhs], [name+'_lhs_shape']),
+        make_node('Shape', [name+'_lhs_shape'], [name+'_lhs_dim']),
+        make_node('Shape', [rhs], [name+'_rhs_shape']),
+        make_node('Shape', [name+'_rhs_shape'], [name+'_rhs_dim']),
+    ]
+
+    if lhs_begin >= 0:
+        create_tensor([lhs_begin], name+'_lhs_begin', kwargs["initializer"])
+    else:
+        create_tensor([lhs_begin], name+'_lhs_begin_neg', kwargs["initializer"])
+        nodes += [
+            make_node('Add', [name+'_lhs_dim', name+'_lhs_begin_neg'], [name+'_lhs_begin']),
+        ]
+
+    if rhs_begin >= 0:
+        create_tensor([rhs_begin], name+'_rhs_begin', kwargs["initializer"])
+    else:
+        create_tensor([rhs_begin], name+'_rhs_begin_neg', kwargs["initializer"])
+        nodes += [
+            make_node('Add', [name+'_rhs_dim', name+'_rhs_begin_neg'], [name+'_rhs_begin']),
+        ]
+
+    if lhs_end == 'None':
+        nodes += [
+            make_node('Add', [name+'_lhs_dim', name+'_0'], [name+'_lhs_end']),
+        ]
+    else:
+        lhs_end = int(lhs_end)
+        if lhs_end >= 0:
+            create_tensor([lhs_end], name+'_lhs_end', kwargs["initializer"])
+        else:
+            create_tensor([lhs_end], name+'_lhs_end_neg', kwargs["initializer"])
+            nodes += [
+                make_node('Add', [name+'_lhs_dim', name+'_lhs_end_neg'], [name+'_lhs_end']),
+            ]
+
+    if rhs_end == 'None':
+        nodes += [
+            make_node('Add', [name+'_rhs_dim', name+'_0'], [name+'_rhs_end']),
+        ]
+    else:
+        rhs_end = int(rhs_end)
+        if rhs_end >= 0:
+            create_tensor([rhs_end], name+'_rhs_end', kwargs["initializer"])
+        else:
+            create_tensor([rhs_end], name+'_rhs_end_neg', kwargs["initializer"])
+            nodes += [
+                make_node('Add', [name+'_rhs_dim', name+'_rhs_end_neg'], [name+'_rhs_end']),
+            ]
+
+    nodes += [
+        make_node('Slice', [name+'_lhs_shape', name+'_0', name+'_lhs_begin'], [name+'_slice0_out']),
+        make_node('Slice', [name+'_rhs_shape', name+'_rhs_begin', name+'_rhs_end'], [name+'_slice1_out']),
+        make_node('Concat', [name+'_slice0_out', name+'_slice1_out'], [name+'_concat0_out'], axis=0),
+        make_node('Slice', [name+'_lhs_shape', name+'_lhs_end', name+'_lhs_dim'], [name+'_slice2_out']),
+        make_node('Concat', [name+'_concat0_out', name+'_slice2_out'], [name+'_concat1_out'], axis=0),
+        make_node('Reshape', [lhs, name+'_concat1_out'], [name], name=name)
+    ]
+
+    return nodes
+
+
+@mx_op.register("gather_nd")
+def convert_gather_nd(node, **kwargs):
+    """Map MXNet's gather_ND operator attributes to onnx's operator.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, _ = get_inputs(node, kwargs)
+
+    data = input_nodes[0]
+    indices = input_nodes[1]
+
+    # Onnx Transpose operator takes perm as a parameter, so we need to 'pad'
+    # the input to a known dim (8 here)
+    perm = [7] + list(range(1, 7)) + [0]
+
+    create_tensor([0], name+'_0', kwargs['initializer'])
+    create_tensor([1], name+'_1', kwargs['initializer'])
+    create_tensor([8], name+'_8', kwargs['initializer'])
+    nodes = [
+        # Generate 8-d filter
+        make_node('Shape', [indices], [name+'_indices_shape']),
+        make_node('Shape', [name+'_indices_shape'], [name+'_indices_dim']),
+        make_node('Sub', [name+'_8', name+'_indices_dim'], [name+'_sub0_out']),
+        make_node('Concat', [name+'_0', name+'_sub0_out'], [name+'_concat0_out'], axis=0),
+        make_node('Pad', [name+'_indices_shape', name+'_concat0_out', name+'_1'], [name+'_shape_8_dim']),
+        make_node('Reshape', [indices, name+'_shape_8_dim'], [name+'_indices_8_dim']),
+        make_node('Transpose', [name+'_indices_8_dim'], [name+'_transpose0_output'], perm=perm),
+        # Reshape filter to actual dim for GatherND computation
+        make_node('Slice', [name+'_indices_shape', name+'_0', name+'_1'],
+                  [name+'_slice0_out']),
+        make_node('Slice', [name+'_indices_shape', name+'_1', name+'_indices_dim'],
+                  [name+'_slice1_out']),
+        make_node('Concat', [name+'_slice1_out', name+'_slice0_out'], [name+'_concat1_out'], axis=0),
+        make_node('Reshape', [name+'_transpose0_output', name+'_concat1_out'], [name+'_reshape0_out']),
+        # Cast data type for indices
+        make_node('Cast', [name+'_reshape0_out'], [name+'_cast0_out'], to=int(onnx.TensorProto.INT64)),
+        make_node('GatherND', [data, name+'_cast0_out'], [name], name=name)
+    ]
+
+    return nodes
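+
+# Numpy sketch (illustrative only): MXNet gather_nd keeps the index tuple in
+# the *first* indices axis while ONNX GatherND expects it in the *last*, which
+# is what the padded 8-dim Transpose above accounts for.
+import numpy as np
+
+data = np.arange(12).reshape(3, 4)
+mx_indices = np.array([[0, 2], [1, 3]])         # axis 0 holds the coordinates
+onnx_indices = np.moveaxis(mx_indices, 0, -1)   # tuples now in the last axis
+out = data[tuple(mx_indices)]                   # MXNet-style result: [1, 11]
+assert (out == np.array([data[r, c] for r, c in onnx_indices])).all()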
+
+
+@mx_op.register('UpSampling')
+def convert_upsampling(node, **kwargs):
+    """Map MXNet's UpSampling operator to onnx.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    scale = int(attrs.get('scale', '1'))
+    sample_type = attrs.get('sample_type')
+    num_args = int(attrs.get('num_args', '1'))
+
+    if num_args > 1:
+        raise NotImplementedError('Upsampling conversion does not currently support num_args > 1')
+
+    if sample_type != 'nearest':
+        raise NotImplementedError('Upsampling conversion does not currently support '
+                                  'sample_type != nearest')
+
+    create_tensor([], name+'_roi', kwargs['initializer'], dtype='float32')
+    create_tensor([1, 1, scale, scale], name+'_scales', kwargs['initializer'],
+                  dtype='float32')
+    nodes = [
+        make_node('Resize', [input_nodes[0], name+'_roi', name+'_scales'], [name], mode='nearest',
+                  coordinate_transformation_mode='half_pixel')
+    ]
+
+    return nodes
+
+
+@mx_op.register('SwapAxis')
+def convert_swapaxis(node, **kwargs):
+    """Map MXNet's SwapAxis operator
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    dim1 = int(attrs.get('dim1', '0'))
+    dim2 = int(attrs.get('dim2', '0'))
+
+    if dim1 < 0 or dim2 < 0:
+        raise NotImplementedError('SwapAxis conversion does not support dim1 < 0 '
+                                  'or dim2 < 0')
+
+    indices = [[dim1], [dim2]]
+    vals = [dim2, dim1]
+    perm = list(range(8))
+    perm[dim1], perm[dim2] = dim2, dim1
+
+    create_tensor(indices, name+'_ind', kwargs['initializer'])
+    create_tensor(indices[::-1], name+'_ind_rev', kwargs['initializer'])
+    create_tensor(vals, name+'_vals', kwargs['initializer'])
+    create_tensor(perm, name+'_perm', kwargs['initializer'])
+    create_tensor([0], name+'_0', kwargs['initializer'])
+    create_tensor([1], name+'_1', kwargs['initializer'])
+    create_tensor([8], name+'_8', kwargs['initializer'])
+
+    nodes = [
+        make_node('Shape', [input_nodes[0]], [name+'_shape']),
+        make_node('Shape', [name+'_shape'], [name+'_dim']),
+        make_node('Sub', [name+'_8', name+'_dim'], [name+'_sub']),
+        make_node('ScatterND', [name+'_perm', name+'_ind', name+'_vals'],
+                  [name+'_perm_new']),
+        make_node('GatherND', [name+'_shape', name+'_ind'], [name+'_gather']),
+        make_node('ScatterND', [name+'_shape', name+'_ind_rev', name+'_gather'],
+                  [name+'_shape_new']),
+        make_node('Concat', [name+'_0', name+'_sub'], [name+'_pad'], axis=0),
+        make_node('Pad', [name+'_shape', name+'_pad', name+'_1'], [name+'_shape_padded']),
+        make_node('Reshape', [input_nodes[0], name+'_shape_padded'], [name+'_data_padded']),
+        make_node('Transpose', [name+'_data_padded'], [name+'_trans'], perm=perm),
+        make_node('Reshape', [name+'_trans', name+'_shape_new'], [name])
+    ]
+
+    return nodes
+
+
+@mx_op.register('slice_like')
+def convert_slice_like(node, **kwargs):
+    """Map MXNet's slice_like operator to onnx Slice operator."""
+    from onnx.helper import make_node, make_tensor
+    from onnx import TensorProto
+
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    axes = convert_string_to_list(attrs.get('axes', 'None'))
+    zero = make_tensor(name+'_zero', TensorProto.INT64, [1], [0])
+
+    nodes = []
+    if axes == [None]:
+        nodes += [
+            make_node('Shape', [input_nodes[1]], [name+'_shape_1']),
+            make_node('Shape', [name+'_shape_1'], [name+'_dim_1']),
+            make_node('ConstantOfShape', [name+'_dim_1'], [name+'_starts'], value=zero),
+            make_node('Slice', [input_nodes[0], name+'_starts', name+'_shape_1'], [name])
+        ]
+    else:
+        axes = [[i] for i in axes]
+        create_tensor([0], name+'_0', kwargs['initializer'])
+        create_tensor(axes, name+'_axes_', kwargs['initializer'])
+        nodes += [
+            make_node('Shape', [input_nodes[0]], [name+'_shape_0']),
+            make_node('Shape', [input_nodes[1]], [name+'_shape_1']),
+            make_node('Shape', [name+'_shape_0'], [name+'_dim_0']),
+            make_node('Less', [name+'_axes_', name+'_0'], [name+'_less']),
+            make_node('Cast', [name+'_less'], [name+'_mask'], to=int(TensorProto.INT64)),
+            make_node('Mul', [name+'_mask', name+'_dim_0'], [name+'_mul']),
+            make_node('Add', [name+'_axes_', name+'_mul'], [name+'_axes']),
+            make_node('ConstantOfShape', [name+'_dim_0'], [name+'_starts'], value=zero),
+            make_node('GatherND', [name+'_shape_1', name+'_axes'], [name+'_gather']),
+            make_node('ScatterND', [name+'_shape_0', name+'_axes', name+'_gather'],
+                      [name+'_ends']),
+            make_node('Slice', [input_nodes[0], name+'_starts', name+'_ends'], [name])
+            ]
+
+    return nodes
+
+
+@mx_op.register("broadcast_like")
+def convert_broadcast_like(node, **kwargs):
+    """Map MXNet's broadcast_like operator attributes to onnx's operator.
+    """
+    from onnx.helper import make_node
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    lhs = input_nodes[0]
+    rhs = input_nodes[1]
+    lhs_axes = convert_string_to_list(str(attrs.get('lhs_axes', 'None')))
+    rhs_axes = convert_string_to_list(str(attrs.get('rhs_axes', 'None')))
+
+    if lhs_axes[0] is None or rhs_axes[0] is None:
+        nodes = [
+            make_node('Shape', [rhs], [name+'_rhs_shape']),
+            make_node('Expand', [lhs, name+'_rhs_shape'], [name], name=name)
+        ]
+        return nodes
+
+    lhs_axes = [[i] for i in lhs_axes]
+    rhs_axes = [[i] for i in rhs_axes]
+
+    create_tensor([0], name+'_0', kwargs['initializer'])
+    create_tensor(lhs_axes, name+'_lhs_axes', kwargs['initializer'])
+    create_tensor(rhs_axes, name+'_rhs_axes', kwargs['initializer'])
+
+    nodes = [
+        make_node('Shape', [lhs], [name+'_lhs_shape']),
+        make_node('Shape', [rhs], [name+'_rhs_shape']),
+        make_node('Shape', [name+'_lhs_shape'], [name+'_lhs_dim']),
+        make_node('Less', [name+'_lhs_axes', name+'_0'], [name+'_less']),
+        make_node('Cast', [name+'_less'], [name+'_mask'], to=int(onnx.TensorProto.INT64)),
+        make_node('Mul', [name+'_mask', name+'_lhs_dim'], [name+'_mul']),
+        make_node('Add', [name+'_lhs_axes', name+'_mul'], [name+'_lhs_axes_positive']),
+        make_node('GatherND', [name+'_rhs_shape', name+'_rhs_axes'], [name+'_gather']),
+        make_node('ScatterND', [name+'_lhs_shape', name+'_lhs_axes_positive', name+'_gather'],
+                  [name+'_scatter']),
+        make_node('Expand', [lhs, name+'_scatter'], [name], name=name)
+    ]
+
+    return nodes
+
+
+@mx_op.register('_contrib_ROIAlign')
+def convert_contrib_roialign(node, **kwargs):
+    """Map MXNet's _contrib_ROIAlign
+    """
+    from onnx.helper import make_node
+    from onnx import TensorProto
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    pooled_size = convert_string_to_list(str(attrs.get('pooled_size')))
+    spatial_scale = float(attrs.get('spatial_scale'))
+    sample_ratio = int(attrs.get('sample_ratio', '0'))
+    position_sensitive = attrs.get('position_sensitive', 'False')
+    aligned = attrs.get('aligned', 'False')
+
+    if position_sensitive != 'False':
+        raise NotImplementedError('_contrib_ROIAlign does not currently support '
+                                  'position_sensitive!=False')
+    if aligned != 'False':
+        raise NotImplementedError('_contrib_ROIAlign does not currently support '
+                                  'aligned!=False')
+
+    create_tensor([0], name+'_0', kwargs['initializer'])
+    create_tensor([0], name+'_0_s', kwargs['initializer'], dtype='float32')
+    create_tensor([1], name+'_1', kwargs['initializer'])
+    create_tensor([5], name+'_5', kwargs['initializer'])
+
+    nodes = [
+        make_node('Slice', [input_nodes[1], name+'_1', name+'_5', name+'_1'], [name+'_rois']),
+        make_node('Slice', [input_nodes[1], name+'_0', name+'_1', name+'_1'], [name+'_inds___']),
+        make_node('Squeeze', [name+'_inds___'], [name+'_inds__'], axes=[1]),
... 5142 lines suppressed ...