Posted to commits@tvm.apache.org by tq...@apache.org on 2022/04/21 15:44:09 UTC

[tvm-site] branch asf-site updated: deploying docs (apache/tvm@a6ef5af1587c71dc69d710058b95f8baa9c6cc4d)

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/tvm-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new 00bdb4cec deploying docs (apache/tvm@a6ef5af1587c71dc69d710058b95f8baa9c6cc4d)
00bdb4cec is described below

commit 00bdb4cecf0432b6cfdb9fe0428aad9acbdca7ae
Author: tvm-bot <95...@users.noreply.github.com>
AuthorDate: Thu Apr 21 15:44:02 2022 +0000

    deploying docs (apache/tvm@a6ef5af1587c71dc69d710058b95f8baa9c6cc4d)
---
 .../how_to/compile_models/from_mxnet.rst.txt       |    2 +-
 .../how_to/compile_models/from_paddle.rst.txt      |    2 +-
 .../how_to/compile_models/from_pytorch.rst.txt     |    2 +-
 .../how_to/compile_models/from_tensorflow.rst.txt  |    2 +-
 .../compile_models/sg_execution_times.rst.txt      |   20 +-
 .../deploy_models/deploy_model_on_android.rst.txt  |    2 +-
 .../deploy_object_detection_pytorch.rst.txt        |    4 +-
 .../deploy_models/deploy_prequantized.rst.txt      |    6 +-
 .../deploy_prequantized_tflite.rst.txt             |    4 +-
 .../how_to/deploy_models/deploy_quantized.rst.txt  |    2 +-
 .../deploy_models/deploy_ssd_gluoncv.rst.txt       |    4 +-
 .../deploy_models/sg_execution_times.rst.txt       |   18 +-
 .../extend_tvm/bring_your_own_datatypes.rst.txt    |    4 +-
 .../how_to/extend_tvm/sg_execution_times.rst.txt   |   10 +-
 .../how_to/extend_tvm/use_pass_instrument.rst.txt  |   16 +-
 .../optimize_operators/opt_conv_cuda.rst.txt       |    2 +-
 .../optimize_operators/opt_conv_tensorcore.rst.txt |    2 +-
 .../how_to/optimize_operators/opt_gemm.rst.txt     |   16 +-
 .../optimize_operators/sg_execution_times.rst.txt  |    8 +-
 .../sg_execution_times.rst.txt                     |   16 +-
 .../tune_conv2d_layer_cuda.rst.txt                 |  996 +++++++++++++++++--
 .../tune_network_cuda.rst.txt                      |    2 +-
 .../tune_network_x86.rst.txt                       |    4 +-
 .../tune_sparse_x86.rst.txt                        |   86 +-
 .../tune_with_autotvm/sg_execution_times.rst.txt   |   12 +-
 .../tune_with_autotvm/tune_conv2d_cuda.rst.txt     |   34 +-
 .../work_with_microtvm/micro_autotune.rst.txt      |   16 +-
 .../work_with_microtvm/sg_execution_times.rst.txt  |   12 +-
 .../work_with_relay/sg_execution_times.rst.txt     |    8 +-
 .../work_with_schedules/sg_execution_times.rst.txt |   18 +-
 .../how_to/work_with_schedules/tensorize.rst.txt   |    2 +-
 .../tutorials/autotvm/sg_execution_times.rst.txt   |    6 +-
 .../frontend/deploy_classification.rst.txt         |    2 +-
 .../tutorials/frontend/deploy_detection.rst.txt    |    2 +-
 .../tutorials/frontend/sg_execution_times.rst.txt  |    6 +-
 .../tutorials/optimize/sg_execution_times.rst.txt  |    6 +-
 .../topic/vta/tutorials/sg_execution_times.rst.txt |    6 +-
 .../tutorial/auto_scheduler_matmul_x86.rst.txt     |    6 +-
 docs/_sources/tutorial/autotvm_relay_x86.rst.txt   |   68 +-
 .../tutorial/cross_compilation_and_rpc.rst.txt     |    2 +-
 docs/_sources/tutorial/intro_topi.rst.txt          |    2 +-
 docs/_sources/tutorial/sg_execution_times.rst.txt  |   24 +-
 .../tutorial/tensor_expr_get_started.rst.txt       |   44 +-
 docs/_static/pygments.css                          |    7 +-
 docs/arch/convert_layout.html                      |  108 +--
 docs/arch/debugger.html                            |    2 +-
 docs/arch/frontend/tensorflow.html                 |    4 +-
 docs/arch/index.html                               |    2 +-
 docs/arch/inferbound.html                          |  202 ++--
 .../arch/introduction_to_module_serialization.html |   78 +-
 docs/arch/pass_infra.html                          |  322 +++----
 docs/arch/relay_intro.html                         |   42 +-
 docs/arch/relay_op_strategy.html                   |    8 +-
 docs/arch/runtime.html                             |  128 +--
 docs/arch/virtual_machine.html                     |   24 +-
 docs/commit_hash                                   |    2 +-
 docs/contribute/ci.html                            |    2 +-
 docs/contribute/code_guide.html                    |   30 +-
 docs/contribute/document.html                      |    8 +-
 docs/contribute/error_handling.html                |   14 +-
 docs/dev/how_to/pytest_target_parametrization.html |    6 +-
 docs/dev/how_to/relay_add_op.html                  |  200 ++--
 docs/dev/how_to/relay_add_pass.html                |  226 ++---
 docs/dev/how_to/relay_bring_your_own_codegen.html  |  910 +++++++++---------
 docs/dev/tutorial/codebase_walkthrough.html        |    2 +-
 docs/how_to/compile_models/from_coreml.html        |    8 +-
 docs/how_to/compile_models/from_darknet.html       |   12 +-
 docs/how_to/compile_models/from_keras.html         |   10 +-
 docs/how_to/compile_models/from_mxnet.html         |   12 +-
 docs/how_to/compile_models/from_onnx.html          |    8 +-
 docs/how_to/compile_models/from_paddle.html        |   10 +-
 docs/how_to/compile_models/from_pytorch.html       |   19 +-
 docs/how_to/compile_models/from_tensorflow.html    |   12 +-
 docs/how_to/compile_models/from_tflite.html        |   12 +-
 docs/how_to/compile_models/sg_execution_times.html |   20 +-
 docs/how_to/deploy/arm_compute_lib.html            |   20 +-
 docs/how_to/deploy/bnns.html                       |    6 +-
 docs/how_to/deploy/hls.html                        |    2 +-
 docs/how_to/deploy/tensorrt.html                   |    6 +-
 docs/how_to/deploy/vitis_ai.html                   |    2 +-
 .../deploy_models/deploy_model_on_android.html     |   14 +-
 .../how_to/deploy_models/deploy_model_on_rasp.html |   12 +-
 .../deploy_object_detection_pytorch.html           |   29 +-
 docs/how_to/deploy_models/deploy_prequantized.html |   16 +-
 .../deploy_models/deploy_prequantized_tflite.html  |   16 +-
 docs/how_to/deploy_models/deploy_quantized.html    |   10 +-
 docs/how_to/deploy_models/deploy_sparse.html       |    8 +-
 docs/how_to/deploy_models/deploy_ssd_gluoncv.html  |   47 +-
 docs/how_to/deploy_models/sg_execution_times.html  |   18 +-
 .../extend_tvm/bring_your_own_datatypes.html       |   14 +-
 docs/how_to/extend_tvm/low_level_custom_pass.html  |    2 +-
 docs/how_to/extend_tvm/sg_execution_times.html     |   10 +-
 docs/how_to/extend_tvm/use_pass_infra.html         |    4 +-
 docs/how_to/extend_tvm/use_pass_instrument.html    |   30 +-
 docs/how_to/optimize_operators/opt_conv_cuda.html  |    4 +-
 .../optimize_operators/opt_conv_tensorcore.html    |    6 +-
 docs/how_to/optimize_operators/opt_gemm.html       |   18 +-
 .../optimize_operators/sg_execution_times.html     |    8 +-
 docs/how_to/profile/papi.html                      |    2 +-
 .../sg_execution_times.html                        |   14 +-
 .../tune_conv2d_layer_cuda.html                    | 1000 ++++++++++++++++++--
 .../tune_with_autoscheduler/tune_network_arm.html  |   72 +-
 .../tune_with_autoscheduler/tune_network_cuda.html |   70 +-
 .../tune_with_autoscheduler/tune_network_mali.html |   82 +-
 .../tune_with_autoscheduler/tune_network_x86.html  |   88 +-
 .../tune_with_autoscheduler/tune_sparse_x86.html   |  162 ++--
 .../tune_with_autotvm/sg_execution_times.html      |   12 +-
 .../how_to/tune_with_autotvm/tune_conv2d_cuda.html |   40 +-
 docs/how_to/tune_with_autotvm/tune_relay_arm.html  |   10 +-
 docs/how_to/tune_with_autotvm/tune_relay_cuda.html |    6 +-
 .../tune_with_autotvm/tune_relay_mobile_gpu.html   |   10 +-
 docs/how_to/tune_with_autotvm/tune_relay_x86.html  |   16 +-
 docs/how_to/work_with_microtvm/micro_autotune.html |   18 +-
 docs/how_to/work_with_microtvm/micro_ethosu.html   |  122 +--
 docs/how_to/work_with_microtvm/micro_tflite.html   |    8 +-
 .../work_with_microtvm/sg_execution_times.html     |   12 +-
 docs/how_to/work_with_relay/build_gcn.html         |   18 +-
 .../how_to/work_with_relay/sg_execution_times.html |    8 +-
 .../how_to/work_with_relay/using_external_lib.html |    8 +-
 docs/how_to/work_with_relay/using_relay_viz.html   |   16 +-
 docs/how_to/work_with_schedules/extern_op.html     |    8 +-
 docs/how_to/work_with_schedules/intrin_math.html   |    6 +-
 docs/how_to/work_with_schedules/reduction.html     |   16 +-
 docs/how_to/work_with_schedules/scan.html          |    4 +-
 .../work_with_schedules/schedule_primitives.html   |    4 +-
 .../work_with_schedules/sg_execution_times.html    |   18 +-
 docs/how_to/work_with_schedules/tedd.html          |    6 +-
 docs/how_to/work_with_schedules/tensorize.html     |   12 +-
 docs/how_to/work_with_schedules/tuple_inputs.html  |    4 +-
 docs/reference/api/python/auto_scheduler.html      |    8 +-
 docs/reference/api/python/autotvm.html             |   10 +-
 docs/reference/api/python/error.html               |   12 +-
 docs/reference/api/python/ir.html                  |   52 +-
 .../api/python/relay/dataflow_pattern.html         |   98 +-
 docs/reference/api/python/relay/index.html         |   36 +-
 docs/reference/api/python/relay/nn.html            |    4 +-
 docs/reference/api/python/relay/testing.html       |   42 +-
 docs/reference/api/python/relay/transform.html     |  130 +--
 docs/reference/api/python/rpc.html                 |    4 +-
 docs/reference/api/python/target.html              |   16 +-
 docs/reference/api/python/te.html                  |   20 +-
 docs/reference/api/python/tir.html                 |  260 ++---
 docs/reference/api/python/topi.html                |  320 +++----
 .../api/typedoc/classes/bytestreamreader.html      |   12 +-
 .../api/typedoc/classes/cachedcallstack.html       |   34 +-
 docs/reference/api/typedoc/classes/dldatatype.html |   12 +-
 docs/reference/api/typedoc/classes/dldevice.html   |   10 +-
 .../reference/api/typedoc/classes/environment.html |   12 +-
 docs/reference/api/typedoc/classes/ffilibrary.html |   20 +-
 .../api/typedoc/classes/graphexecutor.html         |   16 +-
 docs/reference/api/typedoc/classes/instance.html   |   40 +-
 docs/reference/api/typedoc/classes/memory.html     |   34 +-
 docs/reference/api/typedoc/classes/module.html     |   10 +-
 docs/reference/api/typedoc/classes/ndarray.html    |   22 +-
 .../api/typedoc/classes/packedfunccell.html        |    6 +-
 docs/reference/api/typedoc/classes/rpcserver.html  |   14 +-
 docs/reference/api/typedoc/classes/scalar.html     |    6 +-
 .../api/typedoc/classes/webgpucontext.html         |   12 +-
 docs/reference/api/typedoc/enums/argtypecode.html  |   30 +-
 .../api/typedoc/enums/aynccallbackcode.html        |    4 +-
 .../api/typedoc/enums/dldatatypecode.html          |    8 +-
 .../api/typedoc/enums/rpcserverstate.html          |   12 +-
 docs/reference/api/typedoc/enums/sizeof.html       |   18 +-
 docs/reference/api/typedoc/index.html              |  112 +--
 .../api/typedoc/interfaces/disposable.html         |    2 +-
 .../api/typedoc/interfaces/functioninfo.html       |    6 +-
 .../api/typedoc/interfaces/libraryprovider.html    |    4 +-
 docs/reference/langref/hybrid_script.html          |    4 +-
 docs/reference/langref/relay_adt.html              |  448 ++++-----
 docs/reference/langref/relay_expr.html             |   44 +-
 docs/reference/langref/relay_pattern.html          |    8 +-
 docs/reference/langref/relay_type.html             |   10 +-
 docs/searchindex.js                                |    2 +-
 docs/topic/vta/dev/hardware.html                   |   46 +-
 .../vta/tutorials/autotvm/sg_execution_times.html  |    6 +-
 docs/topic/vta/tutorials/autotvm/tune_alu_vta.html |   24 +-
 .../vta/tutorials/autotvm/tune_relay_vta.html      |   22 +-
 .../tutorials/frontend/deploy_classification.html  |   28 +-
 .../vta/tutorials/frontend/deploy_detection.html   |   18 +-
 .../vta/tutorials/frontend/sg_execution_times.html |    6 +-
 docs/topic/vta/tutorials/matrix_multiply.html      |   10 +-
 .../vta/tutorials/optimize/convolution_opt.html    |   14 +-
 .../tutorials/optimize/matrix_multiply_opt.html    |   60 +-
 .../vta/tutorials/optimize/sg_execution_times.html |    6 +-
 docs/topic/vta/tutorials/sg_execution_times.html   |    6 +-
 docs/topic/vta/tutorials/vta_get_started.html      |   10 +-
 docs/tutorial/auto_scheduler_matmul_x86.html       |    8 +-
 docs/tutorial/autotvm_matmul_x86.html              |   14 +-
 docs/tutorial/autotvm_relay_x86.html               |  190 ++--
 docs/tutorial/cross_compilation_and_rpc.html       |    8 +-
 docs/tutorial/intro_topi.html                      |    8 +-
 docs/tutorial/relay_quick_start.html               |   10 +-
 docs/tutorial/sg_execution_times.html              |   24 +-
 docs/tutorial/tensor_expr_get_started.html         |   58 +-
 docs/tutorial/tensor_ir_blitz_course.html          |    6 +-
 docs/tutorial/tvmc_command_line_driver.html        |    6 +-
 docs/tutorial/tvmc_python.html                     |    2 +-
 197 files changed, 5144 insertions(+), 3560 deletions(-)

diff --git a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
index b1d29558f..5c409578f 100644
--- a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
@@ -98,7 +98,7 @@ In this section, we download a pretrained imagenet model and classify an image.
 
  .. code-block:: none
 
-    Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip90c37b1c-4f10-4a8e-8112-80f1ef514b75 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+    Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip375983c7-f444-43eb-aeb6-1fc225818214 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
     x (1, 3, 224, 224)
 
 
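The only change above is the random suffix on the cached download path; the tutorial step itself, sketched below on the assumption that mxnet and tvm are importable, fetches a pretrained Gluon ResNet-18 and imports it into Relay:

    import mxnet as mx
    import tvm
    from tvm import relay

    # Fetch the pretrained model whose download the log above reports.
    block = mx.gluon.model_zoo.vision.get_model("resnet18_v1", pretrained=True)
    # NCHW input, matching the "x (1, 3, 224, 224)" line in the output.
    shape_dict = {"data": (1, 3, 224, 224)}
    mod, params = relay.frontend.from_mxnet(block, shape_dict)
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target="llvm", params=params)
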
diff --git a/docs/_sources/how_to/compile_models/from_paddle.rst.txt b/docs/_sources/how_to/compile_models/from_paddle.rst.txt
index 6411b2eda..d8ee0d02a 100644
--- a/docs/_sources/how_to/compile_models/from_paddle.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_paddle.rst.txt
@@ -201,7 +201,7 @@ Look up prediction top 1 index in 1000 class synset.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  6.070 seconds)
+   **Total running time of the script:** ( 1 minutes  4.802 seconds)
 
 
 .. _sphx_glr_download_how_to_compile_models_from_paddle.py:
diff --git a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
index 4959d58b9..b16265af3 100644
--- a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
@@ -79,7 +79,7 @@ Load a pretrained PyTorch model
  .. code-block:: none
 
     Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
-
      0%|          | 0.00/44.7M [00:00<?, ?B/s]
      8%|8         | 3.58M/44.7M [00:00<00:01, 37.5MB/s]
     19%|#8        | 8.34M/44.7M [00:00<00:00, 44.6MB/s]
     77%|#######7  | 34.5M/44.7M [00:00<00:00, 149MB/s] 
    100%|##########| 44.7M/44.7M [00:00<00:00, 139MB/s]
+
      0%|          | 0.00/44.7M [00:00<?, ?B/s]
      4%|3         | 1.70M/44.7M [00:00<00:02, 17.7MB/s]
      8%|7         | 3.40M/44.7M [00:00<00:02, 15.6MB/s]
     37%|###6      | 16.4M/44.7M [00:00<00:00, 66.9MB/s]
     72%|#######2  | 32.2M/44.7M [00:00<00:00, 104MB/s] 
    100%|##########| 44.7M/44.7M [00:00<00:00, 89.7MB/s]
 
 
 
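The hunk above only swaps one download progress trace for another; the underlying step, sketched here assuming torch, torchvision, and tvm are installed, traces a pretrained ResNet-18 to TorchScript and imports it into Relay:

    import torch
    import torchvision
    from tvm import relay

    # Load the checkpoint whose download progress is shown above, then trace it.
    model = torchvision.models.resnet18(pretrained=True).eval()
    example = torch.randn(1, 3, 224, 224)
    scripted = torch.jit.trace(model, example)
    # "input0" is an assumed input name; any name works as long as it is
    # used consistently when feeding inputs later.
    mod, params = relay.frontend.from_pytorch(scripted, [("input0", example.shape)])
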
diff --git a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
index 5910816b0..01ba17daa 100644
--- a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
@@ -372,7 +372,7 @@ Run the corresponding model on tensorflow
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  0.457 seconds)
+   **Total running time of the script:** ( 1 minutes  2.588 seconds)
 
 
 .. _sphx_glr_download_how_to_compile_models_from_tensorflow.py:
diff --git a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
index d6328f1f6..17ad1472c 100644
--- a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
 
 Computation times
 =================
-**04:44.140** total execution time for **how_to_compile_models** files:
+**04:50.106** total execution time for **how_to_compile_models** files:
 
-- **01:06.070**: :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``)
-- **01:00.457**: :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``)
-- **00:55.771**: :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``)
-- **00:24.997**: :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``)
-- **00:21.753**: :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``)
-- **00:20.903**: :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``)
-- **00:18.634**: :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``)
-- **00:13.172**: :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``)
-- **00:02.383**: :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``)
+- **01:04.802**: :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``)
+- **01:02.588**: :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``)
+- **00:57.900**: :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``)
+- **00:25.115**: :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``)
+- **00:22.278**: :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``)
+- **00:21.336**: :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``)
+- **00:19.651**: :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``)
+- **00:13.779**: :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``)
+- **00:02.656**: :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``)
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
index e7e68dc9d..2775f9e4c 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
@@ -393,7 +393,7 @@ Execute on TVM
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      16.4564      16.4444      16.9575      16.1097       0.2470   
+      15.9748      15.8680      16.8013      15.7512       0.2894   
                
 
 
diff --git a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
index 906e0dd26..26cadbbef 100644
--- a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
@@ -108,7 +108,7 @@ Load pre-trained maskrcnn from torchvision and do tracing
  .. code-block:: none
 
     Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
-
      0%|          | 0.00/170M [00:00<?, ?B/s]
      9%|8         | 14.8M/170M [00:00<00:01, 155MB/s]
     23%|##3       | 39.8M/170M [00:00<00:00, 218MB/s]
     40%|###9      | 67.1M/170M [00:00<00:00, 249MB/s]
     55%|#####5    | 94.2M/170M [00:00<00:00, 263MB/s]
     71%|#######1  | 121M/170M [00:00<00:00, 268MB/s] 
     86%|########6 | 146M/170M [00:00<00:00, 261MB/s]
    100%|##########| 170M/170M [00:00<00:00, 257MB/s]
+
      0%|          | 0.00/170M [00:00<?, ?B/s]
      2%|1         | 3.30M/170M [00:00<00:05, 34.5MB/s]
      4%|3         | 6.60M/170M [00:00<00:05, 33.3MB/s]
     16%|#6        | 27.2M/170M [00:00<00:01, 115MB/s] 
     30%|##9       | 50.1M/170M [00:00<00:00, 164MB/s]
     43%|####2     | 72.6M/170M [00:00<00:00, 189MB/s]
     57%|#####7    | 97.3M/170M [00:00<00:00, 213MB/s]
     71%|#######1  | 121M/170M [00:00<00:00, 224MB/s] 
     84%|########4 | 143M/170M [00:00<00:00, 228MB/s]
     98%|#########8| 167M/170M [00:00<00:00, 234MB/s]
    100%|##########| 170M/170M [00:00<00:00, 194MB/s]
     /usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:3878: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
       for i in range(dim)
     /usr/local/lib/python3.7/dist-packages/torchvision/models/detection/anchor_utils.py:127: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -253,7 +253,7 @@ Get boxes with score larger than 0.9
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 3 minutes  3.616 seconds)
+   **Total running time of the script:** ( 3 minutes  3.882 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_object_detection_pytorch.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
index b84486f08..a3a0b5090 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
@@ -187,7 +187,7 @@ training. Other models require a full post training calibration.
  .. code-block:: none
 
     Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
-
      0%|          | 0.00/13.6M [00:00<?, ?B/s]
    100%|##########| 13.6M/13.6M [00:00<00:00, 182MB/s]
+
      0%|          | 0.00/13.6M [00:00<?, ?B/s]
     26%|##6       | 3.59M/13.6M [00:00<00:00, 36.2MB/s]
     52%|#####1    | 7.04M/13.6M [00:00<00:00, 36.1MB/s]
    100%|##########| 13.6M/13.6M [00:00<00:00, 57.5MB/s]
 
 
 
@@ -344,7 +344,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      90.5416      90.2102      102.7956     90.0396       1.3333   
+      90.3311      90.2821      91.2938      90.0786       0.2095   
                
 
 
@@ -384,7 +384,7 @@ TODO
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  5.240 seconds)
+   **Total running time of the script:** ( 1 minutes  4.808 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_prequantized.py:
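
An execution-time summary like the ones above is produced by running the compiled module through TVM's time evaluator. A minimal sketch, assuming lib is the module returned by relay.build, data is a correctly shaped numpy array, and "input" is the model's input name:

    import numpy as np
    import tvm
    from tvm.contrib import graph_executor

    dev = tvm.cpu(0)
    module = graph_executor.GraphModule(lib["default"](dev))
    module.set_input("input", tvm.nd.array(data))
    # 100 repeats of 10 runs each; per-repeat means come back in seconds.
    ftimer = module.module.time_evaluator("run", dev, number=10, repeat=100)
    prof_res = np.array(ftimer().results) * 1e3  # convert to milliseconds
    print("mean %.4f ms  median %.4f ms  std %.4f ms"
          % (np.mean(prof_res), np.median(prof_res), np.std(prof_res)))
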
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
index 656443ead..c5eef0006 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
@@ -351,7 +351,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      121.8275     121.7358     126.9073     120.5809      0.6996   
+      120.7286     120.6932     123.0922     119.5067      0.6691   
                
 
 
@@ -385,7 +385,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  53.611 seconds)
+   **Total running time of the script:** ( 1 minutes  52.034 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_prequantized_tflite.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
index 4cd4038e1..730659482 100644
--- a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
@@ -221,7 +221,7 @@ We create a Relay VM to build and execute the model.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  28.479 seconds)
+   **Total running time of the script:** ( 1 minutes  14.969 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_quantized.py:
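
The Relay VM mentioned in the hunk above is built and invoked roughly as follows, assuming mod and params come from one of the Relay frontend importers and data is a numpy input of the right shape:

    import tvm
    from tvm import relay
    from tvm.runtime.vm import VirtualMachine

    # Compile for the VM instead of the graph executor; the VM also handles
    # models with dynamic shapes or control flow.
    with tvm.transform.PassContext(opt_level=3):
        vm_exec = relay.vm.compile(mod, target="llvm", params=params)

    dev = tvm.cpu(0)
    vm = VirtualMachine(vm_exec, dev)
    result = vm.invoke("main", tvm.nd.array(data))
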
diff --git a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
index 5e713b856..ac577703c 100644
--- a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
@@ -137,7 +137,7 @@ Convert and compile model for CPU.
             data: None
       input_sym_arg_type = in_param.infer_type()[0]
     Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
-
      0%|          | 0/132723 [00:00<?, ?KB/s]
      3%|2         | 3819/132723 [00:00<00:03, 38183.66KB/s]
      9%|9         | 12493/132723 [00:00<00:01, 66738.87KB/s]
     16%|#5        | 21212/132723 [00:00<00:01, 76073.58KB/s]
     23%|##2       | 29893/132723 [00:00<00:01, 80309.58KB/s]
     29%|##9       | 38683/132723 [00:00<00:01, 83040.67KB/s]
     36%|###5      | 47463/132723 [00:00<00:01, 84655.64KB/s]
     42%|####2     | 56259/132723 [00:00<00:00, 85732.37KB/s]
     49%|####9     | 65037/132723 [00:00<00:00, 86381.00KB/s]
     56%|#####5    | 73852/132723 [00:00<00:00, 86932.25KB/s]
     62%|######2   | 82704/132723 [00:01<00:00, 87419.45KB/s]
     69%|######8   | 91540/132723 [00:01<00:00, 87705.52KB/s]
     76%|#######5  | 100365/132723 [00:01<00:00, 87862.25KB/s]
     82%|########2 | 109215/132723 [00:01<00:00, 88053.35KB/s]
     89%|########8 | 118021/132723 [00:01<00:00, 87969.87KB/s]
     96%|#########5| 126819/132723 [00:01<00:00, 87564.90KB/s]
    100%|##########| 132723/132723 [00:01<00:00, 84457.93KB/s]
+
      0%|          | 0/132723 [00:00<?, ?KB/s]
      5%|4         | 6006/132723 [00:00<00:02, 60043.74KB/s]
     11%|#1        | 14931/132723 [00:00<00:01, 77216.17KB/s]
     17%|#7        | 22653/132723 [00:00<00:01, 57818.07KB/s]
     24%|##3       | 31487/132723 [00:00<00:01, 67827.45KB/s]
     29%|##9       | 38744/132723 [00:00<00:01, 48676.18KB/s]
     36%|###5      | 47684/132723 [00:00<00:01, 58547.99KB/s]
     41%|####1     | 54527/132723 [00:00<00:01, 54097.16KB/s]
     48%|####7     | 63515/132723 [00:01<00:01, 62882.16KB/s]
     53%|#####3    | 70531/132723 [00:01<00:01, 58778.23KB/s]
     59%|#####9    | 78874/132723 [00:01<00:00, 64984.76KB/s]
     65%|######4   | 85872/132723 [00:01<00:00, 60609.60KB/s]
     71%|#######1  | 94856/132723 [00:01<00:00, 68110.70KB/s]
     78%|#######8  | 103809/132723 [00:01<00:00, 73856.23KB/s]
     85%|########5 | 112829/132723 [00:01<00:00, 78384.20KB/s]
     92%|#########1| 121823/132723 [00:01<00:00, 81660.63KB/s]
     99%|#########8| 130793/132723 [00:01<00:00, 83976.31KB/s]
    100%|##########| 132723/132723 [00:01<00:00, 67888.43KB/s]
 
 
 
@@ -202,7 +202,7 @@ Display result
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 2 minutes  22.292 seconds)
+   **Total running time of the script:** ( 2 minutes  22.844 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_ssd_gluoncv.py:
diff --git a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
index f890b7449..721651660 100644
--- a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
@@ -5,13 +5,13 @@
 
 Computation times
 =================
-**10:42.710** total execution time for **how_to_deploy_models** files:
+**10:27.896** total execution time for **how_to_deploy_models** files:
 
-- **03:03.616**: :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``)
-- **02:22.292**: :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)
-- **01:53.611**: :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)
-- **01:28.479**: :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``)
-- **01:05.240**: :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``)
-- **00:27.764**: :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``)
-- **00:21.519**: :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)
-- **00:00.188**: :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``)
+- **03:03.882**: :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``)
+- **02:22.844**: :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)
+- **01:52.034**: :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)
+- **01:14.969**: :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``)
+- **01:04.808**: :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``)
+- **00:27.820**: :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``)
+- **00:21.350**: :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)
+- **00:00.190**: :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``)
diff --git a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
index 27c9aee0f..cc83b5680 100644
--- a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
@@ -423,7 +423,7 @@ First let us define two helper functions to get the mobilenet model and a cat im
 
  .. code-block:: none
 
-    Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zipb171556b-b337-4a31-9148-a8a50c7ffe93 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+    Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip19f69514-aa16-4970-ad6d-7e8a958c7858 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
 
 
 
@@ -525,7 +525,7 @@ Now, to actually convert the entire network, we have written `a pass in Relay <h
 
  .. code-block:: none
 
-      Check failed: (lower) is false: Intrinsic lowering function for target llvm, intrinsic name tir.sqrt, type 150 not found
+      Check failed: (lower) is false: FloatImm lowering function for target llvm type 150 not found
 
 
 
diff --git a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
index d98849282..af682427d 100644
--- a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
@@ -5,9 +5,9 @@
 
 Computation times
 =================
-**00:38.522** total execution time for **how_to_extend_tvm** files:
+**00:38.402** total execution time for **how_to_extend_tvm** files:
 
-- **00:35.019**: :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``)
-- **00:02.248**: :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``)
-- **00:01.062**: :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``)
-- **00:00.192**: :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``)
+- **00:34.875**: :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``)
+- **00:02.254**: :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``)
+- **00:01.077**: :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``)
+- **00:00.196**: :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``)
diff --git a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
index cde73dc75..165898520 100644
--- a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
@@ -199,10 +199,10 @@ profile the execution time of each pass.
  .. code-block:: none
 
     Printing results of timing profile...
-    InferType: 6040us [6040us] (45.42%; 45.42%)
-    FoldScaleAxis: 7257us [3us] (54.58%; 54.58%)
-            FoldConstant: 7255us [1471us] (54.56%; 99.96%)
-                    InferType: 5784us [5784us] (43.50%; 79.73%)
+    InferType: 6481us [6481us] (45.91%; 45.91%)
+    FoldScaleAxis: 7636us [2us] (54.09%; 54.09%)
+            FoldConstant: 7634us [1570us] (54.07%; 99.97%)
+                    InferType: 6064us [6064us] (42.95%; 79.43%)
 
 
 
@@ -239,10 +239,10 @@ Refer to following sections and :py:func:`tvm.instrument.pass_instrument` for th
  .. code-block:: none
 
     Printing results of timing profile...
-    InferType: 5855us [5855us] (44.88%; 44.88%)
-    FoldScaleAxis: 7191us [2us] (55.12%; 55.12%)
-            FoldConstant: 7189us [1515us] (55.11%; 99.98%)
-                    InferType: 5675us [5675us] (43.50%; 78.93%)
+    InferType: 6060us [6060us] (44.75%; 44.75%)
+    FoldScaleAxis: 7481us [2us] (55.25%; 55.25%)
+            FoldConstant: 7479us [1534us] (55.23%; 99.97%)
+                    InferType: 5945us [5945us] (43.91%; 79.49%)
 
 
 
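The timing profiles above come from TVM's PassTimingInstrument. A minimal sketch of collecting one, assuming mod is a Relay IRModule:

    import tvm
    from tvm import relay
    from tvm.ir.instrument import PassTimingInstrument

    timing = PassTimingInstrument()
    with tvm.transform.PassContext(opt_level=3, instruments=[timing]):
        mod = relay.transform.InferType()(mod)
        mod = relay.transform.FoldScaleAxis()(mod)
        # render() must be called inside the context; the profile data is
        # cleared when the PassContext exits.
        print("Printing results of timing profile...")
        print(timing.render())
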
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
index 58c3319a2..f476c4c8a 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
@@ -295,7 +295,7 @@ latency of convolution.
 
  .. code-block:: none
 
-    Convolution: 37.126799 ms
+    Convolution: 37.440570 ms
 
 
 
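A figure like "Convolution: NN ms" is obtained by timing the built CUDA function. A short sketch, assuming func came from tvm.build(..., target="cuda") and a, w, b are device arrays holding the data, kernel, and output:

    import tvm

    dev = tvm.cuda(0)
    # One run per measurement; .mean is reported in seconds.
    evaluator = func.time_evaluator(func.entry_name, dev, number=1)
    print("Convolution: %f ms" % (evaluator(a, w, b).mean * 1e3))
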
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
index 8236bcc78..5542d44b4 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
@@ -628,7 +628,7 @@ be able to run on our build server
 
  .. code-block:: none
 
-    conv2d with tensor core: 9.389606 ms
+    conv2d with tensor core: 7.454823 ms
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
index d6a2a151d..5881e15ad 100644
--- a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
@@ -118,8 +118,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
 
  .. code-block:: none
 
-    Numpy running time: 0.018728
-    Baseline: 3.458604
+    Numpy running time: 0.019117
+    Baseline: 3.342248
 
 
 
@@ -210,7 +210,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
 
  .. code-block:: none
 
-    Opt1: 0.297369
+    Opt1: 0.301089
 
 
 
@@ -309,7 +309,7 @@ In this tutorial, we chose to vectorize the inner loop row data since it is cach
 
  .. code-block:: none
 
-    Opt2: 0.337631
+    Opt2: 0.333339
 
 
 
@@ -401,7 +401,7 @@ the access pattern for A matrix is more cache friendly.
 
  .. code-block:: none
 
-    Opt3: 0.117098
+    Opt3: 0.117179
 
 
 
@@ -520,7 +520,7 @@ flattening.
 
  .. code-block:: none
 
-    Opt4: 0.111210
+    Opt4: 0.110986
 
 
 
@@ -638,7 +638,7 @@ write to C when all the block results are ready.
 
  .. code-block:: none
 
-    Opt5: 0.111095
+    Opt5: 0.111551
 
 
 
@@ -759,7 +759,7 @@ Furthermore, we can also utilize multi-core processors to do the thread-level par
 
  .. code-block:: none
 
-    Opt6: 0.144609
+    Opt6: 0.144950
 
 
 
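The Numpy and Baseline numbers above compare numpy.dot against an unscheduled TE matmul. A compact sketch of that baseline, using the tutorial's M = K = N = 1024:

    import numpy as np
    import tvm
    from tvm import te

    M = K = N = 1024
    k = te.reduce_axis((0, K), "k")
    A = te.placeholder((M, K), name="A")
    B = te.placeholder((K, N), name="B")
    C = te.compute((M, N), lambda x, y: te.sum(A[x, k] * B[k, y], axis=k), name="C")

    # Default schedule: three nested loops, no blocking or vectorization.
    s = te.create_schedule(C.op)
    func = tvm.build(s, [A, B, C], target="llvm", name="mmult")

    dev = tvm.cpu(0)
    a = tvm.nd.array(np.random.rand(M, K).astype("float32"), dev)
    b = tvm.nd.array(np.random.rand(K, N).astype("float32"), dev)
    c = tvm.nd.array(np.zeros((M, N), dtype="float32"), dev)
    evaluator = func.time_evaluator(func.entry_name, dev, number=10)
    print("Baseline: %f" % evaluator(a, b, c).mean)
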
diff --git a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
index 95b434d7c..aca86f9e7 100644
--- a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
@@ -5,8 +5,8 @@
 
 Computation times
 =================
-**00:35.150** total execution time for **how_to_optimize_operators** files:
+**00:34.790** total execution time for **how_to_optimize_operators** files:
 
-- **00:32.478**: :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``)
-- **00:01.472**: :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``)
-- **00:01.201**: :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``)
+- **00:32.204**: :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``)
+- **00:01.398**: :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``)
+- **00:01.188**: :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``)
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
index 84bf44fbe..4b5c86670 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
@@ -5,11 +5,11 @@
 
 Computation times
 =================
-**05:06.155** total execution time for **how_to_tune_with_autoscheduler** files:
-
-- **02:21.677**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``)
-- **01:20.086**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``)
-- **00:40.448**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``)
-- **00:26.771**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``)
-- **00:08.608**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``)
-- **00:08.564**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``)
+**05:02.754** total execution time for **how_to_tune_with_autoscheduler** files:
+
+- **02:29.207**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``)
+- **01:19.500**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``)
+- **00:40.516**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``)
+- **00:16.464**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``)
+- **00:08.597**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``)
+- **00:08.472**: :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``)
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
index dec67e0ea..ca3c9fa71 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
@@ -187,7 +187,7 @@ file and apply it.
 
  .. code-block:: none
 
-
+    .T
 
 
 
@@ -222,45 +222,483 @@ cooperative fetching, unrolling and operator fusion.
                  compute: Buffer(compute_2: Pointer(float32), float32, [25088], [])}
       buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute}
       preflattened_buffer_map = {data_1: data_3: Buffer(data_2, float32, [1, 512, 7, 7], []), kernel_1: kernel_3: Buffer(kernel_2, float32, [512, 512, 3, 3], []), bias_1: bias_3: Buffer(bias_2, float32, [1, 512, 1, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [1, 512, 7, 7], [])} {
-      attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 64;
-      allocate(conv2d_nchw: Pointer(local float32), float32, [8]), storage_scope = local;
-      allocate(pad_temp.shared: Pointer(shared float32), float32, [6272]), storage_scope = shared;
-      allocate(kernel.shared: Pointer(shared float32), float32, [1024]), storage_scope = shared;
-      attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49 {
-        for (ff.outer.inner.init: int32, 0, 4) {
-          for (ff.inner.init: int32, 0, 2) {
-            conv2d_nchw_1: Buffer(conv2d_nchw, float32, [8], [], scope="local", align=32)[((ff.outer.inner.init*2) + ff.inner.init)] = 0f32
-          }
-        }
-        for (rc.outer.outer: int32, 0, 4) {
+      attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 28;
+      allocate(conv2d_nchw: Pointer(local float32), float32, [14]), storage_scope = local;
+      allocate(pad_temp.shared: Pointer(shared float32), float32, [72]), storage_scope = shared;
+      allocate(kernel.shared: Pointer(shared float32), float32, [3072]), storage_scope = shared;
+      attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
+        conv2d_nchw_1: Buffer(conv2d_nchw, float32, [14], [], scope="local", align=32)[0] = 0f32
+        conv2d_nchw_1[1] = 0f32
+        conv2d_nchw_1[2] = 0f32
+        conv2d_nchw_1[3] = 0f32
+        conv2d_nchw_1[4] = 0f32
+        conv2d_nchw_1[5] = 0f32
+        conv2d_nchw_1[6] = 0f32
+        conv2d_nchw_1[7] = 0f32
+        conv2d_nchw_1[8] = 0f32
+        conv2d_nchw_1[9] = 0f32
+        conv2d_nchw_1[10] = 0f32
+        conv2d_nchw_1[11] = 0f32
+        conv2d_nchw_1[12] = 0f32
+        conv2d_nchw_1[13] = 0f32
+        for (rc.outer.outer: int32, 0, 64) {
           for (ry.outer.outer: int32, 0, 3) {
-            for (rx.outer.outer: int32, 0, 3) {
-              for (ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer: int32, 0, 128) {
-                let cse_var_1: int32 = (ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer*49)
-                attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
-                pad_temp.shared_1: Buffer(pad_temp.shared, float32, [6272], [], scope="shared")[(cse_var_1 + threadIdx.x_1)] = @tir.if_then_else(((((1 <= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) && ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[((((((rc.outer.outer*6272) + cse_var_1) + (ry.outer.outer*7)) + rx.outer.outer) + threadIdx.x_1) - 8)], 0f32, dtype= [...]
-              }
-              for (ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer_1: int32, 0, 21) {
-                attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 49;
-                if @tir.likely((((ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer_1*49) + threadIdx.x_2) < 1024), dtype=bool) {
-                  kernel.shared_1: Buffer(kernel.shared, float32, [1024], [], scope="shared")[((ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer_1*49) + threadIdx.x_2)] = kernel[((((((blockIdx.x*36864) + (floordiv(((ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer_1*49) + threadIdx.x_2), 128)*4608)) + (rc.outer.outer*1152)) + (floormod(((ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer_1*49) + threadIdx.x_2), 128)*9)) + (ry.outer.outer*3)) + rx.outer.outer)]
+            let cse_var_2: int32 = (rc.outer.outer*72)
+            let cse_var_1: int32 = (ry.outer.outer*3)
+             {
+              attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
+                if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
+                  pad_temp.shared_1: Buffer(pad_temp.shared, float32, [72], [], scope="shared")[(threadIdx.x_1*4)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1*4), 9))) && (floormod((threadIdx.x_1*4), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1*4), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1*4), 9)) - 8)], 0f3 [...]
                 }
-              }
-              for (rc.outer.inner: int32, 0, 2) {
-                for (ff.outer.inner: int32, 0, 4) {
-                  for (rc.inner: int32, 0, 64) {
-                    for (ff.inner: int32, 0, 2) {
-                      let cse_var_2: int32 = ((ff.outer.inner*2) + ff.inner)
-                      conv2d_nchw_1[cse_var_2] = (conv2d_nchw_1[cse_var_2] + (pad_temp.shared_1[(((rc.outer.inner*3136) + (rc.inner*49)) + threadIdx.x)]*kernel.shared_1[((((ff.outer.inner*256) + (ff.inner*128)) + (rc.outer.inner*64)) + rc.inner)]))
-                    }
-                  }
+                if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
+                  pad_temp.shared_1[((threadIdx.x_1*4) + 1)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 1), 9))) && (floormod(((threadIdx.x_1*4) + 1), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 1), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 1), 9)) - 8)], 0f32, dtype=float32)
+                }
+                if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
+                  pad_temp.shared_1[((threadIdx.x_1*4) + 2)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 2), 9))) && (floormod(((threadIdx.x_1*4) + 2), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 2), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 2), 9)) - 8)], 0f32, dtype=float32)
+                }
+                if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
+                  pad_temp.shared_1[((threadIdx.x_1*4) + 3)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 3), 9))) && (floormod(((threadIdx.x_1*4) + 3), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 3), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 3), 9)) - 8)], 0f32, dtype=float32)
                 }
               }
+              attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1: Buffer(kernel.shared, float32, [3072], [], scope="shared")[threadIdx.x_2] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 64)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 8), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 64), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 128)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 16), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 128), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 192)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 36864)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 256)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 32), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 256), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 320)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 40), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 320), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 384)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 73728)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 448)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 56), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 448), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 512)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 64), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 512), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 576)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 110592)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 640)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 80), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 640), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 704)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 88), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 704), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 768)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 147456)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 832)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 104), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 832), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 896)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 112), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 896), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 960)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 184320)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 1024)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 128), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1024), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 1088)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 136), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1088), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 1152)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 221184)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 1216)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 152), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1216), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 1280)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 160), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1280), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 258048)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 1408)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 176), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1408), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 1472)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 184), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1472), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 1536)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 294912)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 1600)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 200), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1600), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 1664)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 208), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1664), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 1728)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 331776)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 224), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1792), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 1856)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 232), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1856), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 1920)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 368640)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 1984)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 248), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1984), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 2048)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 256), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2048), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 2112)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 405504)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 2176)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 272), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2176), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 280), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2240), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 2304)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 442368)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 2368)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 296), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2368), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 2432)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 304), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2432), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 2496)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 479232)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 2560)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 320), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2560), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 2624)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 328), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2624), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 516096)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 2752)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 344), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2752), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 2816)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 352), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2816), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 2880)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 552960)]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 2944)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 368), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2944), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+              kernel.shared_1[(threadIdx.x_2 + 3008)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 376), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 3008), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[0]*kernel.shared_1[(threadIdx.x*48)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[1]*kernel.shared_1[(threadIdx.x*48)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[2]*kernel.shared_1[(threadIdx.x*48)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[3]*kernel.shared_1[(threadIdx.x*48)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[4]*kernel.shared_1[(threadIdx.x*48)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[5]*kernel.shared_1[(threadIdx.x*48)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[6]*kernel.shared_1[(threadIdx.x*48)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[0]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 47)]))
             }
           }
         }
-        for (i1.inner: int32, 0, 8) {
-          compute[(((blockIdx.x*392) + (i1.inner*49)) + threadIdx.x)] = max((conv2d_nchw_1[i1.inner] + bias[((blockIdx.x*8) + i1.inner)]), 0f32)
+        for (i1.inner: int32, 0, 2) {
+          for (i3.inner: int32, 0, 7) {
+            compute[(((((floordiv(blockIdx.x, 7)*6272) + (threadIdx.x*98)) + (i1.inner*49)) + (floormod(blockIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[((i1.inner*7) + i3.inner)] + bias[(((floordiv(blockIdx.x, 7)*128) + (threadIdx.x*2)) + i1.inner)]), 0f32)
+          }
         }
       }
     }
@@ -313,7 +751,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 0.374 ms
+    Execution time of this operator: 0.361 ms
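
The figure above is produced by TVM's ``time_evaluator``. A minimal sketch of how such a measurement is typically taken, assuming ``func`` (the compiled kernel), ``dev``, and the argument arrays from the earlier build-and-verify step are in scope:

 .. code-block:: python

    import numpy as np

    # Hedged sketch: `func`, `dev`, `data_tvm`, `weight_tvm`, `bias_tvm`, and
    # `out_tvm` are assumed to exist from the earlier steps of this tutorial.
    # min_repeat_ms=500 repeats the kernel until each measurement batch runs
    # at least 500 ms, reducing timer noise on fast GPU kernels.
    evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
    ms = np.median(evaluator(data_tvm, weight_tvm, bias_tvm, out_tvm).results) * 1000
    print("Execution time of this operator: %.3f ms" % ms)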
 
 
 
@@ -357,36 +795,36 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     conv2d_nchw_nn_o_o_i, conv2d_nchw_nn_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_i, factor=1)
     conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
     conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
-    conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=2)
-    conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=4)
-    conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=1)
+    conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
+    conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
+    conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=64)
     conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
     conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
     conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
-    conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
+    conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
     conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
     conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
-    conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
-    conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
+    conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
+    conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
     conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
-    conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=64)
-    conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=2)
+    conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
+    conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=4)
     conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
     conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
     conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
-    conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=1)
+    conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=3)
     s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2 [...]
     compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
     compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
     compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
-    compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=8)
-    compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=1)
+    compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
+    compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
     compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
     compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
-    compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
+    compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
     compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-    compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
-    compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
+    compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
+    compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
     compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
     s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
     s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -406,14 +844,14 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
     kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
     s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=49)
+    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
     s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
     pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
-    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
+    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
     s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=49)
+    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
     s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
-    s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 0)
+    s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 512)
     s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
 
     CUDA source code:
@@ -431,42 +869,430 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
       #define int64_t long long
       #define uint64_t unsigned long long
     #endif
-    extern "C" __global__ void __launch_bounds__(49) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
-      float conv2d_nchw[8];
-      __shared__ float pad_temp_shared[6272];
-      __shared__ float kernel_shared[1024];
-      for (int ff_outer_inner_init = 0; ff_outer_inner_init < 4; ++ff_outer_inner_init) {
-        for (int ff_inner_init = 0; ff_inner_init < 2; ++ff_inner_init) {
-          conv2d_nchw[((ff_outer_inner_init * 2) + ff_inner_init)] = 0.000000e+00f;
-        }
-      }
-      for (int rc_outer_outer = 0; rc_outer_outer < 4; ++rc_outer_outer) {
+    extern "C" __global__ void __launch_bounds__(64) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+      float conv2d_nchw[14];
+      __shared__ float pad_temp_shared[72];
+      __shared__ float kernel_shared[3072];
+      conv2d_nchw[0] = 0.000000e+00f;
+      conv2d_nchw[1] = 0.000000e+00f;
+      conv2d_nchw[2] = 0.000000e+00f;
+      conv2d_nchw[3] = 0.000000e+00f;
+      conv2d_nchw[4] = 0.000000e+00f;
+      conv2d_nchw[5] = 0.000000e+00f;
+      conv2d_nchw[6] = 0.000000e+00f;
+      conv2d_nchw[7] = 0.000000e+00f;
+      conv2d_nchw[8] = 0.000000e+00f;
+      conv2d_nchw[9] = 0.000000e+00f;
+      conv2d_nchw[10] = 0.000000e+00f;
+      conv2d_nchw[11] = 0.000000e+00f;
+      conv2d_nchw[12] = 0.000000e+00f;
+      conv2d_nchw[13] = 0.000000e+00f;
+      for (int rc_outer_outer = 0; rc_outer_outer < 64; ++rc_outer_outer) {
         for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
-          for (int rx_outer_outer = 0; rx_outer_outer < 3; ++rx_outer_outer) {
-            __syncthreads();
-            for (int ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = 0; ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer < 128; ++ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer) {
-              pad_temp_shared[((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) + ((int)threadIdx.x))] = (((((1 <= ((((int)threadIdx.x) / 7) + ry_outer_outer)) && (((((int)threadIdx.x) / 7) + ry_outer_outer) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 6272) + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49)) + (ry_outer_outer * 7)) + rx_outer_outer) + ((int)threadIdx.x)) - 8)] : 0 [...]
-            }
-            for (int ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer1 = 0; ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer1 < 21; ++ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer1) {
-              if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer1 * 49) + ((int)threadIdx.x)) < 1024) {
-                kernel_shared[((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer1 * 49) + ((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) * 36864) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer1 * 49) + ((int)threadIdx.x)) >> 7) * 4608)) + (rc_outer_outer * 1152)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer1 * 49) + ((int)threadIdx.x)) & 127) * 9)) + (ry_outer_outer * 3)) + rx_outer_outer)];
-              }
-            }
-            __syncthreads();
-            for (int rc_outer_inner = 0; rc_outer_inner < 2; ++rc_outer_inner) {
-              for (int ff_outer_inner = 0; ff_outer_inner < 4; ++ff_outer_inner) {
-                for (int rc_inner = 0; rc_inner < 64; ++rc_inner) {
-                  for (int ff_inner = 0; ff_inner < 2; ++ff_inner) {
-                    conv2d_nchw[((ff_outer_inner * 2) + ff_inner)] = (conv2d_nchw[((ff_outer_inner * 2) + ff_inner)] + (pad_temp_shared[(((rc_outer_inner * 3136) + (rc_inner * 49)) + ((int)threadIdx.x))] * kernel_shared[((((ff_outer_inner * 256) + (ff_inner * 128)) + (rc_outer_inner * 64)) + rc_inner)]));
-                  }
-                }
-              }
-            }
+          __syncthreads();
+          if (((int)threadIdx.x) < 18) {
+            pad_temp_shared[(((int)threadIdx.x) * 4)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) * 4) % 9))) && (((((int)threadIdx.x) * 4) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) * 4) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) * 4) % 9)) - 8)] : 0.000000e+00f);
+          }
+          if (((int)threadIdx.x) < 18) {
+            pad_temp_shared[((((int)threadIdx.x) * 4) + 1)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 1) % 9))) && ((((((int)threadIdx.x) * 4) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 1) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 1) % 9)) - 8)] : 0.000000e+00f);
+          }
+          if (((int)threadIdx.x) < 18) {
+            pad_temp_shared[((((int)threadIdx.x) * 4) + 2)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 2) % 9))) && ((((((int)threadIdx.x) * 4) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 2) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 2) % 9)) - 8)] : 0.000000e+00f);
           }
+          if (((int)threadIdx.x) < 18) {
+            pad_temp_shared[((((int)threadIdx.x) * 4) + 3)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 3) % 9))) && ((((((int)threadIdx.x) * 4) + 3) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 3) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 3) % 9)) - 8)] : 0.000000e+00f);
+          }
+          kernel_shared[((int)threadIdx.x)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 64)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 64) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 128)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 128) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 192)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 36864)];
+          kernel_shared[(((int)threadIdx.x) + 256)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 256) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 320)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 320) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 384)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 73728)];
+          kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 512)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 512) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 576)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 110592)];
+          kernel_shared[(((int)threadIdx.x) + 640)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 640) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 704)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 704) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 768)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 147456)];
+          kernel_shared[(((int)threadIdx.x) + 832)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 832) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 960)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 184320)];
+          kernel_shared[(((int)threadIdx.x) + 1024)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1024) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 1088)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1088) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 1152)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 221184)];
+          kernel_shared[(((int)threadIdx.x) + 1216)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1216) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 1280)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1280) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 258048)];
+          kernel_shared[(((int)threadIdx.x) + 1408)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1408) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 1472)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1472) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 1536)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 294912)];
+          kernel_shared[(((int)threadIdx.x) + 1600)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1600) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 1664)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1664) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 1728)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 331776)];
+          kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 1856)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1856) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 1920)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 368640)];
+          kernel_shared[(((int)threadIdx.x) + 1984)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1984) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 2048)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2048) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 2112)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 405504)];
+          kernel_shared[(((int)threadIdx.x) + 2176)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2176) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 2304)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 442368)];
+          kernel_shared[(((int)threadIdx.x) + 2368)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2368) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 2432)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2432) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 2496)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 479232)];
+          kernel_shared[(((int)threadIdx.x) + 2560)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2560) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 2624)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2624) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 516096)];
+          kernel_shared[(((int)threadIdx.x) + 2752)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2752) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 2816)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2816) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 2880)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 552960)];
+          kernel_shared[(((int)threadIdx.x) + 2944)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2944) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 3008)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3008) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+          __syncthreads();
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[0] * kernel_shared[(((int)threadIdx.x) * 48)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[1] * kernel_shared[(((int)threadIdx.x) * 48)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[2] * kernel_shared[(((int)threadIdx.x) * 48)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[3] * kernel_shared[(((int)threadIdx.x) * 48)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[4] * kernel_shared[(((int)threadIdx.x) * 48)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[5] * kernel_shared[(((int)threadIdx.x) * 48)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[6] * kernel_shared[(((int)threadIdx.x) * 48)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[0] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
         }
       }
-      for (int i1_inner = 0; i1_inner < 8; ++i1_inner) {
-        compute[(((((int)blockIdx.x) * 392) + (i1_inner * 49)) + ((int)threadIdx.x))] = max((conv2d_nchw[i1_inner] + bias[((((int)blockIdx.x) * 8) + i1_inner)]), 0.000000e+00f);
+      for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
+        for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
+          compute[((((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 98)) + (i1_inner * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) * 2)) + i1_inner)]), 0.000000e+00f);
+        }
       }
     }
 
@@ -525,7 +1351,7 @@ In the example below we resume the status and do 5 more trials.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 2 minutes  21.677 seconds)
+   **Total running time of the script:** ( 2 minutes  29.207 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py:
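For context, the "resume the status and do 5 more trials" step that the timing
above measures looks roughly like this in the auto-scheduler API (a minimal
sketch; ``task`` and ``log_file`` are the search task and record file defined
earlier in that tutorial):

.. code-block:: python

    from tvm import auto_scheduler

    # Rebuild the cost model and search policy from the existing log,
    # then run 5 more measurement trials appended to the same file.
    cost_model = auto_scheduler.XGBModel()
    cost_model.update_from_file(log_file)
    search_policy = auto_scheduler.SketchPolicy(
        task,
        cost_model,
        init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)],
    )
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=5,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    task.tune(tune_option, search_policy=search_policy)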
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
index 0da923ad5..ae25c873f 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
@@ -614,7 +614,7 @@ so we can read the log file and load the best schedules.
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-       9.4762       9.4668       9.5029       9.4590       0.0191   
+       9.9205       9.9307       9.9435       9.8873       0.0241   
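The execution-time summary above is produced by compiling the network with the
tuned records applied and then benchmarking it; the x86 variant further down
follows the same pattern. A minimal sketch, assuming the tutorial's ``mod``,
``params``, ``target``, and ``log_file``:

.. code-block:: python

    import tvm
    from tvm import relay, auto_scheduler
    from tvm.contrib import graph_executor

    # Compile with the best schedules from the log applied, then benchmark.
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
        ):
            lib = relay.build(mod, target=target, params=params)

    dev = tvm.device(str(target), 0)
    module = graph_executor.GraphModule(lib["default"](dev))
    print(module.benchmark(dev, repeat=3, min_repeat_ms=500))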
                
 
 
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
index c1d420604..b8d894af4 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
@@ -633,7 +633,7 @@ so we can read the log file and load the best schedules.
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      766.3483     762.8012     773.6176     762.6261      5.1407   
+      750.3560     751.6426     751.9646     747.4609      2.0514   
                
 
 
@@ -658,7 +658,7 @@ Other Tips
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  20.086 seconds)
+   **Total running time of the script:** ( 1 minutes  19.500 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_x86.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
index 5fbdfc22e..d8c13d98d 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
@@ -362,77 +362,31 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
                  placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [65536], []),
                  compute: Buffer(compute_2: Pointer(float32), float32, [65536], [])}
       buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute}
-      preflattened_buffer_map = {placeholder_9: placeholder_15: Buffer(placeholder_14, float32, [128, 512], []), placeholder_7: placeholder_16: Buffer(placeholder_12, int32, [4916], []), placeholder_8: placeholder_17: Buffer(placeholder_13, int32, [33], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_5: placeholder_18: Buffer(placeholder_10, float32, [128, 256], []), placeholder_6: placeholder_19: Buffer(placeholder_11, float32, [4916, 16, 1], [])} {
-      for (i0.outer.i1.outer.fused: int32, 0, 32) "parallel" {
-        allocate(compute_4: Pointer(global float32), float32, [2048]), storage_scope = global {
-          for (nb_j.inner: int32, 0, 2) {
-            for (i.inner.init: int32, 0, 64) {
-              let cse_var_1: int32 = ((i.inner.init*32) + (nb_j.inner*16))
-               {
-                compute_5: Buffer(compute_4, float32, [2048], [])[cse_var_1] = 0f32
-                compute_5[(cse_var_1 + 1)] = 0f32
-                compute_5[(cse_var_1 + 2)] = 0f32
-                compute_5[(cse_var_1 + 3)] = 0f32
-                compute_5[(cse_var_1 + 4)] = 0f32
-                compute_5[(cse_var_1 + 5)] = 0f32
-                compute_5[(cse_var_1 + 6)] = 0f32
-                compute_5[(cse_var_1 + 7)] = 0f32
-                compute_5[(cse_var_1 + 8)] = 0f32
-                compute_5[(cse_var_1 + 9)] = 0f32
-                compute_5[(cse_var_1 + 10)] = 0f32
-                compute_5[(cse_var_1 + 11)] = 0f32
-                compute_5[(cse_var_1 + 12)] = 0f32
-                compute_5[(cse_var_1 + 13)] = 0f32
-                compute_5[(cse_var_1 + 14)] = 0f32
-                compute_5[(cse_var_1 + 15)] = 0f32
+      preflattened_buffer_map = {compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_9: placeholder_15: Buffer(placeholder_14, float32, [128, 512], []), placeholder_6: placeholder_16: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_7: placeholder_17: Buffer(placeholder_12, int32, [4916], []), placeholder_8: placeholder_18: Buffer(placeholder_13, int32, [33], []), placeholder_5: placeholder_19: Buffer(placeholder_10, float32, [128, 256], [])} {
+      for (i0.outer.i1.outer.fused: int32, 0, 16) "parallel" {
+        allocate(compute_4: Pointer(global float32), float32, [4096]), storage_scope = global {
+          for (i.outer.inner: int32, 0, 32) {
+            for (nb_j.inner: int32, 0, 2) {
+              for (i.inner.init: int32, 0, 4) {
+                for (j.init: int32, 0, 16) {
+                  compute_5: Buffer(compute_4, float32, [4096], [])[((((i.outer.inner*128) + (i.inner.init*32)) + (nb_j.inner*16)) + j.init)] = 0f32
+                }
               }
-            }
-            for (elem_idx: int32, 0, let cse_var_2: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) in (placeholder_3[(cse_var_2 + 1)] - placeholder_3[cse_var_2])) {
-              for (i.inner: int32, 0, 64) {
-                let cse_var_21: int32 = (elem_idx*16)
-                let cse_var_20: int32 = ((i.inner*32) + (nb_j.inner*16))
-                let cse_var_19: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)
-                let cse_var_18: int32 = (cse_var_20 + 1)
-                let cse_var_17: int32 = (cse_var_20 + 11)
-                let cse_var_16: int32 = (cse_var_20 + 12)
-                let cse_var_15: int32 = (cse_var_20 + 13)
-                let cse_var_14: int32 = (cse_var_20 + 14)
-                let cse_var_13: int32 = (cse_var_20 + 15)
-                let cse_var_12: int32 = (cse_var_20 + 2)
-                let cse_var_11: int32 = (cse_var_20 + 3)
-                let cse_var_10: int32 = (cse_var_20 + 4)
-                let cse_var_9: int32 = (cse_var_20 + 5)
-                let cse_var_8: int32 = (cse_var_20 + 6)
-                let cse_var_7: int32 = (cse_var_20 + 7)
-                let cse_var_6: int32 = (cse_var_20 + 8)
-                let cse_var_5: int32 = (cse_var_20 + 9)
-                let cse_var_4: int32 = ((floordiv(i0.outer.i1.outer.fused, 16)*16384) + (i.inner*256))
-                let cse_var_3: int32 = (cse_var_20 + 10)
-                 {
-                  compute_5[cse_var_20] = (compute_5[cse_var_20] + (placeholder_1[((placeholder_3[cse_var_19]*16) + cse_var_21)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_18] = (compute_5[cse_var_18] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 1)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 2)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 3)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 4)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 5)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 6)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 7)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 8)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 9)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 10)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 11)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 12)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 13)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 14)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-                  compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 15)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
+              for (elem_idx: int32, 0, let cse_var_1: int32 = ((i0.outer.i1.outer.fused*2) + nb_j.inner) in (placeholder_3[(cse_var_1 + 1)] - placeholder_3[cse_var_1])) {
+                for (i.inner: int32, 0, 4) {
+                  for (j: int32, 0, 16) {
+                    let cse_var_3: int32 = ((i0.outer.i1.outer.fused*2) + nb_j.inner)
+                    let cse_var_2: int32 = ((((i.outer.inner*128) + (i.inner*32)) + (nb_j.inner*16)) + j)
+                    compute_5[cse_var_2] = (compute_5[cse_var_2] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + j)]*max(placeholder[(((i.outer.inner*1024) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+                  }
                 }
               }
             }
           }
-          for (i0.inner: int32, 0, 64) {
+          for (i0.inner: int32, 0, 128) {
             for (i1.inner: int32, 0, 32) {
-              let cse_var_22: int32 = ((((floordiv(i0.outer.i1.outer.fused, 16)*32768) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32)) + i1.inner)
-              compute[cse_var_22] = max((compute_5[((i0.inner*32) + i1.inner)] + placeholder_4[cse_var_22]), 0f32)
+              let cse_var_4: int32 = (((i0.inner*512) + (i0.outer.i1.outer.fused*32)) + i1.inner)
+              compute[cse_var_4] = max((compute_5[((i0.inner*32) + i1.inner)] + placeholder_4[cse_var_4]), 0f32)
             }
           }
         }
@@ -487,7 +441,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 1.840 ms
+    Execution time of this operator: 1.446 ms
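That figure comes from building the best schedule in the log and timing the
sparse kernel. A minimal sketch, where ``task``, ``log_file``, and ``target``
are from the tutorial and ``tvm_arrays`` is a hypothetical name standing in
for its ``tvm.nd.array`` inputs and outputs:

.. code-block:: python

    import numpy as np
    import tvm

    # Build the best schedule found in the log and time the sparse kernel.
    sch, args = task.apply_best(log_file)
    func = tvm.build(sch, args, target)
    dev = tvm.cpu()

    evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
    costs = evaluator(*tvm_arrays).results  # tvm_arrays: the nd.array args
    print("Execution time of this operator: %.3f ms" % (np.median(costs) * 1000))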
 
 
 
diff --git a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
index 01c11a85f..2112b02dd 100644
--- a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:43.877** total execution time for **how_to_tune_with_autotvm** files:
+**00:44.901** total execution time for **how_to_tune_with_autotvm** files:
 
-- **00:43.046**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)
-- **00:00.211**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)
-- **00:00.208**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``)
-- **00:00.207**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_mobile_gpu.py` (``tune_relay_mobile_gpu.py``)
-- **00:00.205**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_arm.py` (``tune_relay_arm.py``)
+- **00:44.047**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)
+- **00:00.223**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)
+- **00:00.215**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_mobile_gpu.py` (``tune_relay_mobile_gpu.py``)
+- **00:00.209**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_arm.py` (``tune_relay_arm.py``)
+- **00:00.207**: :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``)
diff --git a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
index 6981ae6cf..2c3c25423 100644
--- a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
@@ -859,8 +859,8 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 4, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2885496
-    No: 6   GFLOPS: 96.35/96.35     result: MeasureResult(costs=(0.002402670208333333,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.5966920852661133, timestamp=1650540209.386137)        [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3754080
-    No: 7   GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+    No: 6   GFLOPS: 110.06/110.06   result: MeasureResult(costs=(0.0021034644791666666,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.6358001232147217, timestamp=1650555136.586403)       [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3754080
+    No: 7   GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -983,7 +983,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 16, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 256, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6225319
-    No: 8   GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+    No: 8   GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -1106,7 +1106,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 1, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,943546
-    No: 9   GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+    No: 9   GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -1229,7 +1229,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 16, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2868708
-    No: 10  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+    No: 10  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 142, in build
         res = future.result()
       File "/usr/lib/python3.7/concurrent/futures/_base.py", line 435, in result
@@ -1247,7 +1247,7 @@ for this template
     TimeoutError
 
             [('tile_f', [-1, 32, 2, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4691833
-    No: 11  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+    No: 11  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -1370,7 +1370,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 2, 64]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1042124
-    No: 12  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+    No: 12  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -1493,7 +1493,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 32, 1, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10013405
-    No: 13  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+    No: 13  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -1616,7 +1616,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 8, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6732082
-    No: 14  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+    No: 14  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -1739,7 +1739,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 4, 32]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7536735
-    No: 15  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+    No: 15  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -1862,7 +1862,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 1, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 128, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,482121
-    No: 16  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+    No: 16  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -1985,7 +1985,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 1, 16]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 32, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2824525
-    No: 17  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+    No: 17  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -2108,7 +2108,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 64, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4559286
-    No: 18  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+    No: 18  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 571, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 523, in _build_func_common
@@ -2231,7 +2231,7 @@ for this template
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 854, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
     tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 32, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9677544
-    No: 19  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+    No: 19  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 721, in __call__
         yield remote, remote.load_module(os.path.split(build_result.filename)[1])
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 685, in run_through_rpc
@@ -2319,7 +2319,7 @@ for this template
       15: _PyEval_EvalFrameDefault
       14: 0x0000000000537c30
       13: _PyObject_FastCallKeywords
-      12: 0x00007facecca0fa2
+      12: 0x00007fcd9bcc9fa2
       11: _ctypes_callproc
       10: ffi_call
       9: ffi_call_unix64
@@ -2384,7 +2384,7 @@ for this template
       21: _PyFunction_FastCallKeywords
       20: _PyEval_EvalFrameDefault
       19: _PyFunction_FastCall      [('tile_f', [-1, 8, 2, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6390073
-    No: 20  GFLOPS: 144.26/144.26   result: MeasureResult(costs=(0.00160480132,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4118306636810303, timestamp=1650540235.138862)       [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
+    No: 20  GFLOPS: 143.01/143.01   result: MeasureResult(costs=(0.0016188322999999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4208097457885742, timestamp=1650555162.9244173)      [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
 
 
 
@@ -2437,7 +2437,7 @@ and measure running time.
 
     Best config:
     [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
-    Time cost of this operator: 0.002011
+    Time cost of this operator: 0.002010
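The measurement above replays the best record from the log and rebuilds the
kernel with it; a minimal sketch, assuming the tutorial's
``conv2d_no_batching`` template and the workload parameters (``N``, ``H``,
``W``, ``CO``, ``CI``, ``KH``, ``KW``, ``strides``, ``padding``) it defines:

.. code-block:: python

    import tvm
    from tvm import autotvm

    # Replay the best record from the log and rebuild the conv2d kernel.
    with autotvm.apply_history_best("conv2d.log"):
        with tvm.target.Target("cuda"):
            s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
            func = tvm.build(s, arg_bufs)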
 
 
 
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
index 668c0ff2f..da9eeb79b 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
@@ -292,10 +292,10 @@ Timing the untuned program
     ########## Build without Autotuning ##########
     Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  
     ---------                                     ---                                           --------  -------  -----              ------  -------  
-    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  311.4     98.723   (1, 2, 10, 10, 3)  2       1        
-    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.128     0.992    (1, 6, 10, 10)     1       1        
-    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.901     0.286    (1, 1, 10, 10, 3)  1       1        
-    Total_time                                    -                                             315.429   -        -                  -       -        
+    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  579.8     98.933   (1, 2, 10, 10, 3)  2       1        
+    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       4.755     0.811    (1, 6, 10, 10)     1       1        
+    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         1.501     0.256    (1, 1, 10, 10, 3)  1       1        
+    Total_time                                    -                                             586.056   -        -                  -       -        
 
 
 
@@ -357,10 +357,10 @@ Timing the tuned program
     ########## Build with Autotuning ##########
     Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  
     ---------                                     ---                                           --------  -------  -----              ------  -------  
-    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  89.75     97.082   (1, 6, 10, 10, 1)  2       1        
-    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.752     1.895    (1, 6, 10, 10)     1       1        
-    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.946     1.024    (1, 1, 10, 10, 3)  1       1        
-    Total_time                                    -                                             92.448    -        -                  -       -        
+    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  211.9     98.68    (1, 1, 10, 10, 6)  2       1        
+    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.933     0.9      (1, 6, 10, 10)     1       1        
+    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.901     0.42     (1, 3, 10, 10, 1)  1       1        
+    Total_time                                    -                                             214.734   -        -                  -       -        
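These per-node tables are printed by the debug executor. The microTVM tutorial
drives it on-device through a ``tvm.micro.Session``; shown here is a host-side
sketch of the same profiling step, assuming a compiled factory module ``lib``
and device ``dev``:

.. code-block:: python

    from tvm.contrib.debugger import debug_executor

    # Per-node profiling with the debug executor; run() prints a node-time
    # breakdown in the same Node Name / Time(us) / Time(%) format as above.
    gmod = debug_executor.create(lib.get_graph_json(), lib.get_lib(), dev)
    gmod.set_input(**lib.get_params())
    gmod.run()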
 
 
 
diff --git a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
index d9ff7d816..5273a77e8 100644
--- a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:44.271** total execution time for **how_to_work_with_microtvm** files:
+**00:43.890** total execution time for **how_to_work_with_microtvm** files:
 
-- **00:40.200**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``)
-- **00:03.503**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``)
-- **00:00.192**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_tvmc.py` (``micro_tvmc.py``)
-- **00:00.188**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_ethosu.py` (``micro_ethosu.py``)
-- **00:00.187**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_reference_vm.py` (``micro_reference_vm.py``)
+- **00:39.796**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``)
+- **00:03.507**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``)
+- **00:00.197**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_ethosu.py` (``micro_ethosu.py``)
+- **00:00.197**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_reference_vm.py` (``micro_reference_vm.py``)
+- **00:00.193**: :ref:`sphx_glr_how_to_work_with_microtvm_micro_tvmc.py` (``micro_tvmc.py``)
diff --git a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
index 5770ebb36..938867160 100644
--- a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
@@ -5,8 +5,8 @@
 
 Computation times
 =================
-**00:08.510** total execution time for **how_to_work_with_relay** files:
+**00:08.607** total execution time for **how_to_work_with_relay** files:
 
-- **00:06.691**: :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``)
-- **00:01.615**: :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``)
-- **00:00.205**: :ref:`sphx_glr_how_to_work_with_relay_using_relay_viz.py` (``using_relay_viz.py``)
+- **00:06.801**: :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``)
+- **00:01.601**: :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``)
+- **00:00.206**: :ref:`sphx_glr_how_to_work_with_relay_using_relay_viz.py` (``using_relay_viz.py``)
diff --git a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
index d4be3b03c..d06989787 100644
--- a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
@@ -5,13 +5,13 @@
 
 Computation times
 =================
-**00:05.600** total execution time for **how_to_work_with_schedules** files:
+**00:05.410** total execution time for **how_to_work_with_schedules** files:
 
-- **00:02.072**: :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``)
-- **00:01.161**: :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``)
-- **00:00.709**: :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``)
-- **00:00.706**: :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``)
-- **00:00.297**: :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``)
-- **00:00.225**: :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``)
-- **00:00.221**: :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``)
-- **00:00.209**: :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``)
+- **00:01.965**: :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``)
+- **00:01.091**: :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``)
+- **00:00.694**: :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``)
+- **00:00.678**: :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``)
+- **00:00.301**: :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``)
+- **00:00.235**: :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``)
+- **00:00.230**: :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``)
+- **00:00.215**: :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``)
diff --git a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
index 1e072eb12..f56fab3b4 100644
--- a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
@@ -318,7 +318,7 @@ The importing needs to happen before the tensorized GEMV is executed.
                  C: Buffer(C_2: Pointer(float32), float32, [524288], [])}
       buffer_map = {A_1: A, B_1: B, C_1: C}
       preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [1024, 64], []), B_1: B_3: Buffer(B_2, float32, [512, 64], []), C_1: C_3: Buffer(C_2, float32, [1024, 512], [])} {
-      attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmprng545fk/input0.cc'\nsource_filename = \"/tmp/tmprng545fk/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = alloca float*, align 8\n  %8 = alloca float*, align 8\n  %9 = alloca floa [...]
+      attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpbiyrqa8p/input0.cc'\nsource_filename = \"/tmp/tmpbiyrqa8p/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = alloca float*, align 8\n  %8 = alloca float*, align 8\n  %9 = alloca floa [...]
       for (i, 0, 1024) {
         for (j.outer: int32, 0, 32) {
           @tir.call_extern("gemv_update", @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
index c1da160b9..76a890c66 100644
--- a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,7 +5,7 @@
 
 Computation times
 =================
-**00:20.646** total execution time for **topic_vta_tutorials_autotvm** files:
+**00:20.319** total execution time for **topic_vta_tutorials_autotvm** files:
 
-- **00:20.450**: :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``)
-- **00:00.196**: :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``)
+- **00:20.119**: :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``)
+- **00:00.200**: :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``)
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
index 9f9715c65..686eab9b4 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
@@ -265,7 +265,7 @@ The compilation steps are:
       DeprecationWarning,
     /workspace/vta/tutorials/frontend/deploy_classification.py:213: DeprecationWarning: legacy graph executor behavior of producing json / lib / params will be removed in the next release. Please see documents of tvm.contrib.graph_executor.GraphModule for the  new recommended usage.
       relay_prog, target=tvm.target.Target(target, host=env.target_host), params=params
-    resnet18_v1 inference graph built in 21.59s!
+    resnet18_v1 inference graph built in 21.49s!
 
 
 
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
index 4f62abe8e..0e8121415 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
@@ -301,7 +301,7 @@ The compilation steps are:
 
     /workspace/python/tvm/relay/build_module.py:439: DeprecationWarning: Please use input parameter mod (tvm.IRModule) instead of deprecated parameter mod (tvm.relay.function.Function)
       DeprecationWarning,
-    yolov3-tiny inference graph built in 14.96s!
+    yolov3-tiny inference graph built in 14.94s!
 
 
 
diff --git a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
index df461aac6..cbe02643a 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,7 +5,7 @@
 
 Computation times
 =================
-**01:27.473** total execution time for **topic_vta_tutorials_frontend** files:
+**01:28.1000** total execution time for **topic_vta_tutorials_frontend** files:
 
-- **00:46.170**: :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``)
-- **00:41.303**: :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``)
+- **00:47.285**: :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``)
+- **00:41.714**: :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``)
diff --git a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
index 50cae4962..9bbce57c6 100644
--- a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,7 +5,7 @@
 
 Computation times
 =================
-**00:03.546** total execution time for **topic_vta_tutorials_optimize** files:
+**00:03.469** total execution time for **topic_vta_tutorials_optimize** files:
 
-- **00:02.996**: :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``)
-- **00:00.550**: :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``)
+- **00:02.956**: :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``)
+- **00:00.513**: :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``)
diff --git a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
index 5f6ef5730..e97a0e5ab 100644
--- a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
@@ -5,7 +5,7 @@
 
 Computation times
 =================
-**00:00.998** total execution time for **topic_vta_tutorials** files:
+**00:00.931** total execution time for **topic_vta_tutorials** files:
 
-- **00:00.500**: :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``)
-- **00:00.498**: :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``)
+- **00:00.477**: :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``)
+- **00:00.454**: :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``)
diff --git a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
index 40901535d..54fb4c027 100644
--- a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
@@ -185,7 +185,7 @@ trials, we can load the best schedule from the log file and apply it.
  .. code-block:: none
 
 
-
+    *E
 
 
 
@@ -306,7 +306,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 93.966 ms
+    Execution time of this operator: 93.387 ms
 
 
 
@@ -417,7 +417,7 @@ operations.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  9.812 seconds)
+   **Total running time of the script:** ( 1 minutes  10.896 seconds)
 
 
 .. _sphx_glr_download_tutorial_auto_scheduler_matmul_x86.py:
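The operator time above is obtained by building the best schedule from the log
and timing it; a minimal sketch, assuming the tutorial's ``task``,
``log_file``, ``target``, and its numpy arrays ``a_np``, ``b_np``, ``c_np``,
``out_np``:

.. code-block:: python

    import numpy as np
    import tvm

    # Build the best schedule found during tuning and measure it.
    sch, args = task.apply_best(log_file)
    func = tvm.build(sch, args, target)

    dev = tvm.cpu()
    a_tvm = tvm.nd.array(a_np, device=dev)
    b_tvm = tvm.nd.array(b_np, device=dev)
    c_tvm = tvm.nd.array(c_np, device=dev)
    out_tvm = tvm.nd.empty(out_np.shape, device=dev)
    evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
    print(
        "Execution time of this operator: %.3f ms"
        % (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000)
    )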
diff --git a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
index 26d7a029a..a418aa923 100644
--- a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
@@ -268,7 +268,7 @@ standard deviation.
 
  .. code-block:: none
 
-    {'mean': 494.3461641700105, 'median': 493.782350750007, 'std': 1.1968974665647765}
+    {'mean': 495.14908602999856, 'median': 494.9211825499958, 'std': 1.0668301378714276}
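This mean/median/std summary is computed with Python's ``timeit`` over the
graph module's ``run()``; a minimal sketch, assuming the compiled ``module``
from earlier in the tutorial:

.. code-block:: python

    import timeit
    import numpy as np

    timing_number = 10
    timing_repeat = 10
    # Per-run latency in milliseconds, averaged over `timing_number` runs.
    timings = (
        np.array(
            timeit.Timer(lambda: module.run()).repeat(
                repeat=timing_repeat, number=timing_number
            )
        )
        * 1000
        / timing_number
    )
    print({"mean": np.mean(timings), "median": np.median(timings), "std": np.std(timings)})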
 
 
 
@@ -482,31 +482,31 @@ the tuning data to.
 
  .. code-block:: none
 
-
    [Task  1/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  1/25]  Current/Best:   11.46/  23.38 GFLOPS | Progress: (4/10) | 5.81 s
    [Task  1/25]  Current/Best:    9.79/  23.38 GFLOPS | Progress: (8/10) | 10.26 s
    [Task  1/25]  Current/Best:   11.24/  23.38 GFLOPS | Progress: (10/10) | 11.72 s Done.
-
    [Task  2/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  2/25]  Current/Best:   16.70/  16.70 GFLOPS | Progress: (4/10) | 2.64 s
    [Task  2/25]  Current/Best:   16.16/  17.85 GFLOPS | Progress: (8/10) | 5.15 s
    [Task  2/25]  Current/Best:    6.68/  17.85 GFLOPS | Progress: (10/10) | 6.03 s Done.
-
    [Task  3/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  3/25]  Current/Best:    7.05/  22.97 GFLOPS | Progress: (4/10) | 3.06 s
    [Task  3/25]  Current/Best:   13.29/  22.97 GFLOPS | Progress: (8/10) | 5.57 s
    [Task  3/25]  Current/Best:    6.19/  22.97 GFLOPS | Progress: (10/10) | 6.66 s Done.
-
    [Task  4/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  4/25]  Current/Best:   12.09/  21.00 GFLOPS | Progress: (4/10) | 2.60 s
    [Task  4/25]  Current/Best:    8.45/  21.21 GFLOPS | Progress: (8/10) | 6.54 s
    [Task  4/25]  Current/Best:    6.46/  21.21 GFLOPS | Progress: (10/10) | 8.94 s Done.
-
    [Task  5/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  5/25]  Current/Best:   16.92/  16.92 GFLOPS | Progress: (4/10) | 2.72 s
    [Task  5/25]  Current/Best:    5.25/  16.92 GFLOPS | Progress: (8/10) | 4.84 s
    [Task  5/25]  Current/Best:    9.00/  17.14 GFLOPS | Progress: (10/10) | 5.59 s Done.
-
    [Task  6/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  6/25]  Current/Best:   23.00/  23.00 GFLOPS | Progress: (4/10) | 3.84 s
    [Task  6/25]  Current/Best:   13.16/  23.00 GFLOPS | Progress: (8/10) | 6.87 s
    [Task  6/25]  Current/Best:   11.91/  23.00 GFLOPS | Progress: (10/10) | 9.06 s Done.
-
    [Task  7/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  7/25]  Current/Best:   18.70/  22.00 GFLOPS | Progress: (4/10) | 2.72 s
    [Task  7/25]  Current/Best:   13.85/  22.00 GFLOPS | Progress: (8/10) | 7.32 s
    [Task  7/25]  Current/Best:   14.44/  22.00 GFLOPS | Progress: (10/10) | 8.40 s Done.
-
    [Task  8/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  8/25]  Current/Best:    9.34/   9.34 GFLOPS | Progress: (4/10) | 8.78 s
    [Task  8/25]  Current/Best:    2.06/   9.45 GFLOPS | Progress: (8/10) | 12.32 s
    [Task  8/25]  Current/Best:    5.16/  13.10 GFLOPS | Progress: (10/10) | 16.89 s Done.
-
    [Task  9/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  9/25]  Current/Best:   15.77/  21.83 GFLOPS | Progress: (4/10) | 3.47 s
    [Task  9/25]  Current/Best:   18.03/  21.83 GFLOPS | Progress: (8/10) | 19.81 s
    [Task  9/25]  Current/Best:   16.79/  21.83 GFLOPS | Progress: (10/10) | 21.80 s
    [Task 10/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 10/25]  Current/Best:   13.09/  13.09 GFLOPS | Progress: (4/10) | 3.10 s
    [Task 10/25]  Current/Best:   14.36/  16.97 GFLOPS | Progress: (8/10) | 5.00 s
    [Task 10/25]  Current/Best:   14.45/  18.62 GFLOPS | Progress: (10/10) | 5.82 s Done.
-
    [Task 11/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 11/25]  Current/Best:    8.40/  20.35 GFLOPS | Progress: (4/10) | 3.43 s
    [Task 11/25]  Current/Best:    9.57/  20.35 GFLOPS | Progress: (8/10) | 5.79 s
    [Task 11/25]  Current/Best:   12.65/  20.35 GFLOPS | Progress: (10/10) | 8.12 s Done.
-
    [Task 12/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 12/25]  Current/Best:   11.33/  19.62 GFLOPS | Progress: (4/10) | 2.49 s
    [Task 12/25]  Current/Best:   22.65/  22.65 GFLOPS | Progress: (8/10) | 4.48 s
    [Task 12/25]  Current/Best:   12.56/  22.65 GFLOPS | Progress: (10/10) | 5.67 s Done.
-
    [Task 13/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 13/25]  Current/Best:   20.39/  20.43 GFLOPS | Progress: (4/10) | 3.49 s
    [Task 13/25]  Current/Best:   12.26/  20.43 GFLOPS | Progress: (8/10) | 6.79 s
    [Task 13/25]  Current/Best:    1.57/  20.43 GFLOPS | Progress: (10/10) | 9.21 s Done.
-
    [Task 14/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 14/25]  Current/Best:   11.65/  12.17 GFLOPS | Progress: (4/10) | 4.22 s
    [Task 14/25]  Current/Best:    9.82/  17.31 GFLOPS | Progress: (8/10) | 6.78 s
    [Task 14/25]  Current/Best:   12.75/  17.31 GFLOPS | Progress: (10/10) | 8.39 s
    [Task 15/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s Done.
+
    [Task  1/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  1/25]  Current/Best:   12.94/  17.62 GFLOPS | Progress: (4/10) | 5.32 s
    [Task  1/25]  Current/Best:    8.59/  17.62 GFLOPS | Progress: (8/10) | 8.74 s
    [Task  1/25]  Current/Best:   17.64/  17.64 GFLOPS | Progress: (10/10) | 10.15 s Done.
+
    [Task  2/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  2/25]  Current/Best:    6.32/  11.46 GFLOPS | Progress: (4/10) | 2.50 s
    [Task  2/25]  Current/Best:   12.61/  23.21 GFLOPS | Progress: (8/10) | 3.84 s
    [Task  2/25]  Current/Best:   18.72/  23.21 GFLOPS | Progress: (10/10) | 4.34 s Done.
+
    [Task  3/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  3/25]  Current/Best:   12.54/  12.54 GFLOPS | Progress: (4/10) | 3.12 s
    [Task  3/25]  Current/Best:    9.15/  23.55 GFLOPS | Progress: (8/10) | 5.08 s
    [Task  3/25]  Current/Best:   11.13/  23.55 GFLOPS | Progress: (10/10) | 6.01 s Done.
+
    [Task  4/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  4/25]  Current/Best:    9.60/  18.34 GFLOPS | Progress: (4/10) | 2.57 s
    [Task  4/25]  Current/Best:    5.61/  18.34 GFLOPS | Progress: (8/10) | 7.09 s
    [Task  4/25]  Current/Best:   11.68/  18.34 GFLOPS | Progress: (10/10) | 8.18 s Done.
+
    [Task  5/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  5/25]  Current/Best:   10.99/  17.93 GFLOPS | Progress: (4/10) | 2.82 s
    [Task  5/25]  Current/Best:   13.24/  20.44 GFLOPS | Progress: (8/10) | 4.88 s
    [Task  5/25]  Current/Best:   13.52/  20.44 GFLOPS | Progress: (10/10) | 6.81 s Done.
+
    [Task  6/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  6/25]  Current/Best:    2.45/  18.31 GFLOPS | Progress: (4/10) | 3.54 s
    [Task  6/25]  Current/Best:   14.12/  18.31 GFLOPS | Progress: (8/10) | 6.26 s
    [Task  6/25]  Current/Best:   20.50/  20.50 GFLOPS | Progress: (10/10) | 8.66 s Done.
+
    [Task  7/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  7/25]  Current/Best:   12.27/  15.03 GFLOPS | Progress: (4/10) | 2.78 s
    [Task  7/25]  Current/Best:   17.14/  17.14 GFLOPS | Progress: (8/10) | 4.98 s
    [Task  7/25]  Current/Best:    3.14/  21.90 GFLOPS | Progress: (10/10) | 6.37 s Done.
+
    [Task  8/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  8/25]  Current/Best:    3.03/  20.86 GFLOPS | Progress: (4/10) | 4.69 s
    [Task  8/25]  Current/Best:   18.63/  20.86 GFLOPS | Progress: (8/10) | 9.03 s
    [Task  8/25]  Current/Best:   13.18/  20.86 GFLOPS | Progress: (10/10) | 10.56 s Done.
+
    [Task  9/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task  9/25]  Current/Best:    6.82/  18.19 GFLOPS | Progress: (4/10) | 3.53 s
    [Task  9/25]  Current/Best:    8.39/  22.93 GFLOPS | Progress: (8/10) | 4.85 s
    [Task  9/25]  Current/Best:   10.32/  22.93 GFLOPS | Progress: (10/10) | 5.70 s Done.
+
    [Task 10/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 10/25]  Current/Best:   18.41/  18.41 GFLOPS | Progress: (4/10) | 2.47 s
    [Task 10/25]  Current/Best:   13.37/  20.96 GFLOPS | Progress: (8/10) | 5.18 s
    [Task 10/25]  Current/Best:   14.12/  22.72 GFLOPS | Progress: (10/10) | 5.73 s Done.
+
    [Task 11/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 11/25]  Current/Best:   19.24/  21.04 GFLOPS | Progress: (4/10) | 3.45 s
    [Task 11/25]  Current/Best:    7.80/  21.08 GFLOPS | Progress: (8/10) | 5.27 s
    [Task 11/25]  Current/Best:    6.22/  21.08 GFLOPS | Progress: (10/10) | 7.02 s Done.
+
    [Task 12/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 12/25]  Current/Best:    6.38/  18.90 GFLOPS | Progress: (4/10) | 3.07 s
    [Task 12/25]  Current/Best:   18.60/  18.90 GFLOPS | Progress: (8/10) | 4.83 s
    [Task 12/25]  Current/Best:   14.25/  20.74 GFLOPS | Progress: (10/10) | 5.55 s Done.
+
    [Task 13/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 13/25]  Current/Best:   23.59/  23.59 GFLOPS | Progress: (4/10) | 3.50 s
    [Task 13/25]  Current/Best:   21.70/  23.59 GFLOPS | Progress: (8/10) | 6.48 s
    [Task 13/25]  Current/Best:   14.29/  23.59 GFLOPS | Progress: (10/10) | 7.40 s Done.
+
    [Task 14/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 14/25]  Current/Best:   10.80/  20.57 GFLOPS | Progress: (4/10) | 3.02 s
    [Task 14/25]  Current/Best:   13.54/  20.57 GFLOPS | Progress: (8/10) | 5.66 s
    [Task 14/25]  Current/Best:    5.51/  20.57 GFLOPS | Progress: (10/10) | 6.65 s Done.
+
    [Task 15/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 15/25]  Current/Best:   13.05/  13.05 GFLOPS | Progress: (4/10) | 3.29 s
    [Task 15/25]  Current/Best:   20.63/  20.63 GFLOPS | Progress: (8/10) | 6.02 s
    [Task 15/25]  Current/Best:   10.61/  20.63 GFLOPS | Progress: (10/10) | 8.50 s
    [Task 16/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 16/25]  Current/Best:    9.77/  14.15 GFLOPS | Progress: (4/10) | 4.09 s
    [Task 16/25]  Current/Best:   23.74/  23.74 GFLOPS | Progress: (8/10) | 6.48 s
    [Task 16/25]  Current/Best:   17.31/  23.74 GFLOPS | Progress: (10/10) | 7.06 s Done.
+
    [Task 17/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 17/25]  Current/Best:   15.84/  15.84 GFLOPS | Progress: (4/10) | 3.83 s Done.
+
    [Task 17/25]  Current/Best:   23.19/  23.19 GFLOPS | Progress: (8/10) | 6.02 s
    [Task 17/25]  Current/Best:   18.56/  23.19 GFLOPS | Progress: (10/10) | 6.91 s Done.
+
    [Task 18/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 18/25]  Current/Best:   19.96/  20.37 GFLOPS | Progress: (4/10) | 2.95 s
    [Task 18/25]  Current/Best:   10.19/  20.37 GFLOPS | Progress: (8/10) | 6.92 s
    [Task 18/25]  Current/Best:   17.27/  20.37 GFLOPS | Progress: (10/10) | 7.92 s Done.
+
    [Task 19/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 19/25]  Current/Best:    2.57/  18.02 GFLOPS | Progress: (4/10) | 4.31 s
    [Task 19/25]  Current/Best:   10.63/  18.33 GFLOPS | Progress: (8/10) | 8.27 s
    [Task 19/25]  Current/Best:   18.72/  18.72 GFLOPS | Progress: (10/10) | 9.23 s Done.
+
    [Task 20/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 20/25]  Current/Best:   10.42/  12.21 GFLOPS | Progress: (4/10) | 3.19 s
    [Task 20/25]  Current/Best:    8.57/  12.21 GFLOPS | Progress: (8/10) | 5.82 s
    [Task 20/25]  Current/Best:   16.27/  16.27 GFLOPS | Progress: (10/10) | 6.91 s Done.
+
    [Task 21/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 21/25]  Current/Best:    5.31/  16.65 GFLOPS | Progress: (4/10) | 2.92 s
    [Task 21/25]  Current/Best:   19.45/  19.45 GFLOPS | Progress: (8/10) | 4.74 s
    [Task 21/25]  Current/Best:    7.05/  19.45 GFLOPS | Progress: (10/10) | 5.67 s
    [Task 22/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 22/25]  Current/Best:   10.80/  15.42 GFLOPS | Progress: (4/10) | 2.83 s
    [Task 22/25]  Current/Best:    5.34/  18.01 GFLOPS | Progress: (8/10) | 4.74 s
    [Task 22/25]  Current/Best:   11.14/  18.01 GFLOPS | Progress: (10/10) | 6.82 s Done.
+
    [Task 23/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 23/25]  Current/Best:   10.05/  19.51 GFLOPS | Progress: (4/10) | 4.83 s
    [Task 23/25]  Current/Best:   22.11/  22.11 GFLOPS | Progress: (8/10) | 7.63 s
    [Task 23/25]  Current/Best:   12.66/  22.11 GFLOPS | Progress: (10/10) | 8.62 s Done.
+
    [Task 24/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 24/25]  Current/Best:   10.44/  10.44 GFLOPS | Progress: (4/10) | 12.93 s
    [Task 24/25]  Current/Best:    8.74/  10.44 GFLOPS | Progress: (8/10) | 6858.62 s
    [Task 24/25]  Current/Best:    7.51/  10.44 GFLOPS | Progress: (10/10) | 6859.24 s
    [Task 25/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s Done.
      Done.
-
    [Task 15/25]  Current/Best:   12.28/  12.28 GFLOPS | Progress: (4/10) | 5.85 s
    [Task 15/25]  Current/Best:   10.56/  16.71 GFLOPS | Progress: (8/10) | 7.57 s
    [Task 15/25]  Current/Best:   18.49/  18.49 GFLOPS | Progress: (10/10) | 8.31 s
    [Task 16/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 16/25]  Current/Best:   12.95/  22.22 GFLOPS | Progress: (4/10) | 2.98 s
    [Task 16/25]  Current/Best:   11.79/  22.99 GFLOPS | Progress: (8/10) | 4.79 s
    [Task 16/25]  Current/Best:   17.54/  22.99 GFLOPS | Progress: (10/10) | 6.08 s Done.
-
    [Task 17/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 17/25]  Current/Best:    8.71/  24.24 GFLOPS | Progress: (4/10) | 4.02 s
    [Task 17/25]  Current/Best:   12.23/  24.24 GFLOPS | Progress: (8/10) | 6.25 s
    [Task 17/25]  Current/Best:   19.25/  24.24 GFLOPS | Progress: (10/10) | 7.43 s Done.
-
    [Task 18/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 18/25]  Current/Best:    9.11/  19.20 GFLOPS | Progress: (4/10) | 6.34 s
    [Task 18/25]  Current/Best:   14.72/  21.46 GFLOPS | Progress: (8/10) | 10.18 s
    [Task 18/25]  Current/Best:   10.23/  21.46 GFLOPS | Progress: (10/10) | 12.13 s Done.
-
    [Task 19/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 19/25]  Current/Best:   12.00/  23.51 GFLOPS | Progress: (4/10) | 3.07 s
    [Task 19/25]  Current/Best:   24.01/  24.01 GFLOPS | Progress: (8/10) | 5.59 s
    [Task 19/25]  Current/Best:   20.22/  24.01 GFLOPS | Progress: (10/10) | 6.71 s Done.
-
    [Task 20/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 20/25]  Current/Best:   13.36/  13.36 GFLOPS | Progress: (4/10) | 2.83 s
    [Task 20/25]  Current/Best:   17.30/  17.30 GFLOPS | Progress: (8/10) | 5.40 s
    [Task 20/25]  Current/Best:   10.54/  17.30 GFLOPS | Progress: (10/10) | 7.60 s
    [Task 21/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 21/25]  Current/Best:   14.53/  16.71 GFLOPS | Progress: (4/10) | 2.59 s
    [Task 21/25]  Current/Best:   14.98/  18.13 GFLOPS | Progress: (8/10) | 6.71 s
    [Task 21/25]  Current/Best:    9.66/  18.13 GFLOPS | Progress: (10/10) | 8.97 s
    [Task 22/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 22/25]  Current/Best:   13.77/  19.44 GFLOPS | Progress: (4/10) | 3.30 s
    [Task 22/25]  Current/Best:   16.41/  19.98 GFLOPS | Progress: (8/10) | 4.75 s
    [Task 22/25]  Current/Best:   10.12/  19.98 GFLOPS | Progress: (10/10) | 5.90 s Done.
-
    [Task 23/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 23/25]  Current/Best:   12.74/  18.60 GFLOPS | Progress: (4/10) | 4.59 s
    [Task 23/25]  Current/Best:   14.26/  20.47 GFLOPS | Progress: (8/10) | 6.66 s
    [Task 23/25]  Current/Best:    2.69/  20.47 GFLOPS | Progress: (10/10) | 8.62 s Done.
-
    [Task 24/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 24/25]  Current/Best:    4.08/   4.08 GFLOPS | Progress: (4/10) | 37.21 s Done.
-     Done.
-     Done.
-
    [Task 24/25]  Current/Best:    7.06/   9.04 GFLOPS | Progress: (8/10) | 41.27 s
    [Task 24/25]  Current/Best:    3.51/  10.61 GFLOPS | Progress: (10/10) | 46.13 s Done.
-
    [Task 25/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/10) | 0.00 s
    [Task 25/25]  Current/Best:    5.66/   6.40 GFLOPS | Progress: (4/10) | 18.52 s
    [Task 25/25]  Current/Best:    8.86/   8.86 GFLOPS | Progress: (8/10) | 47.31 s
    [Task 25/25]  Current/Best:    5.25/   8.86 GFLOPS | Progress: (10/10) | 52.02 s
+
    [Task 25/25]  Current/Best:    3.02/   8.96 GFLOPS | Progress: (4/10) | 21.45 s
    [Task 25/25]  Current/Best:    5.81/   8.96 GFLOPS | Progress: (8/10) | 53.21 s
    [Task 25/25]  Current/Best:    9.29/   9.29 GFLOPS | Progress: (10/10) | 54.29 s
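
For context, progress lines like these come from autotvm's per-task tuning loop. A minimal sketch of that loop, assuming the standard ``tvm.autotvm`` API (the names ``tasks`` and ``tuning_records.json`` are placeholders, not taken from this log):

.. code-block:: python

    from tvm import autotvm

    # `tasks` would be extracted from the model beforehand, e.g. with
    # autotvm.task.extract_from_program(mod["main"], target=target, params=params)
    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
        tuner = autotvm.tuner.XGBTuner(task, loss_type="rank")
        tuner.tune(
            n_trial=10,  # matches the (x/10) progress counters above
            early_stopping=None,
            measure_option=autotvm.measure_option(
                builder=autotvm.LocalBuilder(),
                runner=autotvm.LocalRunner(number=10, repeat=1, timeout=10),
            ),
            callbacks=[
                autotvm.callback.progress_bar(10, prefix=prefix),
                autotvm.callback.log_to_file("tuning_records.json"),
            ],
        )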
 
 
 The output from this tuning process will look something like this:
@@ -564,6 +564,14 @@ model using optimized operators to speed up our computations.
 
 
 
+.. rst-class:: sphx-glr-script-out
+
+ Out:
+
+ .. code-block:: none
+
+     Done.
+
 
 
 Verify that the optimized model runs and produces the same results:
@@ -594,8 +602,8 @@ Verify that the optimized model runs and produces the same results:
 
  .. code-block:: none
 
-    class='n02123045 tabby, tabby cat' with probability=0.621104
-    class='n02123159 tiger cat' with probability=0.356378
+    class='n02123045 tabby, tabby cat' with probability=0.621102
+    class='n02123159 tiger cat' with probability=0.356379
     class='n02124075 Egyptian cat' with probability=0.019712
     class='n02129604 tiger, Panthera tigris' with probability=0.001215
     class='n04040759 radiator' with probability=0.000262
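
Top-5 listings like the one above are typically produced by running a softmax over the module's output and ranking the ImageNet synset labels. A minimal sketch, assuming ``output`` holds the (1, 1000) scores from ``module.get_output(0).numpy()`` and ``labels`` is the loaded synset list:

.. code-block:: python

    import numpy as np
    from scipy.special import softmax

    scores = softmax(output)          # logits -> probabilities
    scores = np.squeeze(scores)       # drop the batch dimension
    ranks = np.argsort(scores)[::-1]  # indices sorted by descending score
    for rank in ranks[0:5]:
        print("class='%s' with probability=%f" % (labels[rank], scores[rank]))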
@@ -648,8 +656,8 @@ improvement in comparing the optimized model to the unoptimized model.
 
  .. code-block:: none
 
-    optimized: {'mean': 424.93260788000043, 'median': 424.90663404996667, 'std': 0.5254916326844217}
-    unoptimized: {'mean': 494.3461641700105, 'median': 493.782350750007, 'std': 1.1968974665647765}
+    optimized: {'mean': 420.5590011900222, 'median': 420.5846446000578, 'std': 0.5195011732683108}
+    unoptimized: {'mean': 495.14908602999856, 'median': 494.9211825499958, 'std': 1.0668301378714276}
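
Statistics like these can be collected by timing repeated runs of the compiled module; a sketch using ``timeit``, assuming ``module`` is a built ``GraphModule``:

.. code-block:: python

    import timeit
    import numpy as np

    timing_number = 10
    timing_repeat = 10
    timings = (
        np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number))
        * 1000
        / timing_number  # per-run time in milliseconds
    )
    print({"mean": np.mean(timings), "median": np.median(timings), "std": np.std(timings)})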
 
 
 
@@ -669,7 +677,7 @@ profiling/benchmarking.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 8 minutes  8.279 seconds)
+   **Total running time of the script:** ( 121 minutes  8.766 seconds)
 
 
 .. _sphx_glr_download_tutorial_autotvm_relay_x86.py:
diff --git a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
index c32dee192..79bf721da 100644
--- a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
+++ b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
@@ -235,7 +235,7 @@ device and returns the measured cost. Network overhead is excluded.
 
  .. code-block:: none
 
-    1.311e-07 secs/op
+    1.302e-07 secs/op
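
A per-op cost like this is usually measured with a remote ``time_evaluator``, which runs the function on the device several times and averages the result. A sketch, assuming ``func`` is the module loaded back over RPC and ``dev`` is the remote device:

.. code-block:: python

    # `func` was built locally, sent with remote.upload(...), and loaded
    # back with remote.load_module(...); `a` and `b` are arrays on `dev`.
    time_f = func.time_evaluator(func.entry_name, dev, number=10)
    cost = time_f(a, b).mean  # network overhead is excluded from this figure
    print("%g secs/op" % cost)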
 
 
 
diff --git a/docs/_sources/tutorial/intro_topi.rst.txt b/docs/_sources/tutorial/intro_topi.rst.txt
index 872a2e9fc..84899e2c7 100644
--- a/docs/_sources/tutorial/intro_topi.rst.txt
+++ b/docs/_sources/tutorial/intro_topi.rst.txt
@@ -233,7 +233,7 @@ As you can see, scheduled stages of computation have been accumulated and we can
 
  .. code-block:: none
 
-    [stage(a, placeholder(a, 0x20a71670)), stage(b, placeholder(b, 0x22f5e400)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(mi [...]
+    [stage(a, placeholder(a, 0x20a4a670)), stage(b, placeholder(b, 0x14c0d9f0)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(mi [...]
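
A stage list of this shape comes from scheduling a couple of broadcast topi operators together; a minimal sketch that reproduces this kind of output:

.. code-block:: python

    from tvm import te, topi

    a = te.placeholder((100, 10, 10), name="a")
    b = te.placeholder((10, 10), name="b")
    c = topi.add(a, b)       # broadcast add      -> stage T_add
    d = topi.multiply(a, b)  # broadcast multiply -> stage T_multiply
    s = te.create_schedule([c.op, d.op])
    print(s.stages)  # placeholder stages plus the accumulated compute stages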
 
 
 
diff --git a/docs/_sources/tutorial/sg_execution_times.rst.txt b/docs/_sources/tutorial/sg_execution_times.rst.txt
index 430b794ed..4b3b42821 100644
--- a/docs/_sources/tutorial/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorial/sg_execution_times.rst.txt
@@ -5,17 +5,17 @@
 
 Computation times
 =================
-**11:09.474** total execution time for **tutorial** files:
+**124:06.927** total execution time for **tutorial** files:
 
-- **08:08.279**: :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``)
-- **01:09.812**: :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``)
-- **01:01.061**: :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)
-- **00:25.972**: :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``)
-- **00:22.751**: :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``)
-- **00:00.719**: :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``)
-- **00:00.554**: :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``)
-- **00:00.194**: :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``)
-- **00:00.038**: :ref:`sphx_glr_tutorial_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``)
-- **00:00.034**: :ref:`sphx_glr_tutorial_introduction.py` (``introduction.py``)
+- **121:08.766**: :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``)
+- **01:10.896**: :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``)
+- **01:00.477**: :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)
+- **00:26.016**: :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``)
+- **00:18.621**: :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``)
+- **00:01.081**: :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``)
+- **00:00.713**: :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``)
+- **00:00.219**: :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``)
+- **00:00.037**: :ref:`sphx_glr_tutorial_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``)
+- **00:00.036**: :ref:`sphx_glr_tutorial_introduction.py` (``introduction.py``)
 - **00:00.032**: :ref:`sphx_glr_tutorial_install.py` (``install.py``)
-- **00:00.028**: :ref:`sphx_glr_tutorial_tvmc_python.py` (``tvmc_python.py``)
+- **00:00.032**: :ref:`sphx_glr_tutorial_tvmc_python.py` (``tvmc_python.py``)
diff --git a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
index 4fb7b5b33..ac0458c26 100644
--- a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
+++ b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
@@ -243,7 +243,7 @@ helper function to run a profile of the TVM generated code.
 
  .. code-block:: none
 
-    Numpy running time: 0.000007
+    Numpy running time: 0.000009
     naive: 0.000007
 
 
@@ -335,7 +335,7 @@ compile and run this new schedule with the parallel operation applied:
 
  .. code-block:: none
 
-    parallel: 0.000009
+    parallel: 0.000007
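
The ``parallel`` figure comes from rebuilding the same vector-add schedule with the outer loop parallelized, roughly (reusing the ``A``, ``B``, ``C`` tensors and ``tgt`` target of the vector-add example):

.. code-block:: python

    s = te.create_schedule(C.op)
    s[C].parallel(C.op.axis[0])  # distribute the outer loop across threads
    fadd_parallel = tvm.build(s, [A, B, C], tgt, name="myadd_parallel")
    fadd_parallel(a, b, c)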
 
 
 
@@ -438,10 +438,10 @@ We can now compare the different schedules
  .. code-block:: none
 
                 Operator                  Timing             Performance
-                   numpy    6.77042999996047e-06                     1.0
-                   naive    6.6617999999999995e-06    0.9839552288464537
-                parallel                8.63e-06       1.274660545940271
-                  vector             2.47144e-05      3.6503442174491574
+                   numpy    9.09332999981416e-06                     1.0
+                   naive    6.704599999999999e-06     0.7373096544540912
+                parallel    6.906500000000001e-06     0.7595127417723924
+                  vector    2.4509900000000004e-05     2.695371222698496
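
Each row of this table is typically gathered by a small helper that times one built schedule and appends the result to a shared log; a sketch along those lines (``target`` is assumed to be a ``tvm.target.Target``):

.. code-block:: python

    import numpy as np
    import tvm

    def evaluate_addition(func, target, optimization, log):
        dev = tvm.device(target.kind.name, 0)
        n = 32768
        a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), dev)
        b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), dev)
        c = tvm.nd.array(np.zeros(n, dtype="float32"), dev)

        evaluator = func.time_evaluator(func.entry_name, dev, number=10)
        mean_time = evaluator(a, b, c).mean
        print("%s: %f" % (optimization, mean_time))
        log.append((optimization, mean_time))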
 
 
 
@@ -830,7 +830,7 @@ matrix multiplication.
 
  .. code-block:: none
 
-    Numpy running time: 0.018053
+    Numpy running time: 0.019190
 
 
 
@@ -886,7 +886,7 @@ optimizations.
 
  .. code-block:: none
 
-    none: 3.422137
+    none: 3.385655
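
The unoptimized ``none`` baseline is the default schedule over a plain TE matmul definition, roughly (``target`` being the selected ``tvm.target.Target``):

.. code-block:: python

    import tvm
    from tvm import te

    M, K, N = 1024, 1024, 1024
    k = te.reduce_axis((0, K), "k")
    A = te.placeholder((M, K), name="A")
    B = te.placeholder((K, N), name="B")
    C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C")

    s = te.create_schedule(C.op)  # default schedule: three naive nested loops
    func = tvm.build(s, [A, B, C], target=target, name="mmult")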
 
 
 
@@ -985,7 +985,7 @@ schedule.
 
  .. code-block:: none
 
-    blocking: 0.307885
+    blocking: 0.299180
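
The ``blocking`` improvement comes from tiling the loop nest so each block of the output stays cache resident; a sketch of that schedule, reusing the definitions above:

.. code-block:: python

    bn = 32       # block size
    kfactor = 4   # split factor for the reduction axis

    s = te.create_schedule(C.op)
    mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
    (kaxis,) = s[C].op.reduce_axis
    ko, ki = s[C].split(kaxis, factor=kfactor)
    s[C].reorder(mo, no, ko, ki, mi, ni)  # block loops outermost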
 
 
 
@@ -1077,7 +1077,7 @@ already cache friendly from our previous optimizations.
 
  .. code-block:: none
 
-    vectorization: 0.339063
+    vectorization: 0.335357
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
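
``vectorization`` then asks TVM to emit SIMD instructions for the innermost, unit-stride loop of the blocked schedule:

.. code-block:: python

    s[C].vectorize(ni)  # innermost axis produced by the tiling step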
@@ -1149,7 +1149,7 @@ more cache friendly.
 
  .. code-block:: none
 
-    loop permutation: 0.114917
+    loop permutation: 0.115867
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
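
``loop permutation`` reorders the inner loops so that A is walked row by row, which is friendlier to the cache:

.. code-block:: python

    s[C].reorder(mo, no, ko, mi, ki, ni)  # mi hoisted outside ki
    s[C].vectorize(ni)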
@@ -1246,7 +1246,7 @@ optimized schedule.
 
  .. code-block:: none
 
-    array packing: 0.107850
+    array packing: 0.108651
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
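
``array packing`` re-lays-out B into a block-major ``packedB`` buffer so the inner loop reads it sequentially; a sketch reusing the names above (integer division selects the block index):

.. code-block:: python

    packedB = te.compute(
        (N // bn, K, bn), lambda bigN, k_, littleN: B[k_, bigN * bn + littleN], name="packedB"
    )
    C = te.compute(
        (M, N),
        lambda m, n: te.sum(A[m, k] * packedB[n // bn, k, tvm.tir.indexmod(n, bn)], axis=k),
        name="C",
    )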
@@ -1337,7 +1337,7 @@ to `C` when all the block results are ready.
 
  .. code-block:: none
 
-    block caching: 0.109867
+    block caching: 0.110501
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
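
``block caching`` adds a write cache so one block of C is accumulated in a small local buffer and only written out when all of its results are ready:

.. code-block:: python

    s = te.create_schedule(C.op)
    CC = s.cache_write(C, "global")  # block-local accumulation buffer
    mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
    s[CC].compute_at(s[C], no)       # fill the cache inside each output block
    mc, nc = s[CC].op.axis
    (kaxis,) = s[CC].op.reduce_axis
    ko, ki = s[CC].split(kaxis, factor=kfactor)
    s[CC].reorder(ko, mc, ki, nc)
    s[CC].vectorize(nc)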
@@ -1421,7 +1421,7 @@ of thread-level parallelization.
 
  .. code-block:: none
 
-    parallelization: 0.144065
+    parallelization: 0.144748
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
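
``parallelization``, finally, distributes the outermost block loop across threads:

.. code-block:: python

    s[C].parallel(mo)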
@@ -1500,13 +1500,13 @@ working, we can compare the results.
  .. code-block:: none
 
                 Operator                  Timing             Performance
-                    none              3.42213662                     1.0
-                blocking            0.3078848383     0.08996859929572304
-           vectorization     0.33906326600000003     0.09907940671287403
-        loop permutation            0.1149165871     0.03358036217151377
-           array packing            0.1078496648     0.03151530075383139
-           block caching     0.10986729290000001     0.03210488215400355
-         parallelization     0.14406499639999998    0.042097967555719616
+                    none            3.3856548339                     1.0
+                blocking            0.2991795664     0.08836682446313329
+           vectorization            0.3353573871     0.09905244437268743
+        loop permutation            0.1158668396     0.03422287423982048
+           array packing            0.1086510665     0.03209159581540767
+           block caching            0.1105005719     0.03263787282553913
+         parallelization     0.14474813320000002      0.0427533639137283
 
 
 
@@ -1543,7 +1543,7 @@ the computation for specific platforms.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  1.061 seconds)
+   **Total running time of the script:** ( 1 minutes  0.477 seconds)
 
 
 .. _sphx_glr_download_tutorial_tensor_expr_get_started.py:
diff --git a/docs/_static/pygments.css b/docs/_static/pygments.css
index 20c4814dc..691aeb82d 100644
--- a/docs/_static/pygments.css
+++ b/docs/_static/pygments.css
@@ -1,5 +1,10 @@
+pre { line-height: 125%; }
+td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
+span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
+td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
+span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
 .highlight .hll { background-color: #ffffcc }
-.highlight  { background: #eeffcc; }
+.highlight { background: #eeffcc; }
 .highlight .c { color: #408090; font-style: italic } /* Comment */
 .highlight .err { border: 1px solid #FF0000 } /* Error */
 .highlight .k { color: #007020; font-weight: bold } /* Keyword */
diff --git a/docs/arch/convert_layout.html b/docs/arch/convert_layout.html
index 2a54443d5..0addba805 100644
--- a/docs/arch/convert_layout.html
+++ b/docs/arch/convert_layout.html
@@ -390,7 +390,7 @@
 </ul>
<p>These steps happen for each operator in sequence: the ConvertLayout pass keeps passing the new layouts to the next operator’s properties, eventually modifying the whole graph operator by operator. Now, let’s look at a couple of examples of how to define the two properties.</p>
<p><strong>FTVMConvertLayout - Python callback for layout alteration</strong> - This is used for <em>heavily-layout sensitive</em> operators. For example, one can return a new convolution operator with a new data and kernel layout. The other two components will infer the layout and insert layout transforms if needed. An example for the convolution operator, converting to the NCHW layout, follows.</p>
-<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@reg.register_convert_op_layout</span><span class="p">(</span><span class="s2">&quot;nn.conv2d&quot;</span><span class="p">)</span>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@reg</span><span class="o">.</span><span class="n">register_convert_op_layout</span><span class="p">(</span><span class="s2">&quot;nn.conv2d&quot;</span><span class="p">)</span>
 <span class="k">def</span> <span class="nf">convert_conv2d</span><span class="p">(</span><span class="n">attrs</span><span class="p">,</span> <span class="n">inputs</span><span class="p">,</span> <span class="n">tinfos</span><span class="p">,</span> <span class="n">desired_layouts</span><span class="p">):</span>
     <span class="sd">&quot;&quot;&quot;Convert Layout pass registration for conv2d op.</span>
 
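
The highlighted example above is truncated in this plain-text rendering; a plain-Python sketch of such a callback, following the pattern described here (the exact attribute handling is illustrative, and the registration helper is assumed to live in ``tvm.relay.op``):

.. code-block:: python

    from tvm import relay
    from tvm.relay.op import register_convert_op_layout

    @register_convert_op_layout("nn.conv2d")
    def convert_conv2d(attrs, inputs, tinfos, desired_layouts):
        """Convert Layout pass registration for conv2d op."""
        data, weight = inputs
        new_attrs = dict(attrs)
        new_attrs["data_layout"] = desired_layouts[0]  # e.g. "NCHW"
        new_attrs["kernel_layout"] = "OIHW"
        return relay.nn.conv2d(data, weight, **new_attrs)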
@@ -445,74 +445,74 @@
 <span class="c1">//          .set_attr&lt;FInferCorrectLayout&gt;(&quot;FInferCorrectLayout&quot;, ElemwiseArbitraryLayout);</span>
 
 <span class="c1">// Take arbitrary input layouts and copy to outputs.</span>
-<span class="kr">inline</span> <span class="n">Array</span><span class="o">&lt;</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;</span> <span class="o">&gt;</span> <span class="n">ElemwiseArbitraryLayout</span><span class="p">(</span><span class="k">const</span> <span class="n">Attrs</span><span class="o">&amp;</span> <span class="n">attrs</span><span class="p">,</span>
-                                                     <span class="k">const</span> <span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;&amp;</span> <span class="n">new_in_layouts</span><span class="p">,</span>
-                                                     <span class="k">const</span> <span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;&amp;</span> <span class="n">old_in_layouts</span><span class="p">,</span>
-                                                     <span class="k">const</span> <span class="n">Array</span><span class="o">&lt;</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">IndexExpr</span><span class="o">&gt;&gt;</span> <span class="o">&amp;</span><span class="n">old_in_shapes</span><span class="p">)</span> <span class="p">{</span>
-  <span class="n">Layout</span> <span class="n">ret</span><span class="p">;</span>
-
-  <span class="k">if</span> <span class="p">(</span><span class="n">new_in_layouts</span><span class="p">.</span><span class="n">defined</span><span class="p">())</span> <span class="p">{</span>
-    <span class="n">ICHECK_GE</span><span class="p">(</span><span class="n">new_in_layouts</span><span class="p">.</span><span class="n">size</span><span class="p">(),</span> <span class="mi">1</span><span class="p">);</span>
-    <span class="n">ret</span> <span class="o">=</span> <span class="n">new_in_layouts</span><span class="p">[</span><span class="mi">0</span><span class="p">];</span>
-  <span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
-    <span class="k">for</span> <span class="p">(</span><span class="kt">size_t</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="n">old_in_layouts</span><span class="p">.</span><span class="n">size</span><span class="p">();</span> <span class="o">++</span><span class="n">i</span><span class="p">)</span> <span class="p">{</span>
-      <span class="k">if</span> <span class="p">(</span><span class="n">old_in_layouts</span><span class="p">[</span><span class="n">i</span><span class="p">].</span><span class="n">defined</span><span class="p">())</span> <span class="p">{</span>
-        <span class="n">ret</span> <span class="o">=</span> <span class="n">old_in_layouts</span><span class="p">[</span><span class="n">i</span><span class="p">];</span>
-        <span class="k">break</span><span class="p">;</span>
-      <span class="p">}</span>
-    <span class="p">}</span>
-  <span class="p">}</span>
-
-  <span class="k">return</span> <span class="n">Array</span><span class="o">&lt;</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;</span> <span class="o">&gt;</span><span class="p">{</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;</span><span class="p">(</span><span class="n">old_in_layouts</span><span class="p">.</span><span class="n">size</span><span class="p">(), [...]
-<span class="p">}</span>
+<span class="kr">inline</span><span class="w"> </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;</span><span class="w"> </span><span class="o">&gt;</span><span class="w"> </span><span class="n">ElemwiseArbitraryLayout</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">Attrs</span><span class="o">&amp;</span><span class="w"> </ [...]
+<span class="w">                                                     </span><span class="k">const</span><span class="w"> </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;&amp;</span><span class="w"> </span><span class="n">new_in_layouts</span><span class="p">,</span><span class="w"></span>
+<span class="w">                                                     </span><span class="k">const</span><span class="w"> </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;&amp;</span><span class="w"> </span><span class="n">old_in_layouts</span><span class="p">,</span><span class="w"></span>
+<span class="w">                                                     </span><span class="k">const</span><span class="w"> </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">IndexExpr</span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="o">&amp;</span><span class="n">old_in_shapes</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">Layout</span><span class="w"> </span><span class="n">ret</span><span class="p">;</span><span class="w"></span>
+
+<span class="w">  </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">new_in_layouts</span><span class="p">.</span><span class="n">defined</span><span class="p">())</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="n">ICHECK_GE</span><span class="p">(</span><span class="n">new_in_layouts</span><span class="p">.</span><span class="n">size</span><span class="p">(),</span><span class="w"> </span><span class="mi">1</span><span class="p">);</span><span class="w"></span>
+<span class="w">    </span><span class="n">ret</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">new_in_layouts</span><span class="p">[</span><span class="mi">0</span><span class="p">];</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="kt">size_t</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">&lt;</span><span class="w"> </span><span class="n">old_in_layouts</span><span class="p">.</span><span cl [...]
+<span class="w">      </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">old_in_layouts</span><span class="p">[</span><span class="n">i</span><span class="p">].</span><span class="n">defined</span><span class="p">())</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">        </span><span class="n">ret</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">old_in_layouts</span><span class="p">[</span><span class="n">i</span><span class="p">];</span><span class="w"></span>
+<span class="w">        </span><span class="k">break</span><span class="p">;</span><span class="w"></span>
+<span class="w">      </span><span class="p">}</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+
+<span class="w">  </span><span class="k">return</span><span class="w"> </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;</span><span class="w"> </span><span class="o">&gt;</span><span class="p">{</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;</span><span class="p">(</span><span class="n">old_in_layouts</span>< [...]
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
<p>The second example is for a lightly-layout sensitive operator: batch normalization. BatchNorm has an axis attribute that has to change when we go from the NHWC to the NCHW data layout. (Similar handling is also needed for heavily-layout sensitive operators.)</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;&gt;</span> <span class="n">BatchNormInferCorrectLayout</span><span class="p">(</span><span class="k">const</span> <span class="n">Attrs</span><span class="o">&amp;</span> <span class="n">attrs</span><span class="p">,</span>
-                                                 <span class="k">const</span> <span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;&amp;</span> <span class="n">new_in_layouts</span><span class="p">,</span>
-                                                 <span class="k">const</span> <span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;&amp;</span> <span class="n">old_in_layouts</span><span class="p">,</span>
-                                                 <span class="k">const</span> <span class="n">Array</span><span class="o">&lt;</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">IndexExpr</span><span class="o">&gt;&gt;&amp;</span> <span class="n">old_in_shapes</span><span class="p">)</span> <span class="p">{</span>
-  <span class="n">BatchNormAttrs</span><span class="o">*</span> <span class="n">param</span> <span class="o">=</span> <span class="k">const_cast</span><span class="o">&lt;</span><span class="n">BatchNormAttrs</span><span class="o">*&gt;</span><span class="p">(</span><span class="n">attrs</span><span class="p">.</span><span class="n">as</span><span class="o">&lt;</span><span class="n">BatchNormAttrs</span><span class="o">&gt;</span><span class="p">());</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="n">BatchNormInferCorrectLayout</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">Attrs</span><span class="o">&amp;</span><span class="w"> </span><span class= [...]
+<span class="w">                                                 </span><span class="k">const</span><span class="w"> </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;&amp;</span><span class="w"> </span><span class="n">new_in_layouts</span><span class="p">,</span><span class="w"></span>
+<span class="w">                                                 </span><span class="k">const</span><span class="w"> </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;&amp;</span><span class="w"> </span><span class="n">old_in_layouts</span><span class="p">,</span><span class="w"></span>
+<span class="w">                                                 </span><span class="k">const</span><span class="w"> </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">IndexExpr</span><span class="o">&gt;&gt;&amp;</span><span class="w"> </span><span class="n">old_in_shapes</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">BatchNormAttrs</span><span class="o">*</span><span class="w"> </span><span class="n">param</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">const_cast</span><span class="o">&lt;</span><span class="n">BatchNormAttrs</span><span class="o">*&gt;</span><span class="p">(</span><span class="n">attrs</span><span class="p">.</span><span class="n">as</span><span class="o">&lt;</span><span class="n">BatchNormAttr [...]
 
-  <span class="kt">size_t</span> <span class="n">axis</span> <span class="o">=</span>
-      <span class="n">param</span><span class="o">-&gt;</span><span class="n">axis</span> <span class="o">&lt;</span> <span class="mi">0</span> <span class="o">?</span> <span class="n">param</span><span class="o">-&gt;</span><span class="n">axis</span> <span class="o">+</span> <span class="n">old_in_shapes</span><span class="p">[</span><span class="mi">0</span><span class="p">].</span><span class="n">size</span><span class="p">()</span> <span class="o">:</span> <span class="k">static_cas [...]
+<span class="w">  </span><span class="kt">size_t</span><span class="w"> </span><span class="n">axis</span><span class="w"> </span><span class="o">=</span><span class="w"></span>
+<span class="w">      </span><span class="n">param</span><span class="o">-&gt;</span><span class="n">axis</span><span class="w"> </span><span class="o">&lt;</span><span class="w"> </span><span class="mi">0</span><span class="w"> </span><span class="o">?</span><span class="w"> </span><span class="n">param</span><span class="o">-&gt;</span><span class="n">axis</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">old_in_shapes</span><span class="p">[ [...]
 
-  <span class="n">Layout</span> <span class="n">ret</span> <span class="o">=</span> <span class="n">Layout</span><span class="o">::</span><span class="n">Undef</span><span class="p">();</span>
+<span class="w">  </span><span class="n">Layout</span><span class="w"> </span><span class="n">ret</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Layout</span><span class="o">::</span><span class="n">Undef</span><span class="p">();</span><span class="w"></span>
 
-  <span class="c1">// For example, consider old_layout = NHWC, and new_layout = NCHW, and param-&gt;axis = 3</span>
+<span class="w">  </span><span class="c1">// For example, consider old_layout = NHWC, and new_layout = NCHW, and param-&gt;axis = 3</span>
 
-  <span class="k">if</span> <span class="p">(</span><span class="n">new_in_layouts</span><span class="p">.</span><span class="n">defined</span><span class="p">()</span> <span class="o">&amp;&amp;</span> <span class="n">old_in_layouts</span><span class="p">.</span><span class="n">defined</span><span class="p">())</span> <span class="p">{</span>
-    <span class="c1">// Get the new C axis. Extract the dim in old layout. Find the index of that dim in next layout.</span>
+<span class="w">  </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">new_in_layouts</span><span class="p">.</span><span class="n">defined</span><span class="p">()</span><span class="w"> </span><span class="o">&amp;&amp;</span><span class="w"> </span><span class="n">old_in_layouts</span><span class="p">.</span><span class="n">defined</span><span class="p">())</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="c1">// Get the new C axis. Extract the dim in old layout. Find the index of that dim in next layout.</span>
 
-    <span class="c1">// Following line gives bn_dim = C as old_layout = NHWC, axis = 3</span>
-    <span class="k">const</span> <span class="k">auto</span><span class="o">&amp;</span> <span class="n">bn_dim</span> <span class="o">=</span> <span class="n">old_in_layouts</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="n">axis</span><span class="p">];</span>
+<span class="w">    </span><span class="c1">// Following line gives bn_dim = C as old_layout = NHWC, axis = 3</span>
+<span class="w">    </span><span class="k">const</span><span class="w"> </span><span class="k">auto</span><span class="o">&amp;</span><span class="w"> </span><span class="n">bn_dim</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">old_in_layouts</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="n">axis</span><span class="p">];</span><span class="w"></span>
 
-    <span class="c1">// The new_index is 1 because new_layout = NCHW and bn_dim is C</span>
-    <span class="k">auto</span> <span class="n">new_index</span> <span class="o">=</span> <span class="n">new_in_layouts</span><span class="p">[</span><span class="mi">0</span><span class="p">].</span><span class="n">IndexOf</span><span class="p">(</span><span class="n">bn_dim</span><span class="p">);</span>
+<span class="w">    </span><span class="c1">// The new_index is 1 because new_layout = NCHW and bn_dim is C</span>
+<span class="w">    </span><span class="k">auto</span><span class="w"> </span><span class="n">new_index</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">new_in_layouts</span><span class="p">[</span><span class="mi">0</span><span class="p">].</span><span class="n">IndexOf</span><span class="p">(</span><span class="n">bn_dim</span><span class="p">);</span><span class="w"></span>
 
-    <span class="c1">// We modify the layout-dependent attribute here - axis to 1.</span>
-    <span class="n">param</span><span class="o">-&gt;</span><span class="n">axis</span> <span class="o">=</span> <span class="n">new_index</span><span class="p">;</span>
+<span class="w">    </span><span class="c1">// We modify the layout-dependent attribute here - axis to 1.</span>
+<span class="w">    </span><span class="n">param</span><span class="o">-&gt;</span><span class="n">axis</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">new_index</span><span class="p">;</span><span class="w"></span>
 
-    <span class="c1">// Finally, we adapt to the new layout.</span>
-    <span class="n">ret</span> <span class="o">=</span> <span class="n">new_in_layouts</span><span class="p">[</span><span class="mi">0</span><span class="p">];</span>
+<span class="w">    </span><span class="c1">// Finally, we adapt to the new layout.</span>
+<span class="w">    </span><span class="n">ret</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">new_in_layouts</span><span class="p">[</span><span class="mi">0</span><span class="p">];</span><span class="w"></span>
 
-  <span class="p">}</span> <span class="k">else</span> <span class="k">if</span> <span class="p">(</span><span class="n">old_in_layouts</span><span class="p">.</span><span class="n">defined</span><span class="p">())</span> <span class="p">{</span>
-    <span class="n">ret</span> <span class="o">=</span> <span class="n">old_in_layouts</span><span class="p">[</span><span class="mi">0</span><span class="p">];</span>
-  <span class="p">}</span>
+<span class="w">  </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">old_in_layouts</span><span class="p">.</span><span class="n">defined</span><span class="p">())</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="n">ret</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">old_in_layouts</span><span class="p">[</span><span class="mi">0</span><span class="p">];</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
 
-  <span class="c1">// In case both new and old layouts are undefined, then there is no need of a change.</span>
-  <span class="c1">// ConvertLayout pass skips the automatic insertion of layout transforms in this case.</span>
+<span class="w">  </span><span class="c1">// In case both new and old layouts are undefined, then there is no need of a change.</span>
+<span class="w">  </span><span class="c1">// ConvertLayout pass skips the automatic insertion of layout transforms in this case.</span>
 
-  <span class="c1">// Following line is not important to tutorial. But, layout inference needs to define</span>
-  <span class="c1">// the layout for all input and output data layouts. For batch norm, the other inputs</span>
-  <span class="c1">// and outputs are vector having length of C dim in the input. So, we set the other</span>
-  <span class="c1">// layouts as C. BN has 5 inputs, 3 outputs. The last 4 inputs and last 2 outputs</span>
-  <span class="c1">// have &quot;C&quot; layout.</span>
-  <span class="n">Layout</span> <span class="n">c_layout</span> <span class="o">=</span> <span class="n">Layout</span><span class="p">(</span><span class="s">&quot;C&quot;</span><span class="p">);</span>
+<span class="w">  </span><span class="c1">// Following line is not important to tutorial. But, layout inference needs to define</span>
+<span class="w">  </span><span class="c1">// the layout for all input and output data layouts. For batch norm, the other inputs</span>
+<span class="w">  </span><span class="c1">// and outputs are vector having length of C dim in the input. So, we set the other</span>
+<span class="w">  </span><span class="c1">// layouts as C. BN has 5 inputs, 3 outputs. The last 4 inputs and last 2 outputs</span>
+<span class="w">  </span><span class="c1">// have &quot;C&quot; layout.</span>
+<span class="w">  </span><span class="n">Layout</span><span class="w"> </span><span class="n">c_layout</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Layout</span><span class="p">(</span><span class="s">&quot;C&quot;</span><span class="p">);</span><span class="w"></span>
 
-  <span class="k">return</span> <span class="n">Array</span><span class="o">&lt;</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;&gt;</span><span class="p">{{</span><span class="n">ret</span><span class="p">,</span> <span class="n">c_layout</span><span class="p">,</span> <span class="n">c_layout</span><span class="p">,</span> <span class="n">c_layout</span><span class="p">,</span> <span class="n">c_layout</span><span class="p" [...]
-                              <span class="p">{</span><span class="n">ret</span><span class="p">,</span> <span class="n">c_layout</span><span class="p">,</span> <span class="n">c_layout</span><span class="p">}};</span>
-<span class="p">}</span>
+<span class="w">  </span><span class="k">return</span><span class="w"> </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Layout</span><span class="o">&gt;&gt;</span><span class="p">{{</span><span class="n">ret</span><span class="p">,</span><span class="w"> </span><span class="n">c_layout</span><span class="p">,</span><span class="w"> </span><span class="n">c_layout</span><span class="p">,</span><span class [...]
+<span class="w">                              </span><span class="p">{</span><span class="n">ret</span><span class="p">,</span><span class="w"> </span><span class="n">c_layout</span><span class="p">,</span><span class="w"> </span><span class="n">c_layout</span><span class="p">}};</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 </div>
diff --git a/docs/arch/debugger.html b/docs/arch/debugger.html
index 468247b06..2a7b9785f 100644
--- a/docs/arch/debugger.html
+++ b/docs/arch/debugger.html
@@ -448,7 +448,7 @@ API “load_params”.</p>
 <code class="docutils literal notranslate"><span class="pre">debug_executor</span></code>
 <code class="docutils literal notranslate"><span class="pre">from</span> <span class="pre">tvm.contrib.debugger</span> <span class="pre">import</span> <span class="pre">debug_executor</span> <span class="pre">as</span> <span class="pre">graph_executor</span></code></p></li>
 </ol>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib.debugger</span> <span class="k">import</span> <span class="n">debug_executor</span> <span class="k">as</span> <span class="n">graph_executor</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib.debugger</span> <span class="kn">import</span> <span class="n">debug_executor</span> <span class="k">as</span> <span class="n">graph_executor</span>
 <span class="n">m</span> <span class="o">=</span> <span class="n">graph_executor</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="n">graph</span><span class="p">,</span> <span class="n">lib</span><span class="p">,</span> <span class="n">dev</span><span class="p">,</span> <span class="n">dump_root</span><span class="o">=</span><span class="s2">&quot;/tmp/tvmdbg&quot;</span><span class="p">)</span>
 <span class="c1"># set inputs</span>
 <span class="n">m</span><span class="o">.</span><span class="n">set_input</span><span class="p">(</span><span class="s1">&#39;data&#39;</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">data</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="p">)))</span>
diff --git a/docs/arch/frontend/tensorflow.html b/docs/arch/frontend/tensorflow.html
index cb5abce58..49989b92f 100644
--- a/docs/arch/frontend/tensorflow.html
+++ b/docs/arch/frontend/tensorflow.html
@@ -362,14 +362,14 @@
 <h3>Export<a class="headerlink" href="#export" title="Permalink to this headline">¶</a></h3>
<p>The TensorFlow frontend expects a frozen protobuf (.pb) or saved model as input. It currently does not support checkpoints (.ckpt). The GraphDef needed by the TensorFlow frontend can be extracted from the active session, or by using the <a class="reference external" href="https://github.com/apache/tvm/blob/main/python/tvm/relay/frontend/tensorflow_parser.py">TFParser</a> helper class.</p>
<p>The model should be exported with a number of transformations that prepare it for inference. It is also important to set <code class="docutils literal notranslate"><span class="pre">`add_shapes=True`</span></code>, as this will embed the output shapes of each node into the graph. Here is one function that exports a model as a protobuf given a session:</p>
-<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tensorflow</span> <span class="kn">as</span> <span class="nn">tf</span>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tensorflow</span> <span class="k">as</span> <span class="nn">tf</span>
 <span class="kn">from</span> <span class="nn">tensorflow.tools.graph_transforms</span> <span class="kn">import</span> <span class="n">TransformGraph</span>
 
 <span class="k">def</span> <span class="nf">export_pb</span><span class="p">(</span><span class="n">session</span><span class="p">):</span>
     <span class="k">with</span> <span class="n">tf</span><span class="o">.</span><span class="n">gfile</span><span class="o">.</span><span class="n">GFile</span><span class="p">(</span><span class="s2">&quot;myexportedmodel.pb&quot;</span><span class="p">,</span> <span class="s2">&quot;wb&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
         <span class="n">inputs</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;myinput1&quot;</span><span class="p">,</span> <span class="s2">&quot;myinput2&quot;</span><span class="p">]</span> <span class="c1"># replace with your input names</span>
         <span class="n">outputs</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;myoutput1&quot;</span><span class="p">]</span> <span class="c1"># replace with your output names</span>
-        <span class="n">graph_def</span> <span class="o">=</span> <span class="n">session</span><span class="o">.</span><span class="n">graph</span><span class="o">.</span><span class="n">as_graph_def</span><span class="p">(</span><span class="n">add_shapes</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
+        <span class="n">graph_def</span> <span class="o">=</span> <span class="n">session</span><span class="o">.</span><span class="n">graph</span><span class="o">.</span><span class="n">as_graph_def</span><span class="p">(</span><span class="n">add_shapes</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
         <span class="n">graph_def</span> <span class="o">=</span> <span class="n">tf</span><span class="o">.</span><span class="n">graph</span><span class="o">.</span><span class="n">util</span><span class="o">.</span><span class="n">convert_variables_to_constants</span><span class="p">(</span><span class="n">session</span><span class="p">,</span> <span class="n">graph_def</span><span class="p">,</span> <span class="n">outputs</span><span class="p">)</span>
         <span class="n">graph_def</span> <span class="o">=</span> <span class="n">TransformGraph</span><span class="p">(</span>
             <span class="n">graph_def</span><span class="p">,</span>
diff --git a/docs/arch/index.html b/docs/arch/index.html
index 04d70b408..6c285812f 100644
--- a/docs/arch/index.html
+++ b/docs/arch/index.html
@@ -473,7 +473,7 @@ a target’s vector length would change the vectorization behavior.</p>
 <span class="n">arr</span><span class="p">:</span> <span class="n">tvm</span><span class="o">.</span><span class="n">runtime</span><span class="o">.</span><span class="n">NDArray</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><spa [...]
 <span class="n">fun</span><span class="p">:</span> <span class="n">tvm</span><span class="o">.</span><span class="n">runtime</span><span class="o">.</span><span class="n">PackedFunc</span> <span class="o">=</span> <span class="n">mod</span><span class="p">[</span><span class="s2">&quot;addone&quot;</span><span class="p">]</span>
 <span class="n">fun</span><span class="p">(</span><span class="n">a</span><span class="p">)</span>
-<span class="k">print</span><span class="p">(</span><span class="n">a</span><span class="o">.</span><span class="n">numpy</span><span class="p">())</span>
+<span class="nb">print</span><span class="p">(</span><span class="n">a</span><span class="o">.</span><span class="n">numpy</span><span class="p">())</span>
 </pre></div>
 </div>
 <p><a class="reference internal" href="../reference/api/python/runtime.html#tvm.runtime.Module" title="tvm.runtime.Module"><code class="xref py py-class docutils literal notranslate"><span class="pre">tvm.runtime.Module</span></code></a> encapsulates the result of compilation. A runtime.Module contains a GetFunction method to obtain PackedFuncs by name.</p>
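 <p>As a minimal sketch (assuming a compiled library <code class="docutils literal notranslate"><span class="pre">lib.so</span></code> that exports a function named <code class="docutils literal notranslate"><span class="pre">addone</span></code>; both names are hypothetical), the same lookup can be performed from C++ through <code class="docutils literal notranslate"><span class="pre">GetFunction</span></code>:</p>
 <div class="highlight-cpp notranslate"><div class="highlight"><pre>// Hypothetical sketch: load a compiled module and fetch a PackedFunc by name.
// Assumes "lib.so" exists and exports a function called "addone".
#include &lt;tvm/runtime/module.h&gt;
#include &lt;tvm/runtime/packed_func.h&gt;

void RunAddone() {
  tvm::runtime::Module mod = tvm::runtime::Module::LoadFromFile("lib.so");
  tvm::runtime::PackedFunc fun = mod.GetFunction("addone");
  // fun is now callable through the PackedFunc calling convention.
}
</pre></div>
 </div>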
diff --git a/docs/arch/inferbound.html b/docs/arch/inferbound.html
index e1b626dd9..0dcd834c8 100644
--- a/docs/arch/inferbound.html
+++ b/docs/arch/inferbound.html
@@ -334,28 +334,28 @@
 <span id="dev-inferbound-pass"></span><h1>InferBound Pass<a class="headerlink" href="#inferbound-pass" title="Permalink to this headline">¶</a></h1>
 <p>The InferBound pass is run after normalize, and before ScheduleOps <a class="reference external" href="https://github.com/apache/tvm/blob/main/python/tvm/driver/build_module.py">build_module.py</a>. The main job of InferBound is to create the bounds map, which specifies a Range for each IterVar in the program. These bounds are then passed to ScheduleOps, where they are used to set the extents of For loops, see <a class="reference external" href="https://github.com/apache/tvm/blob/main [...]
 <p>The output of InferBound is a map from IterVar to Range:</p>
-<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="n">Map</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="p">,</span> <span class="n">Range</span><span class="o">&gt;</span> <span class="n">InferBound</span><span class="p">(</span><span class="k">const</span> <span class="n">Schedule</span><span class="o">&amp;</span> <span class="n">sch</span><span class="p">);</span>
+<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="n">Map</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="p">,</span><span class="w"> </span><span class="n">Range</span><span class="o">&gt;</span><span class="w"> </span><span class="n">InferBound</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">Schedule</span><span class="o">&amp;</span><span class="w"> </span><span clas [...]
 </pre></div>
 </div>
 <p>Therefore, let’s review the Range and IterVar classes:</p>
-<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="k">namespace</span> <span class="n">HalideIR</span> <span class="p">{</span>
-<span class="k">namespace</span> <span class="n">IR</span> <span class="p">{</span>
-     <span class="k">class</span> <span class="nc">RangeNode</span> <span class="o">:</span> <span class="k">public</span> <span class="n">Node</span> <span class="p">{</span>
-     <span class="k">public</span><span class="o">:</span>
-             <span class="n">Expr</span> <span class="n">min</span><span class="p">;</span>
-             <span class="n">Expr</span> <span class="n">extent</span><span class="p">;</span>
-             <span class="c1">// remainder omitted</span>
-     <span class="p">};</span>
-     <span class="p">}}</span>
-
-<span class="k">namespace</span> <span class="n">tvm</span> <span class="p">{</span>
-     <span class="k">class</span> <span class="nc">IterVarNode</span> <span class="o">:</span> <span class="k">public</span> <span class="n">Node</span> <span class="p">{</span>
-     <span class="k">public</span><span class="o">:</span>
-             <span class="n">Range</span> <span class="n">dom</span><span class="p">;</span>
-             <span class="n">Var</span> <span class="n">var</span><span class="p">;</span>
-             <span class="c1">// remainder omitted</span>
-     <span class="p">};</span>
-<span class="p">}</span>
+<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="k">namespace</span><span class="w"> </span><span class="nn">HalideIR</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="k">namespace</span><span class="w"> </span><span class="nn">IR</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">     </span><span class="k">class</span><span class="w"> </span><span class="nc">RangeNode</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">Node</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">     </span><span class="k">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">             </span><span class="n">Expr</span><span class="w"> </span><span class="n">min</span><span class="p">;</span><span class="w"></span>
+<span class="w">             </span><span class="n">Expr</span><span class="w"> </span><span class="n">extent</span><span class="p">;</span><span class="w"></span>
+<span class="w">             </span><span class="c1">// remainder omitted</span>
+<span class="w">     </span><span class="p">};</span><span class="w"></span>
+<span class="w">     </span><span class="p">}}</span><span class="w"></span>
+
+<span class="k">namespace</span><span class="w"> </span><span class="nn">tvm</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">     </span><span class="k">class</span><span class="w"> </span><span class="nc">IterVarNode</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">Node</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">     </span><span class="k">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">             </span><span class="n">Range</span><span class="w"> </span><span class="n">dom</span><span class="p">;</span><span class="w"></span>
+<span class="w">             </span><span class="n">Var</span><span class="w"> </span><span class="n">var</span><span class="p">;</span><span class="w"></span>
+<span class="w">             </span><span class="c1">// remainder omitted</span>
+<span class="w">     </span><span class="p">};</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Note that IterVarNode also contains a Range <code class="docutils literal notranslate"><span class="pre">dom</span></code>. This <code class="docutils literal notranslate"><span class="pre">dom</span></code> may or may not have a meaningful value, depending on when the IterVar was created. For example, when <code class="docutils literal notranslate"><span class="pre">tvm.compute</span></code> is called, an <a class="reference external" href="https://github.com/apache/tvm/blob/main/src [...]
@@ -364,52 +364,52 @@
 <p>We next review some TVM codebase concepts that are required to understand the InferBound pass.</p>
 <p>Recall that InferBound takes one argument, a Schedule. This schedule object and its members contain all the information about the program being compiled.</p>
 <p>A TVM schedule is composed of Stages. Each stage has exactly one Operation, e.g., a ComputeOp or a TensorComputeOp. Each operation has a list of root_iter_vars which, in the case of ComputeOp, are composed of the axis IterVars and the reduce axis IterVars. Each operation can also contain many other IterVars, but all of them are related by the operation’s list of IterVarRelations. Each IterVarRelation represents either a split, fuse, or rebase in the schedule. For example, in the case  [...]
-<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="k">namespace</span> <span class="n">tvm</span> <span class="p">{</span>
-     <span class="k">class</span> <span class="nc">ScheduleNode</span> <span class="o">:</span> <span class="k">public</span> <span class="n">Node</span> <span class="p">{</span>
-     <span class="k">public</span><span class="o">:</span>
-             <span class="n">Array</span><span class="o">&lt;</span><span class="n">Operation</span><span class="o">&gt;</span> <span class="n">outputs</span><span class="p">;</span>
-             <span class="n">Array</span><span class="o">&lt;</span><span class="n">Stage</span><span class="o">&gt;</span> <span class="n">stages</span><span class="p">;</span>
-             <span class="n">Map</span><span class="o">&lt;</span><span class="n">Operation</span><span class="p">,</span> <span class="n">Stage</span><span class="o">&gt;</span> <span class="n">stage_map</span><span class="p">;</span>
-             <span class="c1">// remainder omitted</span>
-     <span class="p">};</span>
-
-     <span class="k">class</span> <span class="nc">StageNode</span> <span class="o">:</span> <span class="k">public</span> <span class="n">Node</span> <span class="p">{</span>
-     <span class="k">public</span><span class="o">:</span>
-             <span class="n">Operation</span> <span class="n">op</span><span class="p">;</span>
-             <span class="n">Operation</span> <span class="n">origin_op</span><span class="p">;</span>
-             <span class="n">Array</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="o">&gt;</span> <span class="n">all_iter_vars</span><span class="p">;</span>
-             <span class="n">Array</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="o">&gt;</span> <span class="n">leaf_iter_vars</span><span class="p">;</span>
-             <span class="n">Array</span><span class="o">&lt;</span><span class="n">IterVarRelation</span><span class="o">&gt;</span> <span class="n">relations</span><span class="p">;</span>
-             <span class="c1">// remainder omitted</span>
-     <span class="p">};</span>
-
-     <span class="k">class</span> <span class="nc">OperationNode</span> <span class="o">:</span> <span class="k">public</span> <span class="n">Node</span> <span class="p">{</span>
-     <span class="k">public</span><span class="o">:</span>
-             <span class="k">virtual</span> <span class="n">Array</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="o">&gt;</span> <span class="n">root_iter_vars</span><span class="p">();</span>
-             <span class="k">virtual</span> <span class="n">Array</span><span class="o">&lt;</span><span class="n">Tensor</span><span class="o">&gt;</span> <span class="n">InputTensors</span><span class="p">();</span>
-             <span class="c1">// remainder omitted</span>
-     <span class="p">};</span>
-
-     <span class="k">class</span> <span class="nc">ComputeOpNode</span> <span class="o">:</span> <span class="k">public</span> <span class="n">OperationNode</span> <span class="p">{</span>
-     <span class="k">public</span><span class="o">:</span>
-             <span class="n">Array</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="o">&gt;</span> <span class="n">axis</span><span class="p">;</span>
-             <span class="n">Array</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="o">&gt;</span> <span class="n">reduce_axis</span><span class="p">;</span>
-             <span class="n">Array</span><span class="o">&lt;</span><span class="n">Expr</span><span class="o">&gt;</span> <span class="n">body</span><span class="p">;</span>
-             <span class="n">Array</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="o">&gt;</span> <span class="n">root_iter_vars</span><span class="p">();</span>
-             <span class="c1">// remainder omitted</span>
-     <span class="p">};</span>
-<span class="p">}</span>
+<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="k">namespace</span><span class="w"> </span><span class="nn">tvm</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">     </span><span class="k">class</span><span class="w"> </span><span class="nc">ScheduleNode</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">Node</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">     </span><span class="k">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">             </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Operation</span><span class="o">&gt;</span><span class="w"> </span><span class="n">outputs</span><span class="p">;</span><span class="w"></span>
+<span class="w">             </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Stage</span><span class="o">&gt;</span><span class="w"> </span><span class="n">stages</span><span class="p">;</span><span class="w"></span>
+<span class="w">             </span><span class="n">Map</span><span class="o">&lt;</span><span class="n">Operation</span><span class="p">,</span><span class="w"> </span><span class="n">Stage</span><span class="o">&gt;</span><span class="w"> </span><span class="n">stage_map</span><span class="p">;</span><span class="w"></span>
+<span class="w">             </span><span class="c1">// remainder omitted</span>
+<span class="w">     </span><span class="p">};</span><span class="w"></span>
+
+<span class="w">     </span><span class="k">class</span><span class="w"> </span><span class="nc">StageNode</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">Node</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">     </span><span class="k">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">             </span><span class="n">Operation</span><span class="w"> </span><span class="n">op</span><span class="p">;</span><span class="w"></span>
+<span class="w">             </span><span class="n">Operation</span><span class="w"> </span><span class="n">origin_op</span><span class="p">;</span><span class="w"></span>
+<span class="w">             </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="o">&gt;</span><span class="w"> </span><span class="n">all_iter_vars</span><span class="p">;</span><span class="w"></span>
+<span class="w">             </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="o">&gt;</span><span class="w"> </span><span class="n">leaf_iter_vars</span><span class="p">;</span><span class="w"></span>
+<span class="w">             </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">IterVarRelation</span><span class="o">&gt;</span><span class="w"> </span><span class="n">relations</span><span class="p">;</span><span class="w"></span>
+<span class="w">             </span><span class="c1">// remainder omitted</span>
+<span class="w">     </span><span class="p">};</span><span class="w"></span>
+
+<span class="w">     </span><span class="k">class</span><span class="w"> </span><span class="nc">OperationNode</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">Node</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">     </span><span class="k">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">             </span><span class="k">virtual</span><span class="w"> </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="o">&gt;</span><span class="w"> </span><span class="n">root_iter_vars</span><span class="p">();</span><span class="w"></span>
+<span class="w">             </span><span class="k">virtual</span><span class="w"> </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Tensor</span><span class="o">&gt;</span><span class="w"> </span><span class="n">InputTensors</span><span class="p">();</span><span class="w"></span>
+<span class="w">             </span><span class="c1">// remainder omitted</span>
+<span class="w">     </span><span class="p">};</span><span class="w"></span>
+
+<span class="w">     </span><span class="k">class</span><span class="w"> </span><span class="nc">ComputeOpNode</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">OperationNode</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">     </span><span class="k">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">             </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="o">&gt;</span><span class="w"> </span><span class="n">axis</span><span class="p">;</span><span class="w"></span>
+<span class="w">             </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="o">&gt;</span><span class="w"> </span><span class="n">reduce_axis</span><span class="p">;</span><span class="w"></span>
+<span class="w">             </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Expr</span><span class="o">&gt;</span><span class="w"> </span><span class="n">body</span><span class="p">;</span><span class="w"></span>
+<span class="w">             </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="o">&gt;</span><span class="w"> </span><span class="n">root_iter_vars</span><span class="p">();</span><span class="w"></span>
+<span class="w">             </span><span class="c1">// remainder omitted</span>
+<span class="w">     </span><span class="p">};</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Tensors haven’t been mentioned yet, but in the context of TVM, a Tensor represents the output of an operation.</p>
-<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="k">class</span> <span class="nc">TensorNode</span> <span class="o">:</span> <span class="k">public</span> <span class="n">Node</span> <span class="p">{</span>
-<span class="k">public</span><span class="o">:</span>
-     <span class="c1">// The source operation, can be None</span>
-     <span class="c1">// This Tensor is output by this op</span>
-     <span class="n">Operation</span> <span class="n">op</span><span class="p">;</span>
-     <span class="c1">// The output index from the source operation</span>
-     <span class="kt">int</span> <span class="n">value_index</span><span class="p">;</span>
-<span class="p">};</span>
+<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">TensorNode</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">Node</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="k">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">     </span><span class="c1">// The source operation, can be None</span>
+<span class="w">     </span><span class="c1">// This Tensor is output by this op</span>
+<span class="w">     </span><span class="n">Operation</span><span class="w"> </span><span class="n">op</span><span class="p">;</span><span class="w"></span>
+<span class="w">     </span><span class="c1">// The output index from the source operation</span>
+<span class="w">     </span><span class="kt">int</span><span class="w"> </span><span class="n">value_index</span><span class="p">;</span><span class="w"></span>
+<span class="p">};</span><span class="w"></span>
 </pre></div>
 </div>
 <p>In the Operation class declaration above, we can see that each operation also has a list of InputTensors. Thus the stages of the schedule form a DAG, where each stage is a node in the graph. There is an edge in the graph from Stage A to Stage B if the operation of Stage B has an input tensor whose source operation is the op of Stage A. Put simply, there is an edge from A to B if B consumes a tensor produced by A. See the diagram below. This graph is created at the beginning of Infer [...]
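 <p>As a hedged sketch, the edges could be derived from the declarations above roughly as follows (<code class="docutils literal notranslate"><span class="pre">AddEdge</span></code> is a hypothetical helper; only <code class="docutils literal notranslate"><span class="pre">InputTensors</span></code> and <code class="docutils literal notranslate"><span class="pre">stage_map</span></code> come from the classes shown):</p>
 <div class="highlight-cpp notranslate"><div class="highlight"><pre>// Hypothetical sketch: add an edge A -&gt; B whenever B's operation consumes
// a tensor produced by A's operation.
for (Stage b : sch-&gt;stages) {
  for (Tensor t : b-&gt;op-&gt;InputTensors()) {
    Stage a = sch-&gt;stage_map[t-&gt;op];
    AddEdge(a, b);  // B consumes a tensor produced by A
  }
}
</pre></div>
 </div>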
@@ -417,17 +417,17 @@
 <p>InferBound makes one pass through the graph, visiting each stage exactly once. InferBound starts from the output stages (i.e., the solid blue nodes in the graph above), and moves upwards (in the opposite direction of the edges). This is achieved by performing a reverse topological sort on the nodes of the graph. Therefore, when InferBound visits a stage, each of its consumer stages has already been visited.</p>
 <img alt="https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/docs/inferbound/inferbound_traversal.png" class="align-center" src="https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/docs/inferbound/inferbound_traversal.png" />
 <p>The InferBound pass is shown in the following pseudo-code:</p>
-<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="n">Map</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="p">,</span> <span class="n">Range</span><span class="o">&gt;</span> <span class="n">InferBound</span><span class="p">(</span><span class="k">const</span> <span class="n">Schedule</span><span class="o">&amp;</span> <span class="n">sch</span><span class="p">)</span> <span class="p">{</span>
-     <span class="n">Array</span><span class="o">&lt;</span><span class="n">Operation</span><span class="o">&gt;</span> <span class="n">outputs</span> <span class="o">=</span> <span class="n">sch</span><span class="o">-&gt;</span><span class="n">get_outputs</span><span class="p">();</span>
-     <span class="n">G</span> <span class="o">=</span> <span class="n">CreateGraph</span><span class="p">(</span><span class="n">outputs</span><span class="p">);</span>
-     <span class="n">stage_list</span> <span class="o">=</span> <span class="n">sch</span><span class="o">-&gt;</span><span class="n">reverse_topological_sort</span><span class="p">(</span><span class="n">G</span><span class="p">);</span>
-     <span class="n">Map</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="p">,</span> <span class="n">Range</span><span class="o">&gt;</span> <span class="n">rmap</span><span class="p">;</span>
-     <span class="k">for</span> <span class="p">(</span><span class="n">Stage</span> <span class="n">s</span> <span class="n">in</span> <span class="n">stage_list</span><span class="p">)</span> <span class="p">{</span>
-             <span class="n">InferRootBound</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">rmap</span><span class="p">);</span>
-             <span class="n">PassDownDomain</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">rmap</span><span class="p">);</span>
-     <span class="p">}</span>
-     <span class="k">return</span> <span class="n">rmap</span><span class="p">;</span>
-<span class="p">}</span>
+<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="n">Map</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="p">,</span><span class="w"> </span><span class="n">Range</span><span class="o">&gt;</span><span class="w"> </span><span class="n">InferBound</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">Schedule</span><span class="o">&amp;</span><span class="w"> </span><span clas [...]
+<span class="w">     </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Operation</span><span class="o">&gt;</span><span class="w"> </span><span class="n">outputs</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">sch</span><span class="o">-&gt;</span><span class="n">get_outputs</span><span class="p">();</span><span class="w"></span>
+<span class="w">     </span><span class="n">G</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">CreateGraph</span><span class="p">(</span><span class="n">outputs</span><span class="p">);</span><span class="w"></span>
+<span class="w">     </span><span class="n">stage_list</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">sch</span><span class="o">-&gt;</span><span class="n">reverse_topological_sort</span><span class="p">(</span><span class="n">G</span><span class="p">);</span><span class="w"></span>
+<span class="w">     </span><span class="n">Map</span><span class="o">&lt;</span><span class="n">IterVar</span><span class="p">,</span><span class="w"> </span><span class="n">Range</span><span class="o">&gt;</span><span class="w"> </span><span class="n">rmap</span><span class="p">;</span><span class="w"></span>
+<span class="w">     </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="n">Stage</span><span class="w"> </span><span class="n">s</span><span class="w"> </span><span class="n">in</span><span class="w"> </span><span class="n">stage_list</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">             </span><span class="n">InferRootBound</span><span class="p">(</span><span class="n">s</span><span class="p">,</span><span class="w"> </span><span class="o">&amp;</span><span class="n">rmap</span><span class="p">);</span><span class="w"></span>
+<span class="w">             </span><span class="n">PassDownDomain</span><span class="p">(</span><span class="n">s</span><span class="p">,</span><span class="w"> </span><span class="o">&amp;</span><span class="n">rmap</span><span class="p">);</span><span class="w"></span>
+<span class="w">     </span><span class="p">}</span><span class="w"></span>
+<span class="w">     </span><span class="k">return</span><span class="w"> </span><span class="n">rmap</span><span class="p">;</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>The InferBound pass has two interesting properties that are not immediately obvious:</p>
@@ -450,13 +450,13 @@
 <p>The purpose of PassDownDomain is to take the Ranges produced by InferRootBound for the root_iter_vars, and set the Ranges of all other IterVars in the stage.</p>
 <p>PassDownDomain iterates through the stage’s IterVarRelations. There are three possible types of IterVarRelation: split, fuse, and rebase. The most interesting case (since it offers an opportunity for improvement) is an IterVarRelation representing a split.</p>
 <p>The Ranges of the inner and outer IterVars of the split are set based on the parent IterVar’s known Range, as follows:</p>
-<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="n">rmap</span><span class="p">[</span><span class="n">split</span><span class="o">-&gt;</span><span class="n">inner</span><span class="p">]</span> <span class="o">=</span> <span class="n">Range</span><span class="o">::</span><span class="n">FromMinExtent</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">split</span><span class="o">-&gt;</span><span cl [...]
-<span class="n">rmap</span><span class="p">[</span><span class="n">split</span><span class="o">-&gt;</span><span class="n">outer</span><span class="p">]</span> <span class="o">=</span> <span class="n">Range</span><span class="o">::</span><span class="n">FromMinExtent</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">DivCeil</span><span class="p">(</span><span class="n">rmap</span><span class="p">[</span><span class="n">split</span><span class [...]
+<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="n">rmap</span><span class="p">[</span><span class="n">split</span><span class="o">-&gt;</span><span class="n">inner</span><span class="p">]</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Range</span><span class="o">::</span><span class="n">FromMinExtent</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </ [...]
+<span class="n">rmap</span><span class="p">[</span><span class="n">split</span><span class="o">-&gt;</span><span class="n">outer</span><span class="p">]</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Range</span><span class="o">::</span><span class="n">FromMinExtent</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">DivCeil</span><span class="p">(</span><span class="n">rmap< [...]
 </pre></div>
 </div>
 <p>There is an opportunity here to tighten the bounds produced by InferBound when <code class="docutils literal notranslate"><span class="pre">split-&gt;factor</span></code> does not evenly divide the parent’s extent. Suppose the parent’s extent is 20, and the split factor is 16. Then on the second iteration of the outer loop, the inner loop only needs to perform 4 iterations, not 16. If PassDownDomain could set the extent of <code class="docutils literal notranslate"><span class="pre"> [...]
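 <p>A hedged sketch of what such a tightened bound could look like (hypothetical; this is not what PassDownDomain currently does):</p>
 <div class="highlight-cpp notranslate"><div class="highlight"><pre>// Hypothetical tightening: clamp the inner extent on the final outer
// iteration instead of always using split-&gt;factor. With a parent extent
// of 20 and a factor of 16, the second outer iteration would get an
// inner extent of 4 rather than 16.
inner_extent = min(split-&gt;factor,
                   parent_extent - outer_index * split-&gt;factor);
</pre></div>
 </div>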
 <p>For Fuse relations, the Range of the fused IterVar is set based on the known Ranges of the inner and outer IterVars, as follows:</p>
-<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="n">rmap</span><span class="p">[</span><span class="n">fuse</span><span class="o">-&gt;</span><span class="n">fused</span><span class="p">]</span> <span class="o">=</span> <span class="n">Range</span><span class="o">::</span><span class="n">FromMinExtent</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">rmap</span><span class="p">[</span><span class="n [...]
+<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="n">rmap</span><span class="p">[</span><span class="n">fuse</span><span class="o">-&gt;</span><span class="n">fused</span><span class="p">]</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Range</span><span class="o">::</span><span class="n">FromMinExtent</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </s [...]
 </pre></div>
 </div>
 </div>
@@ -480,7 +480,7 @@
 <div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="cm">/*</span>
 <span class="cm"> * Input: Map&lt;IterVar, Range&gt; rmap: contains the Range for each IterVar of the consumer stage</span>
 <span class="cm"> * Output: Map&lt;IterVar, IntSet&gt; up_state: contains an IntSet for each leaf_iter_var of the consumer</span>
-<span class="cm"> */</span>
+<span class="cm"> */</span><span class="w"></span>
 </pre></div>
 </div>
 <p>In Phase 1, IntSets for each of the consumer’s leaf_iter_vars are created, based on the Ranges of the leaf_iter_vars from <code class="docutils literal notranslate"><span class="pre">rmap</span></code>.  Recall that the consumer has already been visited by InferBound, so all of its IterVars have known Ranges in <code class="docutils literal notranslate"><span class="pre">rmap</span></code>.</p>
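 <p>In the simplest case, this amounts to seeding <code class="docutils literal notranslate"><span class="pre">up_state</span></code> from the known Ranges, as in the following simplified sketch (it ignores the compute_at cases discussed later):</p>
 <div class="highlight-cpp notranslate"><div class="highlight"><pre>// Simplified sketch of Phase 1 (ignores compute_at): each consumer leaf
// IterVar starts from its known Range in rmap.
for (IterVar leaf : consumer-&gt;leaf_iter_vars) {
  up_state[leaf] = IntSet::range(rmap[leaf]);
}
</pre></div>
 </div>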
@@ -497,7 +497,7 @@
 <div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="cm">/*</span>
 <span class="cm"> * Input: Map&lt;IterVar, IntSet&gt; up_state: consumer leaf -&gt; IntSet</span>
 <span class="cm"> * Output: Map&lt;IterVar, IntSet&gt; dom_map: consumer root -&gt; IntSet</span>
-<span class="cm"> */</span>
+<span class="cm"> */</span><span class="w"></span>
 </pre></div>
 </div>
 <p>The purpose of Phase 2 is to propagate the IntSet information from the consumer’s leaf_iter_vars to the consumer’s root_iter_vars. The result of Phase 2 is another map, <code class="docutils literal notranslate"><span class="pre">dom_map</span></code>, that contains an IntSet for each of the consumer’s root_iter_vars.</p>
@@ -508,7 +508,7 @@
 </ul>
 <p>Case 2 is only needed if the schedule contains compute_at. Please refer to the section <a class="reference internal" href="#inferboundca"><span class="std std-ref">InferBound with compute_at</span></a> below for further explanation.</p>
 <p>After PassUpDomain has finished propagating up_state to all IterVars of the consumer, a fresh map, from root_iter_vars to IntSet, is created. If the schedule does not contain compute_at, the IntSet for root_iter_var <code class="docutils literal notranslate"><span class="pre">iv</span></code> is created by the following code:</p>
-<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="n">dom_map</span><span class="p">[</span><span class="n">iv</span><span class="o">-&gt;</span><span class="n">var</span><span class="p">.</span><span class="n">get</span><span class="p">()]</span> <span class="o">=</span> <span class="n">IntSet</span><span class="o">::</span><span class="n">range</span><span class="p">(</span><span class="n">up_state</span><span class="p">.</span><span class="n"> [...]
+<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="n">dom_map</span><span class="p">[</span><span class="n">iv</span><span class="o">-&gt;</span><span class="n">var</span><span class="p">.</span><span class="n">get</span><span class="p">()]</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">IntSet</span><span class="o">::</span><span class="n">range</span><span class="p">(</span><span class="n">up_state< [...]
 </pre></div>
 </div>
 <p>Note that if the schedule does not contain compute_at, Phases 1-2 are actually unnecessary. dom_map can be built directly from the known Ranges in rmap. Ranges simply need to be converted to IntSets, which involves no loss of information.</p>
@@ -518,17 +518,17 @@
 <div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="cm">/*</span>
 <span class="cm"> * Input: Map&lt;IterVar, IntSet&gt; dom_map: consumer root -&gt; IntSet</span>
 <span class="cm"> * Output: Map&lt;Tensor, TensorDom&gt; tmap: output tensor -&gt; vector&lt;vector&lt;IntSet&gt; &gt;</span>
-<span class="cm"> */</span>
+<span class="cm"> */</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Note that the consumer’s input tensors are output tensors of the stage InferBound is working on. So by establishing information about the consumer’s input tensors, we actually obtain information about the stage’s output tensors too: the consumers require certain regions of these tensors to be computed. This information can then be propagated through the rest of the stage, eventually obtaining Ranges for the stage’s root_iter_vars by the end of Phase 4.</p>
 <p>The output of Phase 3 is tmap, which is a map containing all of the stage’s output tensors. Recall that a Tensor is multi-dimensional, with a number of different axes. For each output tensor, and each of that tensor’s axes, tmap contains a list of IntSets. Each IntSet in the list is a request from a different consumer.</p>
 <p>Phase 3 is accomplished by calling PropBoundToInputs on the consumer. PropBoundToInputs adds IntSets to tmap’s lists, for all input Tensors of the consumer.</p>
 <p>The exact behavior of PropBoundToInputs depends on the type of the consumer’s operation: ComputeOp, TensorComputeOp, PlaceholderOp, ExternOp, etc. Consider the case of TensorComputeOp. A TensorComputeOp already has a Region for each of its Tensor inputs, defining the slice of the tensor that the operation depends on. For each input tensor i and dimension j, a request is added to tmap based on the corresponding dimension in the Region:</p>
-<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="k">for</span> <span class="p">(</span><span class="kt">size_t</span> <span class="n">j</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> <span class="n">j</span> <span class="o">&lt;</span> <span class="n">t</span><span class="p">.</span><span class="n">ndim</span><span class="p">();</span> <span class="o">++</span><span class="n">j</span><span class="p">)</span> < [...]
-     <span class="c1">// i selects the Tensor t</span>
-     <span class="n">tmap</span><span class="p">[</span><span class="n">i</span><span class="p">][</span><span class="n">j</span><span class="p">].</span><span class="n">push_back</span><span class="p">(</span><span class="n">EvalSet</span><span class="p">(</span><span class="n">region</span><span class="p">[</span><span class="n">j</span><span class="p">],</span> <span class="n">dom_map</span><span class="p">));</span>
-<span class="p">}</span>
+<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="kt">size_t</span><span class="w"> </span><span class="n">j</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"> </span><span class="n">j</span><span class="w"> </span><span class="o">&lt;</span><span class="w"> </span><span class="n"> [...]
+<span class="w">     </span><span class="c1">// i selects the Tensor t</span>
+<span class="w">     </span><span class="n">tmap</span><span class="p">[</span><span class="n">i</span><span class="p">][</span><span class="n">j</span><span class="p">].</span><span class="n">push_back</span><span class="p">(</span><span class="n">EvalSet</span><span class="p">(</span><span class="n">region</span><span class="p">[</span><span class="n">j</span><span class="p">],</span><span class="w"> </span><span class="n">dom_map</span><span class="p">));</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 </div>
@@ -537,14 +537,14 @@
 <div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="cm">/*</span>
 <span class="cm"> * Input: Map&lt;Tensor, TensorDom&gt; tmap: output tensor -&gt; vector&lt;vector&lt;IntSet&gt; &gt;</span>
 <span class="cm"> * Output: Map&lt;IterVar, Range&gt; rmap: rmap is populated for all of the stage&#39;s root_iter_vars</span>
-<span class="cm"> */</span>
+<span class="cm"> */</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Phase 4 is performed by GatherBound, whose behavior depends on the type of operation of the stage. We discuss the ComputeOp case only, but TensorComputeOp is the same.</p>
 <p>A ComputeOp has only a single output Tensor, whose axes correspond to the axis variables of the ComputeOp. The root_iter_vars of a ComputeOp include these axis variables, as well as the reduce_axis variables. If the root IterVar is an axis var, it corresponds to one of the axes of the output Tensor. GatherBound sets the Range of such a root IterVar to the union of all IntSets (i.e., union of all consumer requests) for the corresponding axis of the tensor. If the root IterVar is a redu [...]
 <div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="c1">// &#39;output&#39; selects the output tensor</span>
 <span class="c1">// i is the dimension</span>
-<span class="n">rmap</span><span class="p">[</span><span class="n">axis</span><span class="p">[</span><span class="n">i</span><span class="p">]]</span> <span class="o">=</span> <span class="n">arith</span><span class="o">::</span><span class="n">Union</span><span class="p">(</span><span class="n">tmap</span><span class="p">[</span><span class="n">output</span><span class="p">][</span><span class="n">i</span><span class="p">]).</span><span class="n">cover_range</span><span class="p">(</sp [...]
+<span class="n">rmap</span><span class="p">[</span><span class="n">axis</span><span class="p">[</span><span class="n">i</span><span class="p">]]</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">arith</span><span class="o">::</span><span class="n">Union</span><span class="p">(</span><span class="n">tmap</span><span class="p">[</span><span class="n">output</span><span class="p">][</span><span class="n">i</span><span class="p">]).</span><span cla [...]
 </pre></div>
 </div>
 <img alt="https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/docs/inferbound/gatherbound.png" class="align-center" src="https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/docs/inferbound/gatherbound.png" />
@@ -603,22 +603,22 @@
 <div class="section" id="attach-paths">
 <span id="attachpaths"></span><h3>Attach Paths<a class="headerlink" href="#attach-paths" title="Permalink to this headline">¶</a></h3>
 <p>If stage C is computed at axis j of stage D, we say that C is  <em>attached</em>  to axis j of stage D. This is reflected in the Stage object by setting the following three member variables:</p>
-<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="k">class</span> <span class="nc">StageNode</span> <span class="o">:</span> <span class="k">public</span> <span class="n">Node</span> <span class="p">{</span>
-<span class="k">public</span><span class="o">:</span>
-    <span class="c1">// omitted</span>
+<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">StageNode</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">Node</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="k">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">    </span><span class="c1">// omitted</span>
 
-    <span class="c1">// For compute_at, attach_type = kScope</span>
-    <span class="n">AttachType</span> <span class="n">attach_type</span><span class="p">;</span>
+<span class="w">    </span><span class="c1">// For compute_at, attach_type = kScope</span>
+<span class="w">    </span><span class="n">AttachType</span><span class="w"> </span><span class="n">attach_type</span><span class="p">;</span><span class="w"></span>
 
-    <span class="c1">// For compute_at, this is the axis</span>
-    <span class="c1">// passed to compute_at, e.g., D.op.axis[1]</span>
-    <span class="n">IterVar</span> <span class="n">attach_ivar</span><span class="p">;</span>
+<span class="w">    </span><span class="c1">// For compute_at, this is the axis</span>
+<span class="w">    </span><span class="c1">// passed to compute_at, e.g., D.op.axis[1]</span>
+<span class="w">    </span><span class="n">IterVar</span><span class="w"> </span><span class="n">attach_ivar</span><span class="p">;</span><span class="w"></span>
 
-    <span class="c1">// The stage passed to compute_at, e.g., D</span>
-    <span class="n">Stage</span> <span class="n">attach_stage</span><span class="p">;</span>
+<span class="w">    </span><span class="c1">// The stage passed to compute_at, e.g., D</span>
+<span class="w">    </span><span class="n">Stage</span><span class="w"> </span><span class="n">attach_stage</span><span class="p">;</span><span class="w"></span>
 
-    <span class="c1">// omitted</span>
-<span class="p">};</span>
+<span class="w">    </span><span class="c1">// omitted</span>
+<span class="p">};</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Consider the above examples again. In order for InferBound to determine how many elements of C must be computed, it is important to know whether the computation of C occurs within the scope of a leaf variable of D, or above that scope. For example, in Ex. 1, the computation of C occurs  <em>above</em>  the scopes of all of D’s leaf variables. In Ex. 2, the computation of C occurs  <em>within</em>  the scope of all of D’s leaf variables. In Ex. 3, C occurs within the scope of D’s i, bu [...]
@@ -725,7 +725,7 @@
 <div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="cm">/*</span>
 <span class="cm"> * Input: Map&lt;IterVar, Range&gt; rmap: contains the Range for each IterVar of the consumer stage</span>
 <span class="cm"> * Output: Map&lt;IterVar, IntSet&gt; up_state: contains an IntSet for each leaf_iter_var of the consumer</span>
-<span class="cm"> */</span>
+<span class="cm"> */</span><span class="w"></span>
 </pre></div>
 </div>
 <p>In Phase 1, IntSets for each of the consumer’s leaf_iter_vars are created, based on the Ranges of the leaf_iter_vars from rmap. Recall that the consumer has already been visited by InferBound, so all of its IterVars have known Ranges in rmap.</p>
@@ -742,7 +742,7 @@
 <div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="cm">/*</span>
 <span class="cm"> * Input: Map&lt;IterVar, IntSet&gt; up_state: consumer leaf -&gt; IntSet</span>
 <span class="cm"> * Output: Map&lt;IterVar, IntSet&gt; dom_map: consumer root -&gt; IntSet</span>
-<span class="cm"> */</span>
+<span class="cm"> */</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Phase 2 begins by calling PassUpDomain, which visits the IterVarRelations of the consumer stage. In the case of a Split relation, PassUpDomain sets the up_state of the parent IterVar, based on the inner and outer IntSets, as follows:</p>
@@ -776,7 +776,7 @@
 <p><strong>Ex. 6</strong></p>
 <p>Above, we discussed the behavior of PassUpDomain on Split relations only. In the following example, the schedule contains <code class="docutils literal notranslate"><span class="pre">fuse</span></code> in addition to <code class="docutils literal notranslate"><span class="pre">split</span></code>. In the TVM program below, the operation C has two axes that are fused, and then the fused axis is split. Note that all tensors are originally of shape <code class="docutils literal notransla [...]
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
 
 <span class="n">n</span> <span class="o">=</span> <span class="mi">4</span>
 <span class="n">m</span> <span class="o">=</span> <span class="mi">4</span>
diff --git a/docs/arch/introduction_to_module_serialization.html b/docs/arch/introduction_to_module_serialization.html
index b90a7c2bb..4b2de153b 100644
--- a/docs/arch/introduction_to_module_serialization.html
+++ b/docs/arch/introduction_to_module_serialization.html
@@ -403,21 +403,21 @@ function, we firstly construct one helper class <code class="docutils literal no
 initialization work, like marking the module index. Then we can use its <code class="docutils literal notranslate"><span class="pre">SerializeModule</span></code> method to serialize the module.</p>
 <p>For better understanding, let us dig into the implementation of this class a little deeper.</p>
 <p>The following code is used to construct <code class="docutils literal notranslate"><span class="pre">ModuleSerializer</span></code>:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">explicit</span> <span class="nf">ModuleSerializer</span><span class="p">(</span><span class="n">runtime</span><span class="o">::</span><span class="n">Module</span> <span class="n">mod</span><span class="p">)</span> <span class="o">:</span> <span class="n">mod_</span><span class="p">(</span><span class="n">mod</span><span class="p">)</span> <span class="p">{</span>
-  <span class="n">Init</span><span class="p">();</span>
-<span class="p">}</span>
-<span class="k">private</span><span class="o">:</span>
-<span class="kt">void</span> <span class="n">Init</span><span class="p">()</span> <span class="p">{</span>
-  <span class="n">CreateModuleIndex</span><span class="p">();</span>
-  <span class="n">CreateImportTree</span><span class="p">();</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">explicit</span><span class="w"> </span><span class="n">ModuleSerializer</span><span class="p">(</span><span class="n">runtime</span><span class="o">::</span><span class="n">Module</span><span class="w"> </span><span class="n">mod</span><span class="p">)</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="n">mod_</span><span class="p">(</span><span class=" [...]
+<span class="w">  </span><span class="n">Init</span><span class="p">();</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
+<span class="k">private</span><span class="o">:</span><span class="w"></span>
+<span class="kt">void</span><span class="w"> </span><span class="n">Init</span><span class="p">()</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">CreateModuleIndex</span><span class="p">();</span><span class="w"></span>
+<span class="w">  </span><span class="n">CreateImportTree</span><span class="p">();</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>In <code class="docutils literal notranslate"><span class="pre">CreateModuleIndex()</span></code>, we will inspect the module import relationships
 using DFS and create an index for each module. Note that the root module is fixed at
 location 0. In our example, we have a module relationship like this:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="nl">llvm_mod</span><span class="p">:</span><span class="n">imported_modules</span>
-  <span class="o">-</span> <span class="n">cuda_mod</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="nl">llvm_mod</span><span class="p">:</span><span class="n">imported_modules</span><span class="w"></span>
+<span class="w">  </span><span class="o">-</span><span class="w"> </span><span class="n">cuda_mod</span><span class="w"></span>
 </pre></div>
 </div>
 <p>So the LLVM module will have index 0, and the CUDA module will have index 1.</p>
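 <p>A hedged sketch of that DFS (the function name and the index map are assumptions, not the actual implementation):</p>
 <div class="highlight-c++ notranslate"><div class="highlight"><pre>// Hypothetical sketch: depth-first traversal over imported modules,
// assigning indices in visit order so the root module receives index 0.
#include &lt;unordered_map&gt;
#include &lt;tvm/runtime/module.h&gt;

void CreateIndex(tvm::runtime::Module mod,
                 std::unordered_map&lt;const tvm::runtime::ModuleNode*, size_t&gt;* mod2index) {
  const tvm::runtime::ModuleNode* node = mod.operator-&gt;();
  if (mod2index-&gt;count(node)) return;   // already indexed
  size_t index = mod2index-&gt;size();     // root module is visited first, gets 0
  (*mod2index)[node] = index;
  for (tvm::runtime::Module m : mod-&gt;imports()) {
    CreateIndex(m, mod2index);            // recurse into imported children
  }
}
</pre></div>
 </div>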
@@ -429,14 +429,14 @@ index. In code, we use <code class="docutils literal notranslate"><span class="p
 <code class="docutils literal notranslate"><span class="pre">import_tree_child_indices_</span></code> to represent them.</p>
 <p>After initialization, we can serialize the module using the <code class="docutils literal notranslate"><span class="pre">SerializeModule</span></code> function.
 Its logic assumes a serialization format like this:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">binary_blob_size</span>
-<span class="n">binary_blob_type_key</span>
-<span class="n">binary_blob_logic</span>
-<span class="n">binary_blob_type_key</span>
-<span class="n">binary_blob_logic</span>
-<span class="p">...</span>
-<span class="n">_import_tree</span>
-<span class="n">_import_tree_logic</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">binary_blob_size</span><span class="w"></span>
+<span class="n">binary_blob_type_key</span><span class="w"></span>
+<span class="n">binary_blob_logic</span><span class="w"></span>
+<span class="n">binary_blob_type_key</span><span class="w"></span>
+<span class="n">binary_blob_logic</span><span class="w"></span>
+<span class="p">...</span><span class="w"></span>
+<span class="n">_import_tree</span><span class="w"></span>
+<span class="n">_import_tree_logic</span><span class="w"></span>
 </pre></div>
 </div>
 <p><code class="docutils literal notranslate"><span class="pre">binary_blob_size</span></code> is the number of blobs we will have in this
@@ -477,37 +477,37 @@ is to call <code class="docutils literal notranslate"><span class="pre">_LoadFro
 according to the function logic, we will call <code class="docutils literal notranslate"><span class="pre">module.loadfile_so</span></code> in
 <code class="docutils literal notranslate"><span class="pre">dso_library.cc</span></code>. The key part is here:</p>
 <div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="c1">// Load the imported modules</span>
-<span class="k">const</span> <span class="kt">char</span><span class="o">*</span> <span class="n">dev_mblob</span> <span class="o">=</span> <span class="k">reinterpret_cast</span><span class="o">&lt;</span><span class="k">const</span> <span class="kt">char</span><span class="o">*&gt;</span><span class="p">(</span><span class="n">lib</span><span class="o">-&gt;</span><span class="n">GetSymbol</span><span class="p">(</span><span class="n">runtime</span><span class="o">::</span><span class= [...]
-<span class="n">Module</span> <span class="n">root_mod</span><span class="p">;</span>
-<span class="k">if</span> <span class="p">(</span><span class="n">dev_mblob</span> <span class="o">!=</span> <span class="k">nullptr</span><span class="p">)</span> <span class="p">{</span>
-<span class="n">root_mod</span> <span class="o">=</span> <span class="n">ProcessModuleBlob</span><span class="p">(</span><span class="n">dev_mblob</span><span class="p">,</span> <span class="n">lib</span><span class="p">);</span>
-<span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
+<span class="k">const</span><span class="w"> </span><span class="kt">char</span><span class="o">*</span><span class="w"> </span><span class="n">dev_mblob</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">reinterpret_cast</span><span class="o">&lt;</span><span class="k">const</span><span class="w"> </span><span class="kt">char</span><span class="o">*&gt;</span><span class="p">(</span><span class="n">lib</span><span class="o">-&gt;</span><span cl [...]
+<span class="n">Module</span><span class="w"> </span><span class="n">root_mod</span><span class="p">;</span><span class="w"></span>
+<span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">dev_mblob</span><span class="w"> </span><span class="o">!=</span><span class="w"> </span><span class="k">nullptr</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="n">root_mod</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ProcessModuleBlob</span><span class="p">(</span><span class="n">dev_mblob</span><span class="p">,</span><span class="w"> </span><span class="n">lib</span><span class="p">);</span><span class="w"></span>
+<span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
 <span class="c1">// Only have one single DSO Module</span>
-<span class="n">root_mod</span> <span class="o">=</span> <span class="n">Module</span><span class="p">(</span><span class="n">n</span><span class="p">);</span>
-<span class="p">}</span>
+<span class="n">root_mod</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Module</span><span class="p">(</span><span class="n">n</span><span class="p">);</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>As mentioned before, we pack the blob into the symbol
 <code class="docutils literal notranslate"><span class="pre">runtime::symbol::tvm_dev_mblob</span></code>. During the deserialization part, we
 inspect it. If we find <code class="docutils literal notranslate"><span class="pre">runtime::symbol::tvm_dev_mblob</span></code>, we call <code class="docutils literal notranslate"><span class="pre">ProcessModuleBlob</span></code>,
 whose logic is like this:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">READ</span><span class="p">(</span><span class="n">blob_size</span><span class="p">)</span>
-<span class="n">READ</span><span class="p">(</span><span class="n">blob_type_key</span><span class="p">)</span>
-<span class="k">for</span> <span class="p">(</span><span class="kt">size_t</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="n">blob_size</span><span class="p">;</span> <span class="n">i</span><span class="o">++</span><span class="p">)</span> <span class="p">{</span>
-    <span class="k">if</span> <span class="p">(</span><span class="n">blob_type_key</span> <span class="o">==</span> <span class="s">&quot;_lib&quot;</span><span class="p">)</span> <span class="p">{</span>
-      <span class="c1">// construct dso module using lib</span>
-    <span class="p">}</span> <span class="k">else</span> <span class="k">if</span> <span class="p">(</span><span class="n">blob_type_key</span> <span class="o">==</span> <span class="s">&quot;_import_tree&quot;</span><span class="p">)</span> <span class="p">{</span>
-      <span class="c1">// READ(_import_tree_row_ptr)</span>
-      <span class="c1">// READ(_import_tree_child_indices)</span>
-    <span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
-      <span class="c1">// call module.loadbinary_blob_type_key, such as module.loadbinary_cuda</span>
-      <span class="c1">// to restore.</span>
-    <span class="p">}</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">READ</span><span class="p">(</span><span class="n">blob_size</span><span class="p">)</span><span class="w"></span>
+<span class="n">READ</span><span class="p">(</span><span class="n">blob_type_key</span><span class="p">)</span><span class="w"></span>
+<span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="kt">size_t</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">&lt;</span><span class="w"> </span><span class="n">blob_size</span><span class="p">;</span><span class="w"> </span><span class="n"> [...]
+<span class="w">    </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">blob_type_key</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="s">&quot;_lib&quot;</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">      </span><span class="c1">// construct dso module using lib</span>
+<span class="w">    </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">blob_type_key</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="s">&quot;_import_tree&quot;</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">      </span><span class="c1">// READ(_import_tree_row_ptr)</span>
+<span class="w">      </span><span class="c1">// READ(_import_tree_child_indices)</span>
+<span class="w">    </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">      </span><span class="c1">// call module.loadbinary_blob_type_key, such as module.loadbinary_cuda</span>
+<span class="w">      </span><span class="c1">// to restore.</span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 <span class="c1">// Using _import_tree_row_ptr and _import_tree_child_indices to</span>
 <span class="c1">// restore module import relationship. The first module is the</span>
 <span class="c1">// root module according to our invariance as said before.</span>
-<span class="k">return</span> <span class="n">root_module</span><span class="p">;</span>
+<span class="k">return</span><span class="w"> </span><span class="n">root_module</span><span class="p">;</span><span class="w"></span>
 </pre></div>
 </div>
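<p>To make the import-tree encoding concrete, here is a small, hypothetical
illustration (the container types below are an assumption for the sketch): the
two arrays form a CSR-style layout in which
<code class="docutils literal notranslate"><span class="pre">_import_tree_row_ptr</span></code> stores, for each module, the offsets of its
children within <code class="docutils literal notranslate"><span class="pre">_import_tree_child_indices</span></code>.</p>
<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span>// Hypothetical sketch: four modules, indexed 0..3.
// Module 0 (root) imports modules 1 and 2; module 1 imports module 3.
std::vector&lt;uint64_t&gt; _import_tree_row_ptr = {0, 2, 3, 3, 3};
std::vector&lt;uint64_t&gt; _import_tree_child_indices = {1, 2, 3};
// The children of module i are child_indices[row_ptr[i] .. row_ptr[i+1]).
// Module 0: [0, 2) -&gt; {1, 2}; module 1: [2, 3) -&gt; {3};
// modules 2 and 3 have empty ranges and import nothing.
</pre></div>
</div>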
 <p>After this, we will set the <code class="docutils literal notranslate"><span class="pre">ctx_address</span></code> to be the <code class="docutils literal notranslate"><span class="pre">root_module</span></code> so
diff --git a/docs/arch/pass_infra.html b/docs/arch/pass_infra.html
index 903e58ede..200495c03 100644
--- a/docs/arch/pass_infra.html
+++ b/docs/arch/pass_infra.html
@@ -386,11 +386,11 @@ level it will be performed at, and/or the passes that are required.
 <code class="docutils literal notranslate"><span class="pre">opt_level</span></code> could be used to help the pass infra identify if a certain pass
 needs to be executed when running under a user-provided optimization level. The
 <code class="docutils literal notranslate"><span class="pre">required</span></code> field can be used by the pass infra to resolve pass dependencies.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">class</span> <span class="nc">PassInfoNode</span> <span class="o">:</span> <span class="k">public</span> <span class="n">Object</span> <span class="p">{</span>
-  <span class="n">String</span> <span class="n">name</span><span class="p">;</span>
-  <span class="kt">int</span> <span class="n">opt_level</span><span class="p">;</span>
-  <span class="n">Array</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">required</span><span class="p">;</span>
-<span class="p">};</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">PassInfoNode</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">Object</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">String</span><span class="w"> </span><span class="n">name</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="kt">int</span><span class="w"> </span><span class="n">opt_level</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span><span class="w"> </span><span class="n">required</span><span class="p">;</span><span class="w"></span>
+<span class="p">};</span><span class="w"></span>
 </pre></div>
 </div>
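<p>As a minimal sketch, assuming a <code class="docutils literal notranslate"><span class="pre">PassInfo</span></code> constructor that takes
<code class="docutils literal notranslate"><span class="pre">opt_level</span></code>, <code class="docutils literal notranslate"><span class="pre">name</span></code>, and <code class="docutils literal notranslate"><span class="pre">required</span></code> in that order (the exact
signature may differ), the metadata of a pass that runs at optimization level 2
and has no prerequisites could be built as:</p>
<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span>// Hypothetical sketch of building pass metadata.
PassInfo pass_info = PassInfo(/*opt_level=*/2, /*name=*/&quot;FoldConstant&quot;, /*required=*/{});
</pre></div>
</div>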
 <div class="section" id="passcontext">
@@ -412,49 +412,49 @@ a thread-safe way through <code class="docutils literal notranslate"><span class
 <code class="docutils literal notranslate"><span class="pre">PassContextThreadLocalStore</span></code> is used to hold the created pass context
 objects. Examples will be provided later to show how we can use both the C++ and
 Python APIs to create a compilation pipeline using pass context.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">class</span> <span class="nc">PassContextNode</span> <span class="o">:</span> <span class="k">public</span> <span class="n">Object</span> <span class="p">{</span>
- <span class="k">public</span><span class="o">:</span>
-  <span class="kt">int</span> <span class="n">opt_level</span><span class="p">{</span><span class="mi">2</span><span class="p">};</span>
-  <span class="n">tvm</span><span class="o">::</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">tvm</span><span class="o">::</span><span class="n">Expr</span><span class="o">&gt;</span> <span class="n">required_pass</span><span class="p">;</span>
-  <span class="n">tvm</span><span class="o">::</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">tvm</span><span class="o">::</span><span class="n">Expr</span><span class="o">&gt;</span> <span class="n">disabled_pass</span><span class="p">;</span>
-  <span class="k">mutable</span> <span class="n">Optional</span><span class="o">&lt;</span><span class="n">DiagnosticContext</span><span class="o">&gt;</span> <span class="n">diag_ctx</span><span class="p">;</span>
-  <span class="n">Map</span><span class="o">&lt;</span><span class="n">String</span><span class="p">,</span> <span class="n">ObjectRef</span><span class="o">&gt;</span> <span class="n">config</span><span class="p">;</span>
-  <span class="n">Array</span><span class="o">&lt;</span><span class="n">instrument</span><span class="o">::</span><span class="n">PassInstrument</span><span class="o">&gt;</span> <span class="n">instruments</span><span class="p">;</span>
-<span class="p">};</span>
-
-<span class="k">class</span> <span class="nc">PassContext</span> <span class="o">:</span> <span class="k">public</span> <span class="n">NodeRef</span> <span class="p">{</span>
- <span class="k">public</span><span class="o">:</span>
-  <span class="n">TVM_DLL</span> <span class="k">static</span> <span class="n">PassContext</span> <span class="n">Create</span><span class="p">();</span>
-  <span class="n">TVM_DLL</span> <span class="k">static</span> <span class="n">PassContext</span> <span class="nf">Current</span><span class="p">();</span>
-  <span class="n">TVM_DLL</span> <span class="kt">void</span> <span class="nf">InstrumentEnterPassContext</span><span class="p">();</span>
-  <span class="n">TVM_DLL</span> <span class="kt">void</span> <span class="nf">InstrumentExitPassContext</span><span class="p">();</span>
-  <span class="n">TVM_DLL</span> <span class="kt">bool</span> <span class="nf">InstrumentBeforePass</span><span class="p">(</span><span class="k">const</span> <span class="n">IRModule</span><span class="o">&amp;</span> <span class="n">mod</span><span class="p">,</span> <span class="k">const</span> <span class="n">PassInfo</span><span class="o">&amp;</span> <span class="n">info</span><span class="p">)</span> <span class="k">const</span><span class="p">;</span>
-  <span class="n">TVM_DLL</span> <span class="kt">void</span> <span class="nf">InstrumentAfterPass</span><span class="p">(</span><span class="k">const</span> <span class="n">IRModule</span><span class="o">&amp;</span> <span class="n">mod</span><span class="p">,</span> <span class="k">const</span> <span class="n">PassInfo</span><span class="o">&amp;</span> <span class="n">info</span><span class="p">)</span> <span class="k">const</span><span class="p">;</span>
-  <span class="cm">/* Other fields are omitted. */</span>
-
- <span class="k">private</span><span class="o">:</span>
-  <span class="c1">// The entry of a pass context scope.</span>
-  <span class="n">TVM_DLL</span> <span class="kt">void</span> <span class="n">EnterWithScope</span><span class="p">();</span>
-  <span class="c1">// The exit of a pass context scope.</span>
-  <span class="n">TVM_DLL</span> <span class="kt">void</span> <span class="nf">ExitWithScope</span><span class="p">();</span>
-
-  <span class="c1">// Classes to get the Python `with` like syntax.</span>
-  <span class="k">friend</span> <span class="k">class</span> <span class="nc">tvm</span><span class="o">::</span><span class="n">With</span><span class="o">&lt;</span><span class="n">PassContext</span><span class="o">&gt;</span><span class="p">;</span>
-<span class="p">};</span>
-
-<span class="k">struct</span> <span class="n">PassContextThreadLocalEntry</span> <span class="p">{</span>
-  <span class="cm">/*! \brief The default pass context. */</span>
-  <span class="n">PassContext</span> <span class="n">default_context</span><span class="p">;</span>
-  <span class="cm">/*! \brief The current pass context. */</span>
-  <span class="n">std</span><span class="o">::</span><span class="n">stack</span><span class="o">&lt;</span><span class="n">PassContext</span><span class="o">&gt;</span> <span class="n">context_stack</span><span class="p">;</span>
-  <span class="n">PassContextThreadLocalEntry</span><span class="p">()</span> <span class="p">{</span>
-    <span class="n">default_context</span> <span class="o">=</span> <span class="n">PassContext</span><span class="p">(</span><span class="n">make_node</span><span class="o">&lt;</span><span class="n">PassContextNode</span><span class="o">&gt;</span><span class="p">());</span>
-  <span class="p">}</span>
-<span class="p">};</span>
-
-<span class="cm">/*! \brief The thread-local store to hold the pass context. */</span>
-<span class="k">typedef</span> <span class="n">dmlc</span><span class="o">::</span><span class="n">ThreadLocalStore</span><span class="o">&lt;</span><span class="n">PassContextThreadLocalEntry</span><span class="o">&gt;</span>
-     <span class="n">PassContextThreadLocalStore</span><span class="p">;</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">PassContextNode</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">Object</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w"> </span><span class="k">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">  </span><span class="kt">int</span><span class="w"> </span><span class="n">opt_level</span><span class="p">{</span><span class="mi">2</span><span class="p">};</span><span class="w"></span>
+<span class="w">  </span><span class="n">tvm</span><span class="o">::</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">tvm</span><span class="o">::</span><span class="n">Expr</span><span class="o">&gt;</span><span class="w"> </span><span class="n">required_pass</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">tvm</span><span class="o">::</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">tvm</span><span class="o">::</span><span class="n">Expr</span><span class="o">&gt;</span><span class="w"> </span><span class="n">disabled_pass</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="k">mutable</span><span class="w"> </span><span class="n">Optional</span><span class="o">&lt;</span><span class="n">DiagnosticContext</span><span class="o">&gt;</span><span class="w"> </span><span class="n">diag_ctx</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">Map</span><span class="o">&lt;</span><span class="n">String</span><span class="p">,</span><span class="w"> </span><span class="n">ObjectRef</span><span class="o">&gt;</span><span class="w"> </span><span class="n">config</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">instrument</span><span class="o">::</span><span class="n">PassInstrument</span><span class="o">&gt;</span><span class="w"> </span><span class="n">instruments</span><span class="p">;</span><span class="w"></span>
+<span class="p">};</span><span class="w"></span>
+
+<span class="k">class</span><span class="w"> </span><span class="nc">PassContext</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">NodeRef</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w"> </span><span class="k">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">  </span><span class="n">TVM_DLL</span><span class="w"> </span><span class="k">static</span><span class="w"> </span><span class="n">PassContext</span><span class="w"> </span><span class="n">Create</span><span class="p">();</span><span class="w"></span>
+<span class="w">  </span><span class="n">TVM_DLL</span><span class="w"> </span><span class="k">static</span><span class="w"> </span><span class="n">PassContext</span><span class="w"> </span><span class="n">Current</span><span class="p">();</span><span class="w"></span>
+<span class="w">  </span><span class="n">TVM_DLL</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">InstrumentEnterPassContext</span><span class="p">();</span><span class="w"></span>
+<span class="w">  </span><span class="n">TVM_DLL</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">InstrumentExitPassContext</span><span class="p">();</span><span class="w"></span>
+<span class="w">  </span><span class="n">TVM_DLL</span><span class="w"> </span><span class="kt">bool</span><span class="w"> </span><span class="n">InstrumentBeforePass</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">IRModule</span><span class="o">&amp;</span><span class="w"> </span><span class="n">mod</span><span class="p">,</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">PassInfo</span><s [...]
+<span class="w">  </span><span class="n">TVM_DLL</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">InstrumentAfterPass</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">IRModule</span><span class="o">&amp;</span><span class="w"> </span><span class="n">mod</span><span class="p">,</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">PassInfo</span><sp [...]
+<span class="w">  </span><span class="cm">/* Other fields are omitted. */</span><span class="w"></span>
+
+<span class="w"> </span><span class="k">private</span><span class="o">:</span><span class="w"></span>
+<span class="w">  </span><span class="c1">// The entry of a pass context scope.</span>
+<span class="w">  </span><span class="n">TVM_DLL</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">EnterWithScope</span><span class="p">();</span><span class="w"></span>
+<span class="w">  </span><span class="c1">// The exit of a pass context scope.</span>
+<span class="w">  </span><span class="n">TVM_DLL</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">ExitWithScope</span><span class="p">();</span><span class="w"></span>
+
+<span class="w">  </span><span class="c1">// Classes to get the Python `with` like syntax.</span>
+<span class="w">  </span><span class="k">friend</span><span class="w"> </span><span class="k">class</span><span class="w"> </span><span class="nc">tvm</span><span class="o">::</span><span class="n">With</span><span class="o">&lt;</span><span class="n">PassContext</span><span class="o">&gt;</span><span class="p">;</span><span class="w"></span>
+<span class="p">};</span><span class="w"></span>
+
+<span class="k">struct</span><span class="w"> </span><span class="nc">PassContextThreadLocalEntry</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="cm">/*! \brief The default pass context. */</span><span class="w"></span>
+<span class="w">  </span><span class="n">PassContext</span><span class="w"> </span><span class="n">default_context</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="cm">/*! \brief The current pass context. */</span><span class="w"></span>
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">stack</span><span class="o">&lt;</span><span class="n">PassContext</span><span class="o">&gt;</span><span class="w"> </span><span class="n">context_stack</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">PassContextThreadLocalEntry</span><span class="p">()</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="n">default_context</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">PassContext</span><span class="p">(</span><span class="n">make_node</span><span class="o">&lt;</span><span class="n">PassContextNode</span><span class="o">&gt;</span><span class="p">());</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+<span class="p">};</span><span class="w"></span>
+
+<span class="cm">/*! \brief The thread-local store to hold the pass context. */</span><span class="w"></span>
+<span class="k">typedef</span><span class="w"> </span><span class="n">dmlc</span><span class="o">::</span><span class="n">ThreadLocalStore</span><span class="o">&lt;</span><span class="n">PassContextThreadLocalEntry</span><span class="o">&gt;</span><span class="w"></span>
+<span class="w">     </span><span class="n">PassContextThreadLocalStore</span><span class="p">;</span><span class="w"></span>
 </pre></div>
 </div>
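<p>As a usage sketch, the <code class="docutils literal notranslate"><span class="pre">tvm::With</span></code> friend declaration above
gives <code class="docutils literal notranslate"><span class="pre">PassContext</span></code> an RAII-style scope in C++, analogous to
Python's <code class="docutils literal notranslate"><span class="pre">with</span></code> statement:</p>
<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span>// Sketch: run passes under a locally scoped PassContext.
PassContext ctx = PassContext::Create();
ctx-&gt;opt_level = 3;
{
  tvm::With&lt;PassContext&gt; scope(ctx);  // EnterWithScope() is called here
  // Passes executed in this block see PassContext::Current() == ctx.
}
// ExitWithScope() restores the previous context when `scope` is destroyed.
</pre></div>
</div>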
 </div>
@@ -465,11 +465,11 @@ different granularities of Relay/tir programs. A pure virtual class <code class=
 introduced to serve as the base of the different optimization passes. This class
 contains several virtual methods that must be implemented by the
 subclasses at the level of modules, functions, or sequences of passes.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">class</span> <span class="nc">PassNode</span> <span class="o">:</span> <span class="n">Object</span> <span class="p">{</span>
-  <span class="k">virtual</span> <span class="n">PassInfo</span> <span class="n">Info</span><span class="p">()</span> <span class="k">const</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-  <span class="k">virtual</span> <span class="n">Module</span> <span class="nf">operator</span><span class="p">()(</span><span class="k">const</span> <span class="n">IRModule</span><span class="o">&amp;</span> <span class="n">mod</span>
-                            <span class="k">const</span> <span class="n">PassContext</span><span class="o">&amp;</span> <span class="n">pass_ctx</span><span class="p">)</span> <span class="k">const</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-<span class="p">};</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">PassNode</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="n">Object</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="k">virtual</span><span class="w"> </span><span class="n">PassInfo</span><span class="w"> </span><span class="nf">Info</span><span class="p">()</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="k">virtual</span><span class="w"> </span><span class="n">Module</span><span class="w"> </span><span class="nf">operator</span><span class="p">()(</span><span class="k">const</span><span class="w"> </span><span class="n">IRModule</span><span class="o">&amp;</span><span class="w"> </span><span class="n">mod</span><span class="w"></span>
+<span class="w">                            </span><span class="k">const</span><span class="w"> </span><span class="n">PassContext</span><span class="o">&amp;</span><span class="w"> </span><span class="n">pass_ctx</span><span class="p">)</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"></span>
+<span class="p">};</span><span class="w"></span>
 </pre></div>
 </div>
 <p>The functor shows how a pass must be realized, i.e. it always works on a
@@ -491,12 +491,12 @@ typical passes in Relay that need the global picture of a module, such as
A-normal form conversion and lambda lifting, fall into this set. At this
level, users can even add and/or delete functions in a module. Note that all
passes</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">class</span> <span class="nc">ModulePassNode</span> <span class="o">:</span> <span class="n">PassNode</span> <span class="p">{</span>
-  <span class="n">PassInfo</span> <span class="n">pass_info</span><span class="p">;</span>
-  <span class="n">runtime</span><span class="o">::</span><span class="n">TypedPackedFunc</span><span class="o">&lt;</span><span class="n">Module</span><span class="p">(</span><span class="n">Module</span><span class="p">,</span> <span class="n">PassContext</span><span class="p">)</span><span class="o">&gt;</span> <span class="n">pass_func</span><span class="p">;</span>
-  <span class="n">Module</span> <span class="nf">operator</span><span class="p">()(</span><span class="k">const</span> <span class="n">Module</span><span class="o">&amp;</span> <span class="n">mod</span><span class="p">,</span> <span class="k">const</span> <span class="n">PassContext</span><span class="o">&amp;</span> <span class="n">pass_ctx</span><span class="p">)</span> <span class="k">const</span> <span class="k">final</span><span class="p">;</span>
-  <span class="c1">// Other members/methods are omitted</span>
-<span class="p">};</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">ModulePassNode</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="n">PassNode</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">PassInfo</span><span class="w"> </span><span class="n">pass_info</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">runtime</span><span class="o">::</span><span class="n">TypedPackedFunc</span><span class="o">&lt;</span><span class="n">Module</span><span class="p">(</span><span class="n">Module</span><span class="p">,</span><span class="w"> </span><span class="n">PassContext</span><span class="p">)</span><span class="o">&gt;</span><span class="w"> </span><span class="n">pass_func</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">Module</span><span class="w"> </span><span class="nf">operator</span><span class="p">()(</span><span class="k">const</span><span class="w"> </span><span class="n">Module</span><span class="o">&amp;</span><span class="w"> </span><span class="n">mod</span><span class="p">,</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">PassContext</span><span class="o">&amp;</span><span class="w"> </span><span class [...]
+<span class="w">  </span><span class="c1">// Other members/methods are omitted</span>
+<span class="p">};</span><span class="w"></span>
 </pre></div>
 </div>
 <p><code class="docutils literal notranslate"><span class="pre">pass_info</span></code> maintains the information needed by a module-level pass.
@@ -518,13 +518,13 @@ and flattening storage in tir, etc.</p>
 <p>Note that the scope of passes at this level is either a Relay function or a tir primitive function.
 Therefore, we cannot add or delete a function through these passes as they are not aware of
 the global information.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">class</span> <span class="nc">FunctionPassNode</span> <span class="o">:</span> <span class="n">PassNode</span> <span class="p">{</span>
-  <span class="n">PassInfo</span> <span class="n">pass_info</span><span class="p">;</span>
-  <span class="n">runtime</span><span class="o">::</span><span class="n">TypedPackedFunc</span><span class="o">&lt;</span><span class="n">Function</span><span class="p">(</span><span class="n">Function</span><span class="p">,</span> <span class="n">Module</span><span class="p">,</span> <span class="n">PassContext</span><span class="p">)</span><span class="o">&gt;</span> <span class="n">pass_func</span><span class="p">;</span>
-  <span class="n">Module</span> <span class="nf">operator</span><span class="p">()(</span><span class="k">const</span> <span class="n">Module</span><span class="o">&amp;</span> <span class="n">mod</span><span class="p">,</span> <span class="k">const</span> <span class="n">PassContext</span><span class="o">&amp;</span> <span class="n">pass_ctx</span><span class="p">)</span> <span class="k">const</span> <span class="k">final</span><span class="p">;</span>
-  <span class="kt">bool</span> <span class="nf">SkipFunction</span><span class="p">(</span><span class="k">const</span> <span class="n">Function</span><span class="o">&amp;</span> <span class="n">func</span><span class="p">)</span> <span class="k">const</span><span class="p">;</span>
-  <span class="c1">// Other members/methods are omitted...</span>
-<span class="p">};</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">FunctionPassNode</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="n">PassNode</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">PassInfo</span><span class="w"> </span><span class="n">pass_info</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">runtime</span><span class="o">::</span><span class="n">TypedPackedFunc</span><span class="o">&lt;</span><span class="n">Function</span><span class="p">(</span><span class="n">Function</span><span class="p">,</span><span class="w"> </span><span class="n">Module</span><span class="p">,</span><span class="w"> </span><span class="n">PassContext</span><span class="p">)</span><span class="o">&gt;</span><span class="w"> </span><span class="n">pass_func</ [...]
+<span class="w">  </span><span class="n">Module</span><span class="w"> </span><span class="nf">operator</span><span class="p">()(</span><span class="k">const</span><span class="w"> </span><span class="n">Module</span><span class="o">&amp;</span><span class="w"> </span><span class="n">mod</span><span class="p">,</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">PassContext</span><span class="o">&amp;</span><span class="w"> </span><span class [...]
+<span class="w">  </span><span class="kt">bool</span><span class="w"> </span><span class="nf">SkipFunction</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">Function</span><span class="o">&amp;</span><span class="w"> </span><span class="n">func</span><span class="p">)</span><span class="w"> </span><span class="k">const</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="c1">// Other members/methods are omitted...</span>
+<span class="p">};</span><span class="w"></span>
 </pre></div>
 </div>
 <p><code class="docutils literal notranslate"><span class="pre">pass_info</span></code> is identical to what we just described in the module pass.
@@ -536,13 +536,13 @@ may use it for reporting errors. A function could be annotated with
 <h4>Sequential Passes<a class="headerlink" href="#sequential-passes" title="Permalink to this headline">¶</a></h4>
 <p><code class="docutils literal notranslate"><span class="pre">SequentialPass</span></code> is similar to Pytorch <code class="docutils literal notranslate"><span class="pre">nn.Sequential</span></code> that contains a host
 of passes for execution.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">class</span> <span class="nc">SequentialPassNode</span> <span class="o">:</span> <span class="n">PassNode</span> <span class="p">{</span>
-  <span class="n">PassInfo</span> <span class="n">pass_info</span><span class="p">;</span>
-  <span class="c1">// Passes need to be executed.</span>
-  <span class="n">Array</span><span class="o">&lt;</span><span class="n">Pass</span><span class="o">&gt;</span> <span class="n">passes</span><span class="p">;</span>
-  <span class="kt">bool</span> <span class="nf">PassEnabled</span><span class="p">(</span><span class="k">const</span> <span class="n">PassInfo</span><span class="o">&amp;</span> <span class="n">info</span><span class="p">)</span> <span class="k">const</span><span class="p">;</span>
-  <span class="n">Module</span> <span class="nf">operator</span><span class="p">()(</span><span class="k">const</span> <span class="n">Module</span><span class="o">&amp;</span> <span class="n">mod</span><span class="p">,</span> <span class="k">const</span> <span class="n">PassContext</span><span class="o">&amp;</span> <span class="n">pass_ctx</span><span class="p">)</span> <span class="k">const</span> <span class="k">final</span><span class="p">;</span>
-<span class="p">};</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">SequentialPassNode</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="n">PassNode</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">PassInfo</span><span class="w"> </span><span class="n">pass_info</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="c1">// Passes need to be executed.</span>
+<span class="w">  </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Pass</span><span class="o">&gt;</span><span class="w"> </span><span class="n">passes</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="kt">bool</span><span class="w"> </span><span class="nf">PassEnabled</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">PassInfo</span><span class="o">&amp;</span><span class="w"> </span><span class="n">info</span><span class="p">)</span><span class="w"> </span><span class="k">const</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">Module</span><span class="w"> </span><span class="nf">operator</span><span class="p">()(</span><span class="k">const</span><span class="w"> </span><span class="n">Module</span><span class="o">&amp;</span><span class="w"> </span><span class="n">mod</span><span class="p">,</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">PassContext</span><span class="o">&amp;</span><span class="w"> </span><span class [...]
+<span class="p">};</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Only a few passes currently in Relay are put in this group. For example,
@@ -553,22 +553,22 @@ recommended to be fulfilled first. This pass, hence, is an ideal candidate for
<p>The following code shows how the individual passes in a sequential pass are invoked.
Essentially, each pass is executed in the order in which it was appended to
the pass list.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">Module</span> <span class="n">SequentialNode</span><span class="o">::</span><span class="k">operator</span><span class="p">()(</span><span class="k">const</span> <span class="n">Module</span><span class="o">&amp;</span> <span class="n">module</span><span class="p">,</span>
-                                  <span class="k">const</span> <span class="n">PassContext</span><span class="o">&amp;</span> <span class="n">pass_ctx</span><span class="p">)</span> <span class="k">const</span> <span class="p">{</span>
-  <span class="n">Module</span> <span class="n">mod</span> <span class="o">=</span> <span class="n">module</span><span class="p">;</span>
-  <span class="k">for</span> <span class="p">(</span><span class="k">const</span> <span class="n">Pass</span><span class="o">&amp;</span> <span class="nl">pass</span> <span class="p">:</span> <span class="n">passes</span><span class="p">)</span> <span class="p">{</span>
-    <span class="n">ICHECK</span><span class="p">(</span><span class="n">pass</span><span class="p">.</span><span class="n">defined</span><span class="p">())</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;Found undefined pass for optimization.&quot;</span><span class="p">;</span>
-    <span class="k">const</span> <span class="n">PassInfo</span><span class="o">&amp;</span> <span class="n">pass_info</span> <span class="o">=</span> <span class="n">pass</span><span class="o">-&gt;</span><span class="n">Info</span><span class="p">();</span>
-    <span class="k">if</span> <span class="p">(</span><span class="o">!</span><span class="n">PassEnabled</span><span class="p">(</span><span class="n">pass_info</span><span class="p">))</span>  <span class="k">continue</span><span class="p">;</span>
-    <span class="k">for</span> <span class="p">(</span><span class="k">const</span> <span class="k">auto</span><span class="o">&amp;</span> <span class="nl">it</span> <span class="p">:</span> <span class="n">pass_info</span><span class="o">-&gt;</span><span class="n">required</span><span class="p">)</span> <span class="p">{</span>
-      <span class="k">const</span> <span class="k">auto</span><span class="o">*</span> <span class="n">name</span> <span class="o">=</span> <span class="n">it</span><span class="p">.</span><span class="n">as</span><span class="o">&lt;</span><span class="n">tvm</span><span class="o">::</span><span class="n">ir</span><span class="o">::</span><span class="n">StringImm</span><span class="o">&gt;</span><span class="p">();</span>
-      <span class="n">ICHECK</span><span class="p">(</span><span class="n">name</span><span class="p">);</span>
-      <span class="n">mod</span> <span class="o">=</span> <span class="n">GetPass</span><span class="p">(</span><span class="n">name</span><span class="o">-&gt;</span><span class="n">value</span><span class="p">)(</span><span class="n">mod</span><span class="p">,</span> <span class="n">pass_ctx</span><span class="p">);</span>
-    <span class="p">}</span>
-    <span class="n">mod</span> <span class="o">=</span> <span class="n">pass</span><span class="p">(</span><span class="n">mod</span><span class="p">,</span> <span class="n">pass_ctx</span><span class="p">);</span>
-  <span class="p">}</span>
-  <span class="k">return</span> <span class="n">mod</span><span class="p">;</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">Module</span><span class="w"> </span><span class="nf">SequentialNode::operator</span><span class="p">()(</span><span class="k">const</span><span class="w"> </span><span class="n">Module</span><span class="o">&amp;</span><span class="w"> </span><span class="k">module</span><span class="p">,</span><span class="w"></span>
+<span class="w">                                  </span><span class="k">const</span><span class="w"> </span><span class="n">PassContext</span><span class="o">&amp;</span><span class="w"> </span><span class="n">pass_ctx</span><span class="p">)</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">Module</span><span class="w"> </span><span class="n">mod</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">module</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">Pass</span><span class="o">&amp;</span><span class="w"> </span><span class="n">pass</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="n">passes</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="n">ICHECK</span><span class="p">(</span><span class="n">pass</span><span class="p">.</span><span class="n">defined</span><span class="p">())</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;Found undefined pass for optimization.&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="k">const</span><span class="w"> </span><span class="n">PassInfo</span><span class="o">&amp;</span><span class="w"> </span><span class="n">pass_info</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">pass</span><span class="o">-&gt;</span><span class="n">Info</span><span class="p">();</span><span class="w"></span>
+<span class="w">    </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="o">!</span><span class="n">PassEnabled</span><span class="p">(</span><span class="n">pass_info</span><span class="p">))</span><span class="w">  </span><span class="k">continue</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="k">auto</span><span class="o">&amp;</span><span class="w"> </span><span class="n">it</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="n">pass_info</span><span class="o">-&gt;</span><span class="n">required</span><span class="p">)</span><span class="w"> </span><span class="p">{</span> [...]
+<span class="w">      </span><span class="k">const</span><span class="w"> </span><span class="k">auto</span><span class="o">*</span><span class="w"> </span><span class="n">name</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">it</span><span class="p">.</span><span class="n">as</span><span class="o">&lt;</span><span class="n">tvm</span><span class="o">::</span><span class="n">ir</span><span class="o">::</span><span class="n">StringImm</span><sp [...]
+<span class="w">      </span><span class="n">ICHECK</span><span class="p">(</span><span class="n">name</span><span class="p">);</span><span class="w"></span>
+<span class="w">      </span><span class="n">mod</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">GetPass</span><span class="p">(</span><span class="n">name</span><span class="o">-&gt;</span><span class="n">value</span><span class="p">)(</span><span class="n">mod</span><span class="p">,</span><span class="w"> </span><span class="n">pass_ctx</span><span class="p">);</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+<span class="w">    </span><span class="n">mod</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">pass</span><span class="p">(</span><span class="n">mod</span><span class="p">,</span><span class="w"> </span><span class="n">pass_ctx</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+<span class="w">  </span><span class="k">return</span><span class="w"> </span><span class="n">mod</span><span class="p">;</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Upon the invocation of a pass, we first check if this pass is enabled. This is
@@ -580,38 +580,38 @@ level is not less than the configured optimization level in the pass context.</p
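<p>A minimal sketch of such an enabling check, with
<code class="docutils literal notranslate"><span class="pre">IsListed</span></code> as a hypothetical membership helper (the real
<code class="docutils literal notranslate"><span class="pre">PassEnabled</span></code> implementation lives in the pass infra sources):</p>
<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span>// Hypothetical sketch of the enabling check described above.
bool PassEnabled(const PassInfo&amp; info, const PassContext&amp; ctx) {
  // Explicitly disabled passes never run.
  if (IsListed(ctx-&gt;disabled_pass, info-&gt;name)) return false;
  // Explicitly required passes always run.
  if (IsListed(ctx-&gt;required_pass, info-&gt;name)) return true;
  // Otherwise, run only if the pass's opt_level does not exceed the
  // optimization level configured in the pass context.
  return info-&gt;opt_level &lt;= ctx-&gt;opt_level;
}
</pre></div>
</div>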
<p>To execute the pass, we first need to retrieve the registered pass from the TVM
packed function registry using the pass name. This is possible because every
pass is registered with an API endpoint, as we will show later.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">Pass</span> <span class="nf">GetPass</span><span class="p">(</span><span class="k">const</span> <span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span> <span class="n">pass_name</span><span class="p">)</span> <span class="p">{</span>
-  <span class="k">using</span> <span class="n">tvm</span><span class="o">::</span><span class="n">runtime</span><span class="o">::</span><span class="n">Registry</span><span class="p">;</span>
-  <span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">fpass_name</span> <span class="o">=</span> <span class="s">&quot;relay._transform.&quot;</span> <span class="o">+</span> <span class="n">pass_name</span><span class="p">;</span>
-  <span class="k">const</span> <span class="k">auto</span><span class="o">*</span> <span class="n">f</span> <span class="o">=</span> <span class="n">Registry</span><span class="o">::</span><span class="n">Get</span><span class="p">(</span><span class="n">fpass_name</span><span class="p">);</span>
-  <span class="n">ICHECK</span><span class="p">(</span><span class="n">f</span> <span class="o">!=</span> <span class="k">nullptr</span><span class="p">)</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;Cannot find &quot;</span> <span class="o">&lt;&lt;</span> <span class="n">fpass_name</span>
-                      <span class="o">&lt;&lt;</span> <span class="s">&quot;to create the pass &quot;</span> <span class="o">&lt;&lt;</span> <span class="n">pass_name</span><span class="p">;</span>
-  <span class="k">return</span> <span class="p">(</span><span class="o">*</span><span class="n">f</span><span class="p">)();</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">Pass</span><span class="w"> </span><span class="nf">GetPass</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span><span class="w"> </span><span class="n">pass_name</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="k">using</span><span class="w"> </span><span class="n">tvm</span><span class="o">::</span><span class="n">runtime</span><span class="o">::</span><span class="n">Registry</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">fpass_name</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;relay._transform.&quot;</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">pass_name</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="k">const</span><span class="w"> </span><span class="k">auto</span><span class="o">*</span><span class="w"> </span><span class="n">f</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Registry</span><span class="o">::</span><span class="n">Get</span><span class="p">(</span><span class="n">fpass_name</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="n">ICHECK</span><span class="p">(</span><span class="n">f</span><span class="w"> </span><span class="o">!=</span><span class="w"> </span><span class="k">nullptr</span><span class="p">)</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;Cannot find &quot;</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">fpass_name</span><span class="w"></span>
+<span class="w">                      </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;to create the pass &quot;</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">pass_name</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="k">return</span><span class="w"> </span><span class="p">(</span><span class="o">*</span><span class="n">f</span><span class="p">)();</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
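<p>For example (a sketch grounded in the <code class="docutils literal notranslate"><span class="pre">SequentialNode</span></code> loop
above), a registered pass can be fetched by name and applied to a module:</p>
<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span>// Fetch the registered FoldConstant pass and run it on a module.
Pass fold_constant = GetPass(&quot;FoldConstant&quot;);
mod = fold_constant(mod, pass_ctx);
</pre></div>
</div>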
<p>Some helper functions are provided to create each of the aforementioned pass
types. These helpers are also exposed to the Python frontend so that users can
conveniently create a specific pass object from Python.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">Pass</span> <span class="nf">CreateFunctionPass</span><span class="p">(</span>
-    <span class="k">const</span> <span class="n">runtime</span><span class="o">::</span><span class="n">TypedPackedFunc</span><span class="o">&lt;</span><span class="n">Function</span><span class="p">(</span><span class="n">Function</span><span class="p">,</span> <span class="n">IRModule</span><span class="p">,</span> <span class="n">PassContext</span><span class="p">)</span><span class="o">&gt;&amp;</span> <span class="n">pass_func</span><span class="p">,</span>
-    <span class="kt">int</span> <span class="n">opt_level</span><span class="p">,</span>
-    <span class="n">String</span> <span class="n">name</span><span class="p">,</span>
-    <span class="n">Array</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">required</span><span class="p">);</span>
-
-<span class="n">Pass</span> <span class="nf">CreatePrimFuncPass</span><span class="p">(</span>
-    <span class="k">const</span> <span class="n">runtime</span><span class="o">::</span><span class="n">TypedPackedFunc</span><span class="o">&lt;</span><span class="n">PrimFunc</span><span class="p">(</span><span class="n">PrimFunc</span><span class="p">,</span> <span class="n">IRModule</span><span class="p">,</span> <span class="n">PassContext</span><span class="p">)</span><span class="o">&gt;&amp;</span> <span class="n">pass_func</span><span class="p">,</span>
-    <span class="kt">int</span> <span class="n">opt_level</span><span class="p">,</span>
-    <span class="n">String</span> <span class="n">name</span><span class="p">,</span>
-    <span class="n">Array</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">required</span><span class="p">);</span>
-
-<span class="n">Pass</span> <span class="nf">CreateModulePass</span><span class="p">(</span>
-    <span class="k">const</span> <span class="n">runtime</span><span class="o">::</span><span class="n">TypedPackedFunc</span><span class="o">&lt;</span><span class="n">IRModule</span><span class="p">(</span><span class="n">IRModule</span><span class="p">,</span> <span class="n">PassContext</span><span class="p">)</span><span class="o">&gt;&amp;</span> <span class="n">pass_func</span><span class="p">,</span>
-    <span class="kt">int</span> <span class="n">opt_level</span><span class="p">,</span>
-    <span class="n">String</span> <span class="n">name</span><span class="p">,</span>
-    <span class="n">Array</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">required</span><span class="p">);</span>
-
-<span class="n">Pass</span> <span class="nf">Sequential</span><span class="p">(</span><span class="n">tvm</span><span class="o">::</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Pass</span><span class="o">&gt;</span> <span class="n">passes</span><span class="p">,</span> <span class="n">PassInfo</span> <span class="n">pass_info</span><span class="p">);</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">Pass</span><span class="w"> </span><span class="nf">CreateFunctionPass</span><span class="p">(</span><span class="w"></span>
+<span class="w">    </span><span class="k">const</span><span class="w"> </span><span class="n">runtime</span><span class="o">::</span><span class="n">TypedPackedFunc</span><span class="o">&lt;</span><span class="n">Function</span><span class="p">(</span><span class="n">Function</span><span class="p">,</span><span class="w"> </span><span class="n">IRModule</span><span class="p">,</span><span class="w"> </span><span class="n">PassContext</span><span class="p">)</span><span class="o">&gt;&a [...]
+<span class="w">    </span><span class="kt">int</span><span class="w"> </span><span class="n">opt_level</span><span class="p">,</span><span class="w"></span>
+<span class="w">    </span><span class="n">String</span><span class="w"> </span><span class="n">name</span><span class="p">,</span><span class="w"></span>
+<span class="w">    </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span><span class="w"> </span><span class="n">required</span><span class="p">);</span><span class="w"></span>
+
+<span class="n">Pass</span><span class="w"> </span><span class="nf">CreatePrimFuncPass</span><span class="p">(</span><span class="w"></span>
+<span class="w">    </span><span class="k">const</span><span class="w"> </span><span class="n">runtime</span><span class="o">::</span><span class="n">TypedPackedFunc</span><span class="o">&lt;</span><span class="n">PrimFunc</span><span class="p">(</span><span class="n">PrimFunc</span><span class="p">,</span><span class="w"> </span><span class="n">IRModule</span><span class="p">,</span><span class="w"> </span><span class="n">PassContext</span><span class="p">)</span><span class="o">&gt;&a [...]
+<span class="w">    </span><span class="kt">int</span><span class="w"> </span><span class="n">opt_level</span><span class="p">,</span><span class="w"></span>
+<span class="w">    </span><span class="n">String</span><span class="w"> </span><span class="n">name</span><span class="p">,</span><span class="w"></span>
+<span class="w">    </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span><span class="w"> </span><span class="n">required</span><span class="p">);</span><span class="w"></span>
+
+<span class="n">Pass</span><span class="w"> </span><span class="nf">CreateModulePass</span><span class="p">(</span><span class="w"></span>
+<span class="w">    </span><span class="k">const</span><span class="w"> </span><span class="n">runtime</span><span class="o">::</span><span class="n">TypedPackedFunc</span><span class="o">&lt;</span><span class="n">IRModule</span><span class="p">(</span><span class="n">IRModule</span><span class="p">,</span><span class="w"> </span><span class="n">PassContext</span><span class="p">)</span><span class="o">&gt;&amp;</span><span class="w"> </span><span class="n">pass_func</span><span class=" [...]
+<span class="w">    </span><span class="kt">int</span><span class="w"> </span><span class="n">opt_level</span><span class="p">,</span><span class="w"></span>
+<span class="w">    </span><span class="n">String</span><span class="w"> </span><span class="n">name</span><span class="p">,</span><span class="w"></span>
+<span class="w">    </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span><span class="w"> </span><span class="n">required</span><span class="p">);</span><span class="w"></span>
+
+<span class="n">Pass</span><span class="w"> </span><span class="nf">Sequential</span><span class="p">(</span><span class="n">tvm</span><span class="o">::</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Pass</span><span class="o">&gt;</span><span class="w"> </span><span class="n">passes</span><span class="p">,</span><span class="w"> </span><span class="n">PassInfo</span><span class="w"> </span><span class="n">pass_info</span><span class="p">);</span><span clas [...]
 </pre></div>
 </div>
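<p>For instance, here is a hedged sketch (not a pass shipped with TVM) of an
identity module pass built through one of these helpers:</p>
<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span>// Hypothetical identity pass created with the CreateModulePass helper.
runtime::TypedPackedFunc&lt;IRModule(IRModule, PassContext)&gt; pass_func =
    [](IRModule mod, PassContext ctx) { return mod; };
Pass identity = CreateModulePass(pass_func, /*opt_level=*/0,
                                 /*name=*/&quot;Identity&quot;, /*required=*/{});
</pre></div>
</div>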
 </div>
@@ -623,7 +623,7 @@ a pass.  Let’s take const folding as an example. This pass has already been
 implemented to fold constants in a Relay function (found in
 <a class="reference external" href="https://github.com/apache/tvm/blob/main/src/relay/transforms/fold_constant.cc">src/relay/transforms/fold_constant.cc</a>).</p>
 <p>An API is provided to perform the <code class="docutils literal notranslate"><span class="pre">Expr</span></code> to <code class="docutils literal notranslate"><span class="pre">Expr</span></code> transformation.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span> <span class="nf">FoldConstant</span><span class="p">(</span><span class="k">const</span> <span class="n">Expr</span><span class="o">&amp;</span> <span class="n">expr</span><span class="p">);</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span><span class="w"> </span><span class="nf">FoldConstant</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">Expr</span><span class="o">&amp;</span><span class="w"> </span><span class="n">expr</span><span class="p">);</span><span class="w"></span>
 </pre></div>
 </div>
 <p>In order to register this pass to the pass infra, we first need to decide at
@@ -637,25 +637,25 @@ developer has to identify and list them.</p>
 <code class="docutils literal notranslate"><span class="pre">relay._transform.FoldConstant</span></code>. This pass, therefore, becomes an entry in the
 registry that can be accessed by both C++ (e.g. the <code class="docutils literal notranslate"><span class="pre">GetPass</span></code> above) and
 Python when needed.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">namespace</span> <span class="n">transform</span> <span class="p">{</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">namespace</span><span class="w"> </span><span class="nn">transform</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
 
-<span class="n">Pass</span> <span class="n">FoldConstant</span><span class="p">()</span> <span class="p">{</span>
-  <span class="n">runtime</span><span class="o">::</span><span class="n">TypedPackedFunc</span><span class="o">&lt;</span><span class="n">Function</span><span class="p">(</span><span class="n">Function</span><span class="p">,</span> <span class="n">IRModule</span><span class="p">,</span> <span class="n">PassContext</span><span class="p">)</span><span class="o">&gt;</span> <span class="n">pass_func</span> <span class="o">=</span>
-    <span class="p">[</span><span class="o">=</span><span class="p">](</span><span class="n">Function</span> <span class="n">f</span><span class="p">,</span> <span class="n">IRModule</span> <span class="n">m</span><span class="p">,</span> <span class="n">PassContext</span> <span class="n">pc</span><span class="p">)</span> <span class="p">{</span>
-      <span class="k">return</span> <span class="n">Downcast</span><span class="o">&lt;</span><span class="n">Function</span><span class="o">&gt;</span><span class="p">(</span><span class="n">FoldConstant</span><span class="p">(</span><span class="n">f</span><span class="p">));</span>
-  <span class="p">};</span>
-  <span class="k">return</span> <span class="nf">CreateFunctionPass</span><span class="p">(</span><span class="n">pass_func</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="s">&quot;FoldConstant&quot;</span><span class="p">,</span> <span class="p">{});</span>
-<span class="p">}</span>
+<span class="n">Pass</span><span class="w"> </span><span class="nf">FoldConstant</span><span class="p">()</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">runtime</span><span class="o">::</span><span class="n">TypedPackedFunc</span><span class="o">&lt;</span><span class="n">Function</span><span class="p">(</span><span class="n">Function</span><span class="p">,</span><span class="w"> </span><span class="n">IRModule</span><span class="p">,</span><span class="w"> </span><span class="n">PassContext</span><span class="p">)</span><span class="o">&gt;</span><span class="w"> </span><span class="n">pass_func [...]
+<span class="w">    </span><span class="p">[</span><span class="o">=</span><span class="p">](</span><span class="n">Function</span><span class="w"> </span><span class="n">f</span><span class="p">,</span><span class="w"> </span><span class="n">IRModule</span><span class="w"> </span><span class="n">m</span><span class="p">,</span><span class="w"> </span><span class="n">PassContext</span><span class="w"> </span><span class="n">pc</span><span class="p">)</span><span class="w"> </span><span c [...]
+<span class="w">      </span><span class="k">return</span><span class="w"> </span><span class="n">Downcast</span><span class="o">&lt;</span><span class="n">Function</span><span class="o">&gt;</span><span class="p">(</span><span class="n">FoldConstant</span><span class="p">(</span><span class="n">f</span><span class="p">));</span><span class="w"></span>
+<span class="w">  </span><span class="p">};</span><span class="w"></span>
+<span class="w">  </span><span class="k">return</span><span class="w"> </span><span class="n">CreateFunctionPass</span><span class="p">(</span><span class="n">pass_func</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;FoldConstant&quot;</span><span class="p">,</span><span class="w"> </span><span class="p">{});</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 
-<span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;relay._transform.FoldConstant&quot;</span><span class="p">)</span>
-<span class="p">.</span><span class="n">set_body_typed</span><span class="p">(</span><span class="n">FoldConstant</span><span class="p">);</span>
+<span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;relay._transform.FoldConstant&quot;</span><span class="p">)</span><span class="w"></span>
+<span class="p">.</span><span class="n">set_body_typed</span><span class="p">(</span><span class="n">FoldConstant</span><span class="p">);</span><span class="w"></span>
 
-<span class="p">}</span>  <span class="c1">// namespace transform</span>
+<span class="p">}</span><span class="w">  </span><span class="c1">// namespace transform</span>
 </pre></div>
 </div>
 <p>To allow other C++ modules to apply this pass, we declare a free function in
 <a class="reference external" href="https://github.com/apache/tvm/blob/main/include/tvm/relay/transform.h">include/tvm/relay/transform.h</a> as the following:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_DLL</span> <span class="n">Pass</span> <span class="nf">FoldConstant</span><span class="p">();</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_DLL</span><span class="w"> </span><span class="n">Pass</span><span class="w"> </span><span class="n">FoldConstant</span><span class="p">();</span><span class="w"></span>
 </pre></div>
 </div>
 </div>
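 <p>Because the pass is registered globally, it is also reachable from Python. A minimal hedged sketch (assuming the stock Relay API):</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import tvm
 from tvm import relay
 
 # build a function whose body contains a foldable constant subexpression
 x = relay.var("x", shape=(10,), dtype="float32")
 fn = relay.Function([x], relay.add(x, relay.add(relay.const(1.0), relay.const(2.0))))
 mod = tvm.IRModule.from_expr(fn)
 
 # FoldConstant() returns a Pass; calling it on an IRModule applies it
 mod = relay.transform.FoldConstant()(mod)
 </pre></div>
 </div>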
@@ -665,10 +665,10 @@ Python when needed.</p>
 we can use the infrastructure to know how much time and memory a pass requires
 or how a pass can transform the IR module.</p>
 <p>We introduce four instrument points in the life-cycle of <code class="docutils literal notranslate"><span class="pre">PassContext</span></code>.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_DLL</span> <span class="kt">void</span> <span class="nf">InstrumentEnterPassContext</span><span class="p">();</span>
-<span class="n">TVM_DLL</span> <span class="kt">void</span> <span class="nf">InstrumentExitPassContext</span><span class="p">();</span>
-<span class="n">TVM_DLL</span> <span class="kt">bool</span> <span class="nf">InstrumentBeforePass</span><span class="p">(</span><span class="k">const</span> <span class="n">IRModule</span><span class="o">&amp;</span> <span class="n">mod</span><span class="p">,</span> <span class="k">const</span> <span class="n">PassInfo</span><span class="o">&amp;</span> <span class="n">info</span><span class="p">)</span> <span class="k">const</span><span class="p">;</span>
-<span class="n">TVM_DLL</span> <span class="kt">void</span> <span class="nf">InstrumentAfterPass</span><span class="p">(</span><span class="k">const</span> <span class="n">IRModule</span><span class="o">&amp;</span> <span class="n">mod</span><span class="p">,</span> <span class="k">const</span> <span class="n">PassInfo</span><span class="o">&amp;</span> <span class="n">info</span><span class="p">)</span> <span class="k">const</span><span class="p">;</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_DLL</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">InstrumentEnterPassContext</span><span class="p">();</span><span class="w"></span>
+<span class="n">TVM_DLL</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">InstrumentExitPassContext</span><span class="p">();</span><span class="w"></span>
+<span class="n">TVM_DLL</span><span class="w"> </span><span class="kt">bool</span><span class="w"> </span><span class="n">InstrumentBeforePass</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">IRModule</span><span class="o">&amp;</span><span class="w"> </span><span class="n">mod</span><span class="p">,</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">PassInfo</span><span class="o">&amp;</span [...]
+<span class="n">TVM_DLL</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">InstrumentAfterPass</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">IRModule</span><span class="o">&amp;</span><span class="w"> </span><span class="n">mod</span><span class="p">,</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">PassInfo</span><span class="o">&amp;</span> [...]
 </pre></div>
 </div>
 <p><code class="docutils literal notranslate"><span class="pre">InstrumentEnterPassContext</span></code> is called immediately when entering the scope
@@ -679,11 +679,11 @@ This method is also called when instruments are being overridden by <code class="d
 See <a class="reference internal" href="#pass-instrument-overriden"><span class="std std-ref">Override Instruments in Current PassContext</span></a>.</p>
 <p><code class="docutils literal notranslate"><span class="pre">InstrumentBeforePass</span></code> is called before execution.
 <code class="docutils literal notranslate"><span class="pre">InstrumentAfterPass</span></code> is called after execution if the pass should be run. The behavior is like:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">if</span> <span class="p">(</span><span class="n">pass_ctx</span><span class="p">.</span><span class="n">InstrumentBeforePass</span><span class="p">(</span><span class="n">ir_module</span><span class="p">,</span> <span class="n">pass_info</span><span class="p">))</span> <span class="p">{</span>
-  <span class="n">new_ir_module</span> <span class="o">=</span> <span class="n">run_pass</span><span class="p">(</span><span class="n">ir_module</span><span class="p">,</span> <span class="n">pass_ctx</span><span class="p">);</span>
-  <span class="n">pass_ctx</span><span class="p">.</span><span class="n">InstrumentAfterPass</span><span class="p">(</span><span class="n">new_ir_module</span><span class="p">,</span> <span class="n">pass_info</span><span class="p">);</span>
-  <span class="k">return</span> <span class="n">new_ir_module</span><span class="p">;</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">pass_ctx</span><span class="p">.</span><span class="n">InstrumentBeforePass</span><span class="p">(</span><span class="n">ir_module</span><span class="p">,</span><span class="w"> </span><span class="n">pass_info</span><span class="p">))</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">new_ir_module</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">run_pass</span><span class="p">(</span><span class="n">ir_module</span><span class="p">,</span><span class="w"> </span><span class="n">pass_ctx</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="n">pass_ctx</span><span class="p">.</span><span class="n">InstrumentAfterPass</span><span class="p">(</span><span class="n">new_ir_module</span><span class="p">,</span><span class="w"> </span><span class="n">pass_info</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="k">return</span><span class="w"> </span><span class="n">new_ir_module</span><span class="p">;</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>The <code class="docutils literal notranslate"><span class="pre">PassInstrument</span></code> interface allows you to run arbitrary code inside the above four methods.
@@ -691,25 +691,25 @@ Multiple <code class="docutils literal notranslate"><span class="pre">PassInstru
 <code class="docutils literal notranslate"><span class="pre">PassContext</span></code>. <code class="docutils literal notranslate"><span class="pre">PassInstrument</span></code> instances are called sequentially in the order of
 <code class="docutils literal notranslate"><span class="pre">instruments</span></code> argument passed to <code class="docutils literal notranslate"><span class="pre">PassContext</span></code>.</p>
 <p><code class="docutils literal notranslate"><span class="pre">PassInstrument</span></code> provides following interfaces:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">namespace</span> <span class="n">instrument</span> <span class="p">{</span>
-
-<span class="k">class</span> <span class="nc">PassInstrumentNode</span> <span class="o">:</span> <span class="k">public</span> <span class="n">Object</span> <span class="p">{</span>
- <span class="k">public</span><span class="o">:</span>
-  <span class="n">String</span> <span class="n">name</span><span class="p">;</span>
-  <span class="k">virtual</span> <span class="kt">void</span> <span class="nf">EnterPassContext</span><span class="p">()</span> <span class="k">const</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-  <span class="k">virtual</span> <span class="kt">void</span> <span class="nf">ExitPassContext</span><span class="p">()</span> <span class="k">const</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-  <span class="k">virtual</span> <span class="kt">bool</span> <span class="nf">ShouldRun</span><span class="p">(</span><span class="k">const</span> <span class="n">IRModule</span><span class="o">&amp;</span> <span class="n">mod</span><span class="p">,</span> <span class="k">const</span> <span class="n">transform</span><span class="o">::</span><span class="n">PassInfo</span><span class="o">&amp;</span> <span class="n">info</span><span class="p">)</span> <span class="k">const</span> <span  [...]
-  <span class="k">virtual</span> <span class="kt">void</span> <span class="nf">RunBeforePass</span><span class="p">(</span><span class="k">const</span> <span class="n">IRModule</span><span class="o">&amp;</span> <span class="n">mod</span><span class="p">,</span> <span class="k">const</span> <span class="n">transform</span><span class="o">::</span><span class="n">PassInfo</span><span class="o">&amp;</span> <span class="n">info</span><span class="p">)</span> <span class="k">const</span> <s [...]
-  <span class="k">virtual</span> <span class="kt">void</span> <span class="nf">RunAfterPass</span><span class="p">(</span><span class="k">const</span> <span class="n">IRModule</span><span class="o">&amp;</span> <span class="n">mod</span><span class="p">,</span> <span class="k">const</span> <span class="n">transform</span><span class="o">::</span><span class="n">PassInfo</span><span class="o">&amp;</span> <span class="n">info</span><span class="p">)</span> <span class="k">const</span> <sp [...]
-  <span class="cm">/* Other fields are omitted. */</span>
-<span class="p">};</span>
-
-<span class="k">class</span> <span class="nc">PassInstrument</span> <span class="o">:</span> <span class="k">public</span> <span class="n">ObjectRef</span> <span class="p">{</span>
- <span class="k">public</span><span class="o">:</span>
-  <span class="n">TVM_DEFINE_OBJECT_REF_METHODS</span><span class="p">(</span><span class="n">PassInstrument</span><span class="p">,</span> <span class="n">ObjectRef</span><span class="p">,</span> <span class="n">PassInstrumentNode</span><span class="p">);</span>
-<span class="p">};</span>
-
-<span class="p">}</span>  <span class="c1">// namespace instrument</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">namespace</span><span class="w"> </span><span class="nn">instrument</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+
+<span class="k">class</span><span class="w"> </span><span class="nc">PassInstrumentNode</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">Object</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w"> </span><span class="k">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">  </span><span class="n">String</span><span class="w"> </span><span class="n">name</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="k">virtual</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="nf">EnterPassContext</span><span class="p">()</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="k">virtual</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="nf">ExitPassContext</span><span class="p">()</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="k">virtual</span><span class="w"> </span><span class="kt">bool</span><span class="w"> </span><span class="nf">ShouldRun</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">IRModule</span><span class="o">&amp;</span><span class="w"> </span><span class="n">mod</span><span class="p">,</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">transform</span><span class [...]
+<span class="w">  </span><span class="k">virtual</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="nf">RunBeforePass</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">IRModule</span><span class="o">&amp;</span><span class="w"> </span><span class="n">mod</span><span class="p">,</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">transform</span><span c [...]
+<span class="w">  </span><span class="k">virtual</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="nf">RunAfterPass</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">IRModule</span><span class="o">&amp;</span><span class="w"> </span><span class="n">mod</span><span class="p">,</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">transform</span><span cl [...]
+<span class="w">  </span><span class="cm">/* Other fields are omitted. */</span><span class="w"></span>
+<span class="p">};</span><span class="w"></span>
+
+<span class="k">class</span><span class="w"> </span><span class="nc">PassInstrument</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">ObjectRef</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w"> </span><span class="k">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">  </span><span class="n">TVM_DEFINE_OBJECT_REF_METHODS</span><span class="p">(</span><span class="n">PassInstrument</span><span class="p">,</span><span class="w"> </span><span class="n">ObjectRef</span><span class="p">,</span><span class="w"> </span><span class="n">PassInstrumentNode</span><span class="p">);</span><span class="w"></span>
+<span class="p">};</span><span class="w"></span>
+
+<span class="p">}</span><span class="w">  </span><span class="c1">// namespace instrument</span>
 </pre></div>
 </div>
 <p>A Python frontend is provided to implement <code class="docutils literal notranslate"><span class="pre">PassInstrument</span></code> quickly. See <a class="reference internal" href="#pass-instrument-py-frontend"><span class="std std-ref">Pass Instrument</span></a>.</p>
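 <p>As a hedged sketch of that frontend (assuming the documented <code class="docutils literal notranslate"><span class="pre">tvm.ir.instrument.pass_instrument</span></code> decorator), a custom instrument that records the passes it sees might look like:</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import tvm
 from tvm.ir.instrument import pass_instrument
 
 @pass_instrument
 class PassTracer:
     def __init__(self):
         self.names = []
 
     def run_before_pass(self, mod, info):
         # record every pass that is about to run
         self.names.append(info.name)
 
 tracer = PassTracer()
 with tvm.transform.PassContext(opt_level=3, instruments=[tracer]):
     pass  # passes run inside this scope are recorded in tracer.names
 </pre></div>
 </div>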
@@ -810,7 +810,7 @@ a Pass object.</p>
 <code class="docutils literal notranslate"><span class="pre">with</span></code> syntax by overriding <code class="docutils literal notranslate"><span class="pre">__enter__</span></code> and <code class="docutils literal notranslate"><span class="pre">__exit__</span></code>. A <code class="docutils literal notranslate"><span class="pre">current</span></code>
 static method is offered for users to get the context that is in use under
 a certain scope.</p>
-<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@tvm._ffi.register_object</span><span class="p">(</span><span class="s2">&quot;transform.PassContext&quot;</span><span class="p">)</span>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@tvm</span><span class="o">.</span><span class="n">_ffi</span><span class="o">.</span><span class="n">register_object</span><span class="p">(</span><span class="s2">&quot;transform.PassContext&quot;</span><span class="p">)</span>
 <span class="k">class</span> <span class="nc">PassContext</span><span class="p">(</span><span class="n">tvm</span><span class="o">.</span><span class="n">runtime</span><span class="o">.</span><span class="n">Object</span><span class="p">):</span>
     <span class="k">def</span> <span class="fm">__enter__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
         <span class="n">_transform</span><span class="o">.</span><span class="n">EnterPassContext</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
@@ -832,7 +832,7 @@ as fallback device info and step/depth for loop unrolling, etc. In order to
 enable fetching the required config, the key must be registered through
 <code class="docutils literal notranslate"><span class="pre">TVM_REGISTER_PASS_CONFIG_OPTION</span></code>. For example, the following is used by the
 loop unrolling pass</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_REGISTER_PASS_CONFIG_OPTION</span><span class="p">(</span><span class="s">&quot;tir.UnrollLoop&quot;</span><span class="p">,</span> <span class="n">UnrollLoopConfig</span><span class="p">);</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_REGISTER_PASS_CONFIG_OPTION</span><span class="p">(</span><span class="s">&quot;tir.UnrollLoop&quot;</span><span class="p">,</span><span class="w"> </span><span class="n">UnrollLoopConfig</span><span class="p">);</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Please refer to <a class="reference external" href="https://github.com/apache/tvm/blob/main/src/tir/transforms/unroll_loop.cc">src/tir/transforms/unroll_loop.cc</a> for more details.</p>
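 <p>Once registered, the key can be supplied from Python through <code class="docutils literal notranslate"><span class="pre">PassContext</span></code>. A hedged sketch (the <code class="docutils literal notranslate"><span class="pre">auto_max_step</span></code> option name is an assumption drawn from <code class="docutils literal notranslate"><span class="pre">UnrollLoopConfig</span></code>):</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import tvm
 
 # supply the registered "tir.UnrollLoop" config for this scope
 with tvm.transform.PassContext(
     opt_level=3, config={"tir.UnrollLoop": {"auto_max_step": 16}}
 ):
     pass  # builds here see the custom unrolling limit
 </pre></div>
 </div>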
@@ -863,7 +863,7 @@ const folding has a Python API like the following:</p>
 </pre></div>
 </div>
 <p>Users can build a pass through decoration like the following:</p>
-<div class="highlight-python notranslate"><div class="highlight"><pre><span></span> <span class="nd">@relay.transform.module_pass</span><span class="p">(</span><span class="n">opt_level</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span> <span class="nd">@relay</span><span class="o">.</span><span class="n">transform</span><span class="o">.</span><span class="n">module_pass</span><span class="p">(</span><span class="n">opt_level</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
  <span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="n">mod</span><span class="p">,</span> <span class="n">ctx</span><span class="p">):</span>
     <span class="n">tp</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">TensorType</span><span class="p">((</span><span class="mi">10</span><span class="p">,),</span> <span class="s2">&quot;float32&quot;</span><span class="p">)</span>
     <span class="n">x</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">var</span><span class="p">(</span><span class="s2">&quot;x&quot;</span><span class="p">,</span> <span class="n">tp</span><span class="p">)</span>
@@ -889,7 +889,7 @@ function.</p>
 </div>
 <p>Correspondingly, we also offer such functionality for <code class="docutils literal notranslate"><span class="pre">function_pass</span></code>. For
 instance, an example function-level pass could be written as follows:</p>
-<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@relay.transform.function_pass</span><span class="p">(</span><span class="n">opt_level</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@relay</span><span class="o">.</span><span class="n">transform</span><span class="o">.</span><span class="n">function_pass</span><span class="p">(</span><span class="n">opt_level</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
 <span class="k">class</span> <span class="nc">TestReplaceFunc</span><span class="p">:</span>
    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">new_func</span><span class="p">):</span>
       <span class="bp">self</span><span class="o">.</span><span class="n">new_func</span> <span class="o">=</span> <span class="n">new_func</span>
diff --git a/docs/arch/relay_intro.html b/docs/arch/relay_intro.html
index 63c3c6f77..03394be09 100644
--- a/docs/arch/relay_intro.html
+++ b/docs/arch/relay_intro.html
@@ -378,16 +378,16 @@ node can be referenced in later parts of the program.</p>
 <p>So far we have introduced how to build a dataflow graph as a function. One might naturally ask: Can we support multiple
 functions and enable them to call each other? Relay allows grouping multiple functions together in a module; the code below
 shows an example of a function calling another function.</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>def @muladd(%x, %y, %z) {
-  %1 = mul(%x, %y)
-  %2 = add(%1, %z)
-  %2
-}
-def @myfunc(%x) {
-  %1 = @muladd(%x, 1, 2)
-  %2 = @muladd(%1, 2, 3)
-  %2
-}
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nd">@muladd</span><span class="p">(</span><span class="o">%</span><span class="n">x</span><span class="p">,</span> <span class="o">%</span><span class="n">y</span><span class="p">,</span> <span class="o">%</span><span class="n">z</span><span class="p">)</span> <span class="p">{</span>
+  <span class="o">%</span><span class="mi">1</span> <span class="o">=</span> <span class="n">mul</span><span class="p">(</span><span class="o">%</span><span class="n">x</span><span class="p">,</span> <span class="o">%</span><span class="n">y</span><span class="p">)</span>
+  <span class="o">%</span><span class="mi">2</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="o">%</span><span class="mi">1</span><span class="p">,</span> <span class="o">%</span><span class="n">z</span><span class="p">)</span>
+  <span class="o">%</span><span class="mi">2</span>
+<span class="p">}</span>
+<span class="k">def</span> <span class="nd">@myfunc</span><span class="p">(</span><span class="o">%</span><span class="n">x</span><span class="p">)</span> <span class="p">{</span>
+  <span class="o">%</span><span class="mi">1</span> <span class="o">=</span> <span class="nd">@muladd</span><span class="p">(</span><span class="o">%</span><span class="n">x</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
+  <span class="o">%</span><span class="mi">2</span> <span class="o">=</span> <span class="nd">@muladd</span><span class="p">(</span><span class="o">%</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">)</span>
+  <span class="o">%</span><span class="mi">2</span>
+<span class="p">}</span>
 </pre></div>
 </div>
 <p>The Module can be viewed as a <code class="docutils literal notranslate"><span class="pre">Map&lt;GlobalVar,</span> <span class="pre">Function&gt;</span></code>. Here GlobalVar is just an id that is used to represent the functions
@@ -396,17 +396,17 @@ the corresponding GlobalVar is stored in the op field of the CallNode. It contai
 body of the called function from the module using the corresponding GlobalVar. In this particular case, we could also directly
 store the reference to the Function as op in the CallNode. So, why do we need to introduce GlobalVar? The main reason is that
 GlobalVar decouples the definition/declaration and enables recursion and delayed declaration of the function.</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>def @myfunc(%x) {
-  %1 = equal(%x, 1)
-   if (%1) {
-      %x
-   } else {
-     %2 = sub(%x, 1)
-     %3 = @myfunc(%2)
-      %4 = add(%3, %3)
-      %4
-  }
-}
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nd">@myfunc</span><span class="p">(</span><span class="o">%</span><span class="n">x</span><span class="p">)</span> <span class="p">{</span>
+  <span class="o">%</span><span class="mi">1</span> <span class="o">=</span> <span class="n">equal</span><span class="p">(</span><span class="o">%</span><span class="n">x</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
+   <span class="k">if</span> <span class="p">(</span><span class="o">%</span><span class="mi">1</span><span class="p">)</span> <span class="p">{</span>
+      <span class="o">%</span><span class="n">x</span>
+   <span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
+     <span class="o">%</span><span class="mi">2</span> <span class="o">=</span> <span class="n">sub</span><span class="p">(</span><span class="o">%</span><span class="n">x</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
+     <span class="o">%</span><span class="mi">3</span> <span class="o">=</span> <span class="nd">@myfunc</span><span class="p">(</span><span class="o">%</span><span class="mi">2</span><span class="p">)</span>
+      <span class="o">%</span><span class="mi">4</span> <span class="o">=</span> <span class="n">add</span><span class="p">(</span><span class="o">%</span><span class="mi">3</span><span class="p">,</span> <span class="o">%</span><span class="mi">3</span><span class="p">)</span>
+      <span class="o">%</span><span class="mi">4</span>
+  <span class="p">}</span>
+<span class="p">}</span>
 </pre></div>
 </div>
 <p>In the above example, <code class="docutils literal notranslate"><span class="pre">&#64;myfunc</span></code> recursively calls itself. Using GlobalVar <code class="docutils literal notranslate"><span class="pre">&#64;myfunc</span></code> to represent the function avoids
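 <p>A hedged Python sketch of declaring the same recursive function through a GlobalVar (assuming the standard Relay builder API):</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import tvm
 from tvm import relay
 
 # GlobalVar lets us reference @myfunc before (and inside) its own body
 gv = relay.GlobalVar("myfunc")
 x = relay.var("x", shape=(), dtype="int32")
 rec = gv(relay.subtract(x, relay.const(1)))
 body = relay.If(relay.equal(x, relay.const(1)), x, relay.add(rec, rec))
 mod = tvm.IRModule({gv: relay.Function([x], body)})
 </pre></div>
 </div>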
diff --git a/docs/arch/relay_op_strategy.html b/docs/arch/relay_op_strategy.html
index dbe7f128b..954a83807 100644
--- a/docs/arch/relay_op_strategy.html
+++ b/docs/arch/relay_op_strategy.html
@@ -360,7 +360,7 @@ compute and schedule functions should be used given a workload, and needs to be
 registered to each Relay operator.  <code class="docutils literal notranslate"><span class="pre">FTVMStrategy</span></code> is a generic function (see
 <code class="docutils literal notranslate"><span class="pre">include/tvm/target/generic_func.h</span></code>), that can be overwritten for each
 target. The function signature is</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">OpStrategy</span><span class="p">(</span><span class="k">const</span> <span class="n">Attrs</span><span class="o">&amp;</span> <span class="n">attrs</span><span class="p">,</span> <span class="k">const</span> <span class="n">Array</span><span class="o">&lt;</span><span class="n">Tensor</span><span class="o">&gt;&amp;</span> <span class="n">inputs</span><span class="p">,</span> <span class="k">co [...]
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">OpStrategy</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">Attrs</span><span class="o">&amp;</span><span class="w"> </span><span class="n">attrs</span><span class="p">,</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Tensor</span><span class=" [...]
 </pre></div>
 </div>
 <p>The function returns an <code class="docutils literal notranslate"><span class="pre">OpStrategy</span></code> given the op attributes, input
@@ -388,7 +388,7 @@ It only has one API, which is to add an implementation to the strategy:</p>
     <span class="k">return</span> <span class="n">strategy</span>
 
 <span class="c1"># add to each target file in python/tvm/relay/op/strategy, e.g., x86.py, cuda.py, etc.</span>
-<span class="nd">@topk_strategy.register</span><span class="p">([</span><span class="s2">&quot;cuda&quot;</span><span class="p">,</span> <span class="s2">&quot;gpu&quot;</span><span class="p">])</span>
+<span class="nd">@topk_strategy</span><span class="o">.</span><span class="n">register</span><span class="p">([</span><span class="s2">&quot;cuda&quot;</span><span class="p">,</span> <span class="s2">&quot;gpu&quot;</span><span class="p">])</span>
 <span class="k">def</span> <span class="nf">topk_strategy_cuda</span><span class="p">(</span><span class="n">attrs</span><span class="p">,</span> <span class="n">inputs</span><span class="p">,</span> <span class="n">out_type</span><span class="p">,</span> <span class="n">target</span><span class="p">):</span>
     <span class="n">strategy</span> <span class="o">=</span> <span class="n">_op</span><span class="o">.</span><span class="n">OpStrategy</span><span class="p">()</span>
     <span class="n">strategy</span><span class="o">.</span><span class="n">add_implementation</span><span class="p">(</span>
@@ -504,7 +504,7 @@ code snippet shows <code class="docutils literal notranslate"><span class="pre">
         <span class="k">return</span> <span class="n">topi</span><span class="o">.</span><span class="n">generic</span><span class="o">.</span><span class="n">schedule_pool</span><span class="p">(</span><span class="n">outs</span><span class="p">,</span> <span class="n">attrs</span><span class="o">.</span><span class="n">layout</span><span class="p">)</span>
 
 <span class="c1"># add to each target file in python/tvm/relay/op/strategy, e.g., x86.py, cuda.py, etc.</span>
-<span class="nd">@schedule_pool.register</span><span class="p">(</span><span class="s2">&quot;cpu&quot;</span><span class="p">)</span>
+<span class="nd">@schedule_pool</span><span class="o">.</span><span class="n">register</span><span class="p">(</span><span class="s2">&quot;cpu&quot;</span><span class="p">)</span>
 <span class="k">def</span> <span class="nf">schedule_pool_cpu</span><span class="p">(</span><span class="n">attrs</span><span class="p">,</span> <span class="n">outs</span><span class="p">,</span> <span class="n">target</span><span class="p">):</span>
     <span class="o">...</span>
 </pre></div>
@@ -525,7 +525,7 @@ strategies for the rest.</p>
 <p>Alternatively, you can also register the strategy for the new target outside the
 TVM Python library. The following code snippet shows an example of how to do
 so. You can find more examples in <code class="docutils literal notranslate"><span class="pre">vta/python/vta/top/op.py</span></code>.</p>
-<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@relay.op.strategy.conv2d_strategy.register</span><span class="p">(</span><span class="s2">&quot;mytarget&quot;</span><span class="p">)</span>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@relay</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">strategy</span><span class="o">.</span><span class="n">conv2d_strategy</span><span class="o">.</span><span class="n">register</span><span class="p">(</span><span class="s2">&quot;mytarget&quot;</span><span class="p">)</span>
 <span class="k">def</span> <span class="nf">conv2d_strategy_mytarget</span><span class="p">(</span><span class="n">attrs</span><span class="p">,</span> <span class="n">inputs</span><span class="p">,</span> <span class="n">out_type</span><span class="p">,</span> <span class="n">target</span><span class="p">):</span>
     <span class="o">...</span>
 </pre></div>
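 <p>As an illustrative, hedged sketch of what such a body might contain (the compute and schedule names here are hypothetical; the <code class="docutils literal notranslate"><span class="pre">wrap_compute_conv2d</span></code> / <code class="docutils literal notranslate"><span class="pre">wrap_topi_schedule</span></code> helpers are borrowed from the generic strategy module):</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span>@relay.op.strategy.conv2d_strategy.register("mytarget")
 def conv2d_strategy_mytarget(attrs, inputs, out_type, target):
     strategy = _op.OpStrategy()
     # hypothetical compute/schedule pair for the new target
     strategy.add_implementation(
         wrap_compute_conv2d(my_conv2d_compute),
         wrap_topi_schedule(my_conv2d_schedule),
         name="conv2d.mytarget",
     )
     return strategy
 </pre></div>
 </div>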
diff --git a/docs/arch/runtime.html b/docs/arch/runtime.html
index e023604df..ec9713392 100644
--- a/docs/arch/runtime.html
+++ b/docs/arch/runtime.html
@@ -360,21 +360,21 @@ We also want the runtime core to be minimal to deploy to embedded devices.</p>
 challenges listed.  A single <code class="docutils literal notranslate"><span class="pre">PackedFunc</span></code> object represents a
 function call whose caller and callee may be in different languages.</p>
 <p>The following code block provides an example in C++</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="cp">#include</span> <span class="cpf">&lt;tvm/runtime/packed_func.h&gt;</span><span class="cp"></span>
-
-<span class="kt">void</span> <span class="nf">MyAdd</span><span class="p">(</span><span class="n">TVMArgs</span> <span class="n">args</span><span class="p">,</span> <span class="n">TVMRetValue</span><span class="o">*</span> <span class="n">rv</span><span class="p">)</span> <span class="p">{</span>
-  <span class="c1">// automatically convert arguments to desired type.</span>
-  <span class="kt">int</span> <span class="n">a</span> <span class="o">=</span> <span class="n">args</span><span class="p">[</span><span class="mi">0</span><span class="p">];</span>
-  <span class="kt">int</span> <span class="n">b</span> <span class="o">=</span> <span class="n">args</span><span class="p">[</span><span class="mi">1</span><span class="p">];</span>
-  <span class="c1">// automatically assign value return to rv</span>
-  <span class="o">*</span><span class="n">rv</span> <span class="o">=</span> <span class="n">a</span> <span class="o">+</span> <span class="n">b</span><span class="p">;</span>
-<span class="p">}</span>
-
-<span class="kt">void</span> <span class="nf">CallPacked</span><span class="p">()</span> <span class="p">{</span>
-  <span class="n">PackedFunc</span> <span class="n">myadd</span> <span class="o">=</span> <span class="n">PackedFunc</span><span class="p">(</span><span class="n">MyAdd</span><span class="p">);</span>
-  <span class="c1">// get back 3</span>
-  <span class="kt">int</span> <span class="n">c</span> <span class="o">=</span> <span class="n">myadd</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">);</span>
-<span class="p">}</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/runtime/packed_func.h&gt;</span><span class="cp"></span>
+
+<span class="kt">void</span><span class="w"> </span><span class="nf">MyAdd</span><span class="p">(</span><span class="n">TVMArgs</span><span class="w"> </span><span class="n">args</span><span class="p">,</span><span class="w"> </span><span class="n">TVMRetValue</span><span class="o">*</span><span class="w"> </span><span class="n">rv</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="c1">// automatically convert arguments to desired type.</span>
+<span class="w">  </span><span class="kt">int</span><span class="w"> </span><span class="n">a</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">args</span><span class="p">[</span><span class="mi">0</span><span class="p">];</span><span class="w"></span>
+<span class="w">  </span><span class="kt">int</span><span class="w"> </span><span class="n">b</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">args</span><span class="p">[</span><span class="mi">1</span><span class="p">];</span><span class="w"></span>
+<span class="w">  </span><span class="c1">// automatically assign value return to rv</span>
+<span class="w">  </span><span class="o">*</span><span class="n">rv</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">a</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">b</span><span class="p">;</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
+
+<span class="kt">void</span><span class="w"> </span><span class="nf">CallPacked</span><span class="p">()</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">PackedFunc</span><span class="w"> </span><span class="n">myadd</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">PackedFunc</span><span class="p">(</span><span class="n">MyAdd</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="c1">// get back 3</span>
+<span class="w">  </span><span class="kt">int</span><span class="w"> </span><span class="n">c</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">myadd</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">);</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>In the above code block, we define a PackedFunc MyAdd. It takes two arguments
@@ -385,15 +385,15 @@ and gets the result back via TVMRetValue.</p>
 <p>Thanks to template tricks in C++, we can call a PackedFunc just like a normal function. Because of its type-erased nature, we can call a PackedFunc from dynamic languages like Python, without additional glue code for each new function type created.
 The following example registers a PackedFunc in C++ and calls it from Python.</p>
 <div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="c1">// register a global packed function in c++</span>
-<span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;myadd&quot;</span><span class="p">)</span>
-<span class="p">.</span><span class="n">set_body</span><span class="p">(</span><span class="n">MyAdd</span><span class="p">);</span>
+<span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;myadd&quot;</span><span class="p">)</span><span class="w"></span>
+<span class="p">.</span><span class="n">set_body</span><span class="p">(</span><span class="n">MyAdd</span><span class="p">);</span><span class="w"></span>
 </pre></div>
 </div>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
 
 <span class="n">myadd</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">get_global_func</span><span class="p">(</span><span class="s2">&quot;myadd&quot;</span><span class="p">)</span>
 <span class="c1"># prints 3</span>
-<span class="k">print</span><span class="p">(</span><span class="n">myadd</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">))</span>
+<span class="nb">print</span><span class="p">(</span><span class="n">myadd</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">))</span>
 </pre></div>
 </div>
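 <p>The reverse direction also works. A minimal sketch (assuming the stock <code class="docutils literal notranslate"><span class="pre">tvm.register_func</span></code> API) that registers a global PackedFunc from Python:</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import tvm
 
 # expose a Python function as a global PackedFunc
 tvm.register_func("py.myadd", lambda a, b: a + b)
 # look it up again and call it; prints 3
 print(tvm.get_global_func("py.myadd")(1, 2))
 </pre></div>
 </div>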
 <p>Most of the magic of PackedFunc lies in <code class="docutils literal notranslate"><span class="pre">TVMArgs</span></code> and <code class="docutils literal notranslate"><span class="pre">TVMRetValue</span></code> structure.
@@ -411,17 +411,17 @@ Despite being minimal, the PackedFunc is sufficient for the use-case of deep lea
 most functions only take DLTensor or numbers.</p>
 <p>Since one PackedFunc can take another PackedFunc as an argument,
 we can pass functions from Python (as a PackedFunc) to C++.</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;callhello&quot;</span><span class="p">)</span>
-<span class="p">.</span><span class="n">set_body</span><span class="p">([](</span><span class="n">TVMArgs</span> <span class="n">args</span><span class="p">,</span> <span class="n">TVMRetValue</span><span class="o">*</span> <span class="n">rv</span><span class="p">)</span> <span class="p">{</span>
-  <span class="n">PackedFunc</span> <span class="n">f</span> <span class="o">=</span> <span class="n">args</span><span class="p">[</span><span class="mi">0</span><span class="p">];</span>
-  <span class="n">f</span><span class="p">(</span><span class="s">&quot;hello world&quot;</span><span class="p">);</span>
-<span class="p">});</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;callhello&quot;</span><span class="p">)</span><span class="w"></span>
+<span class="p">.</span><span class="n">set_body</span><span class="p">([](</span><span class="n">TVMArgs</span><span class="w"> </span><span class="n">args</span><span class="p">,</span><span class="w"> </span><span class="n">TVMRetValue</span><span class="o">*</span><span class="w"> </span><span class="n">rv</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">PackedFunc</span><span class="w"> </span><span class="n">f</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">args</span><span class="p">[</span><span class="mi">0</span><span class="p">];</span><span class="w"></span>
+<span class="w">  </span><span class="n">f</span><span class="p">(</span><span class="s">&quot;hello world&quot;</span><span class="p">);</span><span class="w"></span>
+<span class="p">});</span><span class="w"></span>
 </pre></div>
 </div>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
 
 <span class="k">def</span> <span class="nf">callback</span><span class="p">(</span><span class="n">msg</span><span class="p">):</span>
-  <span class="k">print</span><span class="p">(</span><span class="n">msg</span><span class="p">)</span>
+  <span class="nb">print</span><span class="p">(</span><span class="n">msg</span><span class="p">)</span>
 
 <span class="c1"># convert to PackedFunc</span>
 <span class="n">f</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="n">callback</span><span class="p">)</span>
@@ -485,48 +485,48 @@ adding the code back to the central repo. To ease the speed of dispatching, we a
 track of reference. We use the <code class="docutils literal notranslate"><span class="pre">ObjectRef</span></code> class to represent a reference to the <code class="docutils literal notranslate"><span class="pre">Object</span></code>.
 We can roughly view the <code class="docutils literal notranslate"><span class="pre">ObjectRef</span></code> class as a shared_ptr to the <code class="docutils literal notranslate"><span class="pre">Object</span></code> container.
 We can also define subclasses of <code class="docutils literal notranslate"><span class="pre">ObjectRef</span></code> to hold each subtype of <code class="docutils literal notranslate"><span class="pre">Object</span></code>. Each subclass of <code class="docutils literal notranslate"><span class="pre">Object</span></code> needs to define the VisitAttrs function.</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">class</span> <span class="n">AttrVisitor</span> <span class="p">{</span>
-<span class="nl">public</span><span class="p">:</span>
-  <span class="n">virtual</span> <span class="kt">void</span> <span class="n">Visit</span><span class="p">(</span><span class="k">const</span> <span class="kt">char</span><span class="o">*</span> <span class="n">key</span><span class="p">,</span> <span class="kt">double</span><span class="o">*</span> <span class="n">value</span><span class="p">)</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-  <span class="n">virtual</span> <span class="kt">void</span> <span class="nf">Visit</span><span class="p">(</span><span class="k">const</span> <span class="kt">char</span><span class="o">*</span> <span class="n">key</span><span class="p">,</span> <span class="kt">int64_t</span><span class="o">*</span> <span class="n">value</span><span class="p">)</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-  <span class="n">virtual</span> <span class="kt">void</span> <span class="nf">Visit</span><span class="p">(</span><span class="k">const</span> <span class="kt">char</span><span class="o">*</span> <span class="n">key</span><span class="p">,</span> <span class="kt">uint64_t</span><span class="o">*</span> <span class="n">value</span><span class="p">)</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-  <span class="n">virtual</span> <span class="kt">void</span> <span class="nf">Visit</span><span class="p">(</span><span class="k">const</span> <span class="kt">char</span><span class="o">*</span> <span class="n">key</span><span class="p">,</span> <span class="kt">int</span><span class="o">*</span> <span class="n">value</span><span class="p">)</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-  <span class="n">virtual</span> <span class="kt">void</span> <span class="nf">Visit</span><span class="p">(</span><span class="k">const</span> <span class="kt">char</span><span class="o">*</span> <span class="n">key</span><span class="p">,</span> <span class="kt">bool</span><span class="o">*</span> <span class="n">value</span><span class="p">)</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-  <span class="n">virtual</span> <span class="kt">void</span> <span class="nf">Visit</span><span class="p">(</span><span class="k">const</span> <span class="kt">char</span><span class="o">*</span> <span class="n">key</span><span class="p">,</span> <span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">*</span> <span class="n">value</span><span class="p">)</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-  <span class="n">virtual</span> <span class="kt">void</span> <span class="nf">Visit</span><span class="p">(</span><span class="k">const</span> <span class="kt">char</span><span class="o">*</span> <span class="n">key</span><span class="p">,</span> <span class="kt">void</span><span class="o">**</span> <span class="n">value</span><span class="p">)</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-  <span class="n">virtual</span> <span class="kt">void</span> <span class="nf">Visit</span><span class="p">(</span><span class="k">const</span> <span class="kt">char</span><span class="o">*</span> <span class="n">key</span><span class="p">,</span> <span class="n">Type</span><span class="o">*</span> <span class="n">value</span><span class="p">)</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-  <span class="n">virtual</span> <span class="kt">void</span> <span class="nf">Visit</span><span class="p">(</span><span class="k">const</span> <span class="kt">char</span><span class="o">*</span> <span class="n">key</span><span class="p">,</span> <span class="n">ObjectRef</span><span class="o">*</span> <span class="n">value</span><span class="p">)</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-  <span class="c1">// ...</span>
-<span class="p">};</span>
-
-<span class="n">class</span> <span class="nl">BaseAttrsNode</span> <span class="p">:</span> <span class="n">public</span> <span class="n">Object</span> <span class="p">{</span>
-<span class="nl">public</span><span class="p">:</span>
-  <span class="n">virtual</span> <span class="kt">void</span> <span class="n">VisitAttrs</span><span class="p">(</span><span class="n">AttrVisitor</span><span class="o">*</span> <span class="n">v</span><span class="p">)</span> <span class="p">{}</span>
-  <span class="c1">// ...</span>
-<span class="p">};</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">class</span><span class="w"> </span><span class="n">AttrVisitor</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="n">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">  </span><span class="n">virtual</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">Visit</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="kt">char</span><span class="o">*</span><span class="w"> </span><span class="n">key</span><span class="p">,</span><span class="w"> </span><span class="kt">double</span><span class="o">*</span><span class="w"> </span><span class="n">value</span>< [...]
+<span class="w">  </span><span class="n">virtual</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">Visit</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="kt">char</span><span class="o">*</span><span class="w"> </span><span class="n">key</span><span class="p">,</span><span class="w"> </span><span class="kt">int64_t</span><span class="o">*</span><span class="w"> </span><span class="n">value</span> [...]
+<span class="w">  </span><span class="n">virtual</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">Visit</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="kt">char</span><span class="o">*</span><span class="w"> </span><span class="n">key</span><span class="p">,</span><span class="w"> </span><span class="kt">uint64_t</span><span class="o">*</span><span class="w"> </span><span class="n">value</span [...]
+<span class="w">  </span><span class="n">virtual</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">Visit</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="kt">char</span><span class="o">*</span><span class="w"> </span><span class="n">key</span><span class="p">,</span><span class="w"> </span><span class="kt">int</span><span class="o">*</span><span class="w"> </span><span class="n">value</span><spa [...]
+<span class="w">  </span><span class="n">virtual</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">Visit</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="kt">char</span><span class="o">*</span><span class="w"> </span><span class="n">key</span><span class="p">,</span><span class="w"> </span><span class="kt">bool</span><span class="o">*</span><span class="w"> </span><span class="n">value</span><sp [...]
+<span class="w">  </span><span class="n">virtual</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">Visit</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="kt">char</span><span class="o">*</span><span class="w"> </span><span class="n">key</span><span class="p">,</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">*</span><sp [...]
+<span class="w">  </span><span class="n">virtual</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">Visit</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="kt">char</span><span class="o">*</span><span class="w"> </span><span class="n">key</span><span class="p">,</span><span class="w"> </span><span class="kt">void</span><span class="o">**</span><span class="w"> </span><span class="n">value</span><s [...]
+<span class="w">  </span><span class="n">virtual</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">Visit</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="kt">char</span><span class="o">*</span><span class="w"> </span><span class="n">key</span><span class="p">,</span><span class="w"> </span><span class="n">Type</span><span class="o">*</span><span class="w"> </span><span class="n">value</span><spa [...]
+<span class="w">  </span><span class="n">virtual</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">Visit</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="kt">char</span><span class="o">*</span><span class="w"> </span><span class="n">key</span><span class="p">,</span><span class="w"> </span><span class="n">ObjectRef</span><span class="o">*</span><span class="w"> </span><span class="n">value</span [...]
+<span class="w">  </span><span class="c1">// ...</span>
+<span class="p">};</span><span class="w"></span>
+
+<span class="n">class</span><span class="w"> </span><span class="n">BaseAttrsNode</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="n">public</span><span class="w"> </span><span class="n">Object</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="n">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">  </span><span class="n">virtual</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">VisitAttrs</span><span class="p">(</span><span class="n">AttrVisitor</span><span class="o">*</span><span class="w"> </span><span class="n">v</span><span class="p">)</span><span class="w"> </span><span class="p">{}</span><span class="w"></span>
+<span class="w">  </span><span class="c1">// ...</span>
+<span class="p">};</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Each <code class="docutils literal notranslate"><span class="pre">Object</span></code> subclass will override this to visit its members. Here is an example implementation of TensorNode.</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">class</span> <span class="nl">TensorNode</span> <span class="p">:</span> <span class="n">public</span> <span class="n">Object</span> <span class="p">{</span>
-<span class="nl">public</span><span class="p">:</span>
-  <span class="cm">/*! \brief The shape of the tensor */</span>
-  <span class="n">Array</span><span class="o">&lt;</span><span class="n">Expr</span><span class="o">&gt;</span> <span class="n">shape</span><span class="p">;</span>
-  <span class="cm">/*! \brief data type in the content of the tensor */</span>
-  <span class="n">Type</span> <span class="n">dtype</span><span class="p">;</span>
-  <span class="cm">/*! \brief the source operation, can be None */</span>
-  <span class="n">Operation</span> <span class="n">op</span><span class="p">;</span>
-  <span class="cm">/*! \brief the output index from source operation */</span>
-  <span class="kt">int</span> <span class="n">value_index</span><span class="p">{</span><span class="mi">0</span><span class="p">};</span>
-  <span class="cm">/*! \brief constructor */</span>
-  <span class="n">TensorNode</span><span class="p">()</span> <span class="p">{}</span>
-
-  <span class="kt">void</span> <span class="n">VisitAttrs</span><span class="p">(</span><span class="n">AttrVisitor</span><span class="o">*</span> <span class="n">v</span><span class="p">)</span> <span class="n">final</span> <span class="p">{</span>
-    <span class="n">v</span><span class="o">-&gt;</span><span class="n">Visit</span><span class="p">(</span><span class="s">&quot;shape&quot;</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">shape</span><span class="p">);</span>
-    <span class="n">v</span><span class="o">-&gt;</span><span class="n">Visit</span><span class="p">(</span><span class="s">&quot;dtype&quot;</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">dtype</span><span class="p">);</span>
-    <span class="n">v</span><span class="o">-&gt;</span><span class="n">Visit</span><span class="p">(</span><span class="s">&quot;op&quot;</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">op</span><span class="p">);</span>
-    <span class="n">v</span><span class="o">-&gt;</span><span class="n">Visit</span><span class="p">(</span><span class="s">&quot;value_index&quot;</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">value_index</span><span class="p">);</span>
-  <span class="p">}</span>
-<span class="p">};</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">class</span><span class="w"> </span><span class="n">TensorNode</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="n">public</span><span class="w"> </span><span class="n">Object</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="n">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">  </span><span class="cm">/*! \brief The shape of the tensor */</span><span class="w"></span>
+<span class="w">  </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Expr</span><span class="o">&gt;</span><span class="w"> </span><span class="n">shape</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="cm">/*! \brief data type in the content of the tensor */</span><span class="w"></span>
+<span class="w">  </span><span class="n">Type</span><span class="w"> </span><span class="n">dtype</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="cm">/*! \brief the source operation, can be None */</span><span class="w"></span>
+<span class="w">  </span><span class="n">Operation</span><span class="w"> </span><span class="n">op</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="cm">/*! \brief the output index from source operation */</span><span class="w"></span>
+<span class="w">  </span><span class="kt">int</span><span class="w"> </span><span class="n">value_index</span><span class="p">{</span><span class="mi">0</span><span class="p">};</span><span class="w"></span>
+<span class="w">  </span><span class="cm">/*! \brief constructor */</span><span class="w"></span>
+<span class="w">  </span><span class="n">TensorNode</span><span class="p">()</span><span class="w"> </span><span class="p">{}</span><span class="w"></span>
+
+<span class="w">  </span><span class="kt">void</span><span class="w"> </span><span class="n">VisitAttrs</span><span class="p">(</span><span class="n">AttrVisitor</span><span class="o">*</span><span class="w"> </span><span class="n">v</span><span class="p">)</span><span class="w"> </span><span class="n">final</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="n">v</span><span class="o">-&gt;</span><span class="n">Visit</span><span class="p">(</span><span class="s">&quot;shape&quot;</span><span class="p">,</span><span class="w"> </span><span class="o">&amp;</span><span class="n">shape</span><span class="p">);</span><span class="w"></span>
+<span class="w">    </span><span class="n">v</span><span class="o">-&gt;</span><span class="n">Visit</span><span class="p">(</span><span class="s">&quot;dtype&quot;</span><span class="p">,</span><span class="w"> </span><span class="o">&amp;</span><span class="n">dtype</span><span class="p">);</span><span class="w"></span>
+<span class="w">    </span><span class="n">v</span><span class="o">-&gt;</span><span class="n">Visit</span><span class="p">(</span><span class="s">&quot;op&quot;</span><span class="p">,</span><span class="w"> </span><span class="o">&amp;</span><span class="n">op</span><span class="p">);</span><span class="w"></span>
+<span class="w">    </span><span class="n">v</span><span class="o">-&gt;</span><span class="n">Visit</span><span class="p">(</span><span class="s">&quot;value_index&quot;</span><span class="p">,</span><span class="w"> </span><span class="o">&amp;</span><span class="n">value_index</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+<span class="p">};</span><span class="w"></span>
 </pre></div>
 </div>
 <p>In the above examples, both <code class="docutils literal notranslate"><span class="pre">Operation</span></code> and <code class="docutils literal notranslate"><span class="pre">Array&lt;Expr&gt;</span></code> are ObjectRef.
@@ -539,7 +539,7 @@ For example, in the following code, we accessed the op field of the TensorNode.<
 
 <span class="n">x</span> <span class="o">=</span> <span class="n">te</span><span class="o">.</span><span class="n">placeholder</span><span class="p">((</span><span class="mi">3</span><span class="p">,</span><span class="mi">4</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;x&quot;</span><span class="p">)</span>
 <span class="c1"># access the op field of TensorNode</span>
-<span class="k">print</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
+<span class="nb">print</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
 </pre></div>
 </div>
<p>New <code class="docutils literal notranslate"><span class="pre">Object</span></code> can be added to C++ without changing the front-end runtime, making it easy to extend the compiler stack.
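<p>For illustration, a minimal sketch of such an extension, assuming the standard TVM registration macros; the node name and fields here are hypothetical:</p>
<div class="highlight-c++ notranslate"><div class="highlight"><pre>// Hypothetical node: once its members are exposed through
// VisitAttrs and the type is registered, the python frontend can
// inspect it with no frontend-side changes.
class MyCustomNode : public Object {
 public:
  std::string name;
  int64_t size{0};

  void VisitAttrs(AttrVisitor* v) {
    v-&gt;Visit(&quot;name&quot;, &amp;name);
    v-&gt;Visit(&quot;size&quot;, &amp;size);
  }

  static constexpr const char* _type_key = &quot;test.MyCustomNode&quot;;
  TVM_DECLARE_FINAL_OBJECT_INFO(MyCustomNode, Object);
};

TVM_REGISTER_NODE_TYPE(MyCustomNode);
</pre></div></div>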
diff --git a/docs/arch/virtual_machine.html b/docs/arch/virtual_machine.html
index 0688d7b34..7cae306b1 100644
--- a/docs/arch/virtual_machine.html
+++ b/docs/arch/virtual_machine.html
@@ -568,18 +568,18 @@ for each of them can be found at <a class="reference external" href="https://git
 <p>The Relay VM maintains a stack frame, which contains information about how to resume the
 previous call. Registers are allocated in a continuous space (virtual register file) for each function.</p>
<p>We keep track of the set of Relay functions we have called, a pointer into the current function’s bytecode, and an offset into that bytecode (known as the program counter).</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="k">struct</span> <span class="n">VirtualMachine</span> <span class="p">{</span>
-  <span class="p">...</span>
-  <span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">VMFrame</span><span class="o">&gt;</span> <span class="n">frames</span><span class="p">;</span>
-  <span class="p">...</span>
-  <span class="c1">// Current function.</span>
-  <span class="kt">size_t</span> <span class="n">func_index</span><span class="p">;</span>
-  <span class="c1">// Pointer into the current function&#39;s instructions.</span>
-  <span class="k">const</span> <span class="n">Instruction</span><span class="o">*</span> <span class="n">code</span><span class="p">;</span>
-  <span class="c1">// Current program counter relative to the code pointer.</span>
-  <span class="kt">size_t</span> <span class="n">pc</span><span class="p">;</span>
-  <span class="p">...</span>
-<span class="p">};</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="k">struct</span><span class="w"> </span><span class="nc">VirtualMachine</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="p">...</span><span class="w"></span>
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">VMFrame</span><span class="o">&gt;</span><span class="w"> </span><span class="n">frames</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="p">...</span><span class="w"></span>
+<span class="w">  </span><span class="c1">// Current function.</span>
+<span class="w">  </span><span class="kt">size_t</span><span class="w"> </span><span class="n">func_index</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="c1">// Pointer into the current function&#39;s instructions.</span>
+<span class="w">  </span><span class="k">const</span><span class="w"> </span><span class="n">Instruction</span><span class="o">*</span><span class="w"> </span><span class="n">code</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="c1">// Current program counter relative to the code pointer.</span>
+<span class="w">  </span><span class="kt">size_t</span><span class="w"> </span><span class="n">pc</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="p">...</span><span class="w"></span>
+<span class="p">};</span><span class="w"></span>
 </pre></div>
 </div>
 </div>
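<p>For reference, a sketch of the information such a frame might hold; the field names below are illustrative, based on the description above:</p>
<div class="highlight-c notranslate"><div class="highlight"><pre>struct VMFrame {
  // Program counter to resume at in the caller.
  size_t return_pc;
  // Index of the function that pushed this frame.
  size_t func_index;
  // Pointer to the caller&#39;s instructions.
  const Instruction* code;
  // Virtual register file allocated for this call.
  std::vector&lt;ObjectRef&gt; register_file;
  // Caller register that receives the return value.
  RegName caller_return_register;
};
</pre></div></div>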
diff --git a/docs/commit_hash b/docs/commit_hash
index be5198f41..99d3774d2 100644
--- a/docs/commit_hash
+++ b/docs/commit_hash
@@ -1 +1 @@
-b952425b2d46076ccbc0a55953e31afbfac0da33
+a6ef5af1587c71dc69d710058b95f8baa9c6cc4d
diff --git a/docs/contribute/ci.html b/docs/contribute/ci.html
index 5963e1743..528207d09 100644
--- a/docs/contribute/ci.html
+++ b/docs/contribute/ci.html
@@ -423,7 +423,7 @@ if you don’t see any reports of the failure. If a certain test or class of tes
 several PRs or commits on <code class="docutils literal notranslate"><span class="pre">main</span></code> with flaky failures, the test should be disabled via
 <a class="reference external" href="https://docs.pytest.org/en/6.2.x/skipping.html#xfail-mark-test-functions-as-expected-to-fail">pytest’s <code class="docutils literal notranslate"><span class="pre">&#64;xfail</span></code> decorator</a> with <a class="reference external" href="https://docs.pytest.org/en/6.2.x/skipping.html#strict-parameter"><code class="docutils literal notranslate"><span class="pre">strict=True</span></code></a> and the relevant issue linked in the
 disabling PR.</p>
-<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@pytest.mark.xfail</span><span class="p">(</span><span class="n">strict</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span> <span class="n">reason</span><span class="o">=</span><span class="s2">&quot;Flaky test: https://github.com/apache/tvm/issues/1234&quot;</span><span class="p">)</span>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@pytest</span><span class="o">.</span><span class="n">mark</span><span class="o">.</span><span class="n">xfail</span><span class="p">(</span><span class="n">strict</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">reason</span><span class="o">=</span><span class="s2">&quot;Flaky test: https://github.com/apache/tvm/issues/1234&quot;</span><s [...]
 <span class="k">def</span> <span class="nf">test_something_flaky</span><span class="p">():</span>
     <span class="k">pass</span>
 </pre></div>
diff --git a/docs/contribute/code_guide.html b/docs/contribute/code_guide.html
index b8b958cc2..4bb46dd20 100644
--- a/docs/contribute/code_guide.html
+++ b/docs/contribute/code_guide.html
@@ -356,9 +356,9 @@ python tests/scripts/ci.py lint
 </div>
<p>clang-format is also not perfect; when necessary, you can disable clang-format on certain code regions.</p>
 <div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="c1">// clang-format off</span>
-<span class="kt">void</span> <span class="nf">Test</span><span class="p">()</span> <span class="p">{</span>
-   <span class="c1">// clang-format will be disabled in this region.</span>
-<span class="p">}</span>
+<span class="kt">void</span><span class="w"> </span><span class="nf">Test</span><span class="p">()</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">   </span><span class="c1">// clang-format will be disabled in this region.</span>
+<span class="p">}</span><span class="w"></span>
 <span class="c1">// clang-format on</span>
 </pre></div>
 </div>
@@ -367,15 +367,15 @@ python tests/scripts/ci.py lint
 <span class="cp">#define MACRO_FUNC(x)</span>
 
 <span class="c1">// not preferred, because clang-format might recognize it as types.</span>
-<span class="n">virtual</span> <span class="kt">void</span> <span class="nf">Func1</span><span class="p">()</span> <span class="n">MACRO_IMPL</span>
+<span class="n">virtual</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">Func1</span><span class="p">()</span><span class="w"> </span><span class="n">MACRO_IMPL</span><span class="w"></span>
 
 <span class="c1">// preferred</span>
-<span class="n">virtual</span> <span class="kt">void</span> <span class="n">Func2</span><span class="p">()</span> <span class="n">MACRO_IMPL</span><span class="p">;</span>
+<span class="n">virtual</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">Func2</span><span class="p">()</span><span class="w"> </span><span class="n">MACRO_IMPL</span><span class="p">;</span><span class="w"></span>
 
-<span class="kt">void</span> <span class="nf">Func3</span><span class="p">()</span> <span class="p">{</span>
-  <span class="c1">// preferred</span>
-  <span class="n">MACRO_FUNC</span><span class="p">(</span><span class="n">xyz</span><span class="p">);</span>
-<span class="p">}</span>
+<span class="kt">void</span><span class="w"> </span><span class="nf">Func3</span><span class="p">()</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="c1">// preferred</span>
+<span class="w">  </span><span class="n">MACRO_FUNC</span><span class="p">(</span><span class="n">xyz</span><span class="p">);</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 </div>
@@ -391,7 +391,7 @@ python tests/scripts/ci.py lint
 <h2><a class="toc-backref" href="#id3">Writing Python Tests</a><a class="headerlink" href="#writing-python-tests" title="Permalink to this headline">¶</a></h2>
 <p>We use <a class="reference external" href="https://docs.pytest.org/en/stable/">pytest</a> for all python testing. <code class="docutils literal notranslate"><span class="pre">tests/python</span></code> contains all the tests.</p>
 <p>If you want your test to run over a variety of targets, use the <code class="xref py py-func docutils literal notranslate"><span class="pre">tvm.testing.parametrize_targets()</span></code> decorator. For example:</p>
-<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@tvm.testing.parametrize_targets</span>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@tvm</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">parametrize_targets</span>
 <span class="k">def</span> <span class="nf">test_mytest</span><span class="p">(</span><span class="n">target</span><span class="p">,</span> <span class="n">dev</span><span class="p">):</span>
   <span class="o">...</span>
 </pre></div>
@@ -403,11 +403,11 @@ python tests/scripts/ci.py lint
<p>We often need to handle constant integer expressions in TVM. Before we do so, the first question to ask is whether it is really necessary to get a constant integer. If a symbolic expression also works and lets the logic flow through, we should use the symbolic expression as much as possible, so that the generated code works for shapes that are not known ahead of time.</p>
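<p>For illustration, a small sketch in the style of the other examples in these docs; the helper name is hypothetical. Because the result stays a symbolic expression, the computed value is valid even when the shape is only known at runtime:</p>
<div class="highlight-c++ notranslate"><div class="highlight"><pre>// Hypothetical helper: number of elements of a (non-empty) shape.
// The product remains a symbolic Expr; no constant is required.
Expr NumElements(const Array&lt;Expr&gt;&amp; shape) {
  Expr prod = shape[0];
  for (size_t i = 1; i &lt; shape.size(); ++i) {
    prod = prod * shape[i];
  }
  return prod;
}
</pre></div></div>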
<p>Note that in some cases we cannot know certain information, e.g. the sign of a symbolic variable; it is OK to make assumptions in such cases, while adding precise support when the variable is constant.</p>
<p>If we do have to get a constant integer expression, we should get the constant value using type <code class="docutils literal notranslate"><span class="pre">int64_t</span></code> instead of <code class="docutils literal notranslate"><span class="pre">int</span></code>, to avoid potential integer overflow. We can always reconstruct an integer with the corresponding expression type via <code class="docutils literal notranslate"><span class="pre">make_const</span></code>. The following cod [...]
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span> <span class="nf">CalculateExpr</span><span class="p">(</span><span class="n">Expr</span> <span class="n">value</span><span class="p">)</span> <span class="p">{</span>
-  <span class="kt">int64_t</span> <span class="n">int_value</span> <span class="o">=</span> <span class="n">GetConstInt</span><span class="o">&lt;</span><span class="kt">int64_t</span><span class="o">&gt;</span><span class="p">(</span><span class="n">value</span><span class="p">);</span>
-  <span class="n">int_value</span> <span class="o">=</span> <span class="n">CalculateExprInInt64</span><span class="p">(</span><span class="n">int_value</span><span class="p">);</span>
-  <span class="k">return</span> <span class="n">make_const</span><span class="p">(</span><span class="n">value</span><span class="p">.</span><span class="n">type</span><span class="p">(),</span> <span class="n">int_value</span><span class="p">);</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span><span class="w"> </span><span class="nf">CalculateExpr</span><span class="p">(</span><span class="n">Expr</span><span class="w"> </span><span class="n">value</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="kt">int64_t</span><span class="w"> </span><span class="n">int_value</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">GetConstInt</span><span class="o">&lt;</span><span class="kt">int64_t</span><span class="o">&gt;</span><span class="p">(</span><span class="n">value</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="n">int_value</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">CalculateExprInInt64</span><span class="p">(</span><span class="n">int_value</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="k">return</span><span class="w"> </span><span class="n">make_const</span><span class="p">(</span><span class="n">value</span><span class="p">.</span><span class="n">type</span><span class="p">(),</span><span class="w"> </span><span class="n">int_value</span><span class="p">);</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 </div>
diff --git a/docs/contribute/document.html b/docs/contribute/document.html
index 9cc8b9968..828d52636 100644
--- a/docs/contribute/document.html
+++ b/docs/contribute/document.html
@@ -506,10 +506,10 @@ shows an example of c++ docstring.</p>
 <span class="cm"> * \param arg1 Description of arg1</span>
 <span class="cm"> * \param arg2 Descroption of arg2</span>
 <span class="cm"> * \returns describe return value</span>
-<span class="cm"> */</span>
-<span class="kt">int</span> <span class="nf">myfunction</span><span class="p">(</span><span class="kt">int</span> <span class="n">arg1</span><span class="p">,</span> <span class="kt">int</span> <span class="n">arg2</span><span class="p">)</span> <span class="p">{</span>
-  <span class="c1">// When necessary, also add comment to clarify internal logics</span>
-<span class="p">}</span>
+<span class="cm"> */</span><span class="w"></span>
+<span class="kt">int</span><span class="w"> </span><span class="nf">myfunction</span><span class="p">(</span><span class="kt">int</span><span class="w"> </span><span class="n">arg1</span><span class="p">,</span><span class="w"> </span><span class="kt">int</span><span class="w"> </span><span class="n">arg2</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="c1">// When necessary, also add comment to clarify internal logics</span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
<p>Besides documenting function usage, we also highly recommend contributors to
diff --git a/docs/contribute/error_handling.html b/docs/contribute/error_handling.html
index e43080834..b2eb1f3f2 100644
--- a/docs/contribute/error_handling.html
+++ b/docs/contribute/error_handling.html
@@ -346,12 +346,12 @@ there is no error type prefix in the message.
 This mechanism works for both <code class="docutils literal notranslate"><span class="pre">LOG(FATAL)</span></code> and <code class="docutils literal notranslate"><span class="pre">ICHECK</span></code> macros.
The following code gives an example of how to do so.</p>
 <div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="c1">// src/api_test.cc</span>
-<span class="kt">void</span> <span class="nf">ErrorTest</span><span class="p">(</span><span class="kt">int</span> <span class="n">x</span><span class="p">,</span> <span class="kt">int</span> <span class="n">y</span><span class="p">)</span> <span class="p">{</span>
-  <span class="n">ICHECK_EQ</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">)</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;ValueError: expect x and y to be equal.&quot;</span>
-  <span class="k">if</span> <span class="p">(</span><span class="n">x</span> <span class="o">==</span> <span class="mi">1</span><span class="p">)</span> <span class="p">{</span>
-    <span class="n">LOG</span><span class="p">(</span><span class="n">FATAL</span><span class="p">)</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;InternalError: cannot reach here&quot;</span><span class="p">;</span>
-  <span class="p">}</span>
-<span class="p">}</span>
+<span class="kt">void</span><span class="w"> </span><span class="nf">ErrorTest</span><span class="p">(</span><span class="kt">int</span><span class="w"> </span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="kt">int</span><span class="w"> </span><span class="n">y</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">ICHECK_EQ</span><span class="p">(</span><span class="n">x</span><span class="p">,</span><span class="w"> </span><span class="n">y</span><span class="p">)</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;ValueError: expect x and y to be equal.&quot;</span><span class="w"></span>
+<span class="w">  </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="n">LOG</span><span class="p">(</span><span class="n">FATAL</span><span class="p">)</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;InternalError: cannot reach here&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
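<p>As a sketch, such a function can be exposed to the frontend via the global registry; the global name below is an assumption:</p>
<div class="highlight-c notranslate"><div class="highlight"><pre>// Register ErrorTest as a PackedFunc callable from python; the
// errors raised above are then translated on the python side.
TVM_REGISTER_GLOBAL(&quot;testing.ErrorTest&quot;).set_body_typed(ErrorTest);
</pre></div></div>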
<p>The <code class="docutils literal notranslate"><span class="pre">ErrorTest</span></code> function above is registered as a PackedFunc into the python frontend,
@@ -410,7 +410,7 @@ error messages when necessary.</p>
     <span class="k">raise</span> <span class="n">OpNotImplemented</span><span class="p">(</span><span class="s2">&quot;Operator relu is not implemented in the MXNet frontend&quot;</span><span class="p">)</span>
 
 <span class="k">def</span> <span class="nf">_op_not_implemented</span><span class="p">(</span><span class="n">op_name</span><span class="p">):</span>
-    <span class="k">return</span> <span class="n">OpNotImplemented</span><span class="p">(</span><span class="s2">&quot;Operator {} is not implemented.&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">op_name</span><span class="p">)</span>
+    <span class="k">return</span> <span class="n">OpNotImplemented</span><span class="p">(</span><span class="s2">&quot;Operator </span><span class="si">{}</span><span class="s2"> is not implemented.&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">op_name</span><span class="p">)</span>
 
 <span class="k">def</span> <span class="nf">not_preferred</span><span class="p">():</span>
     <span class="c1"># Introduces another level of indirection.</span>
diff --git a/docs/dev/how_to/pytest_target_parametrization.html b/docs/dev/how_to/pytest_target_parametrization.html
index ecd874daa..782305da5 100644
--- a/docs/dev/how_to/pytest_target_parametrization.html
+++ b/docs/dev/how_to/pytest_target_parametrization.html
@@ -353,7 +353,7 @@ each target is reported separately.  If a target cannot be run because
 it is disabled in the <cite>config.cmake</cite>, or because no appropriate
 hardware is present, then that target will be reported as skipped.</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Explicit listing of targets to use.</span>
-<span class="nd">@tvm.testing.parametrize_target</span><span class="p">(</span><span class="s1">&#39;llvm&#39;</span><span class="p">,</span> <span class="s1">&#39;cuda&#39;</span><span class="p">)</span>
+<span class="nd">@tvm</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">parametrize_target</span><span class="p">(</span><span class="s1">&#39;llvm&#39;</span><span class="p">,</span> <span class="s1">&#39;cuda&#39;</span><span class="p">)</span>
 <span class="k">def</span> <span class="nf">test_function</span><span class="p">(</span><span class="n">target</span><span class="p">,</span> <span class="n">dev</span><span class="p">):</span>
     <span class="c1"># Test code goes here</span>
 </pre></div>
@@ -375,7 +375,7 @@ decorator to explicitly draw attention to the parametrization, but has
 no additional effect.</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Explicitly parametrized to run on all targets</span>
 <span class="c1"># in environment variable TVM_TEST_TARGETS</span>
-<span class="nd">@tvm.testing.parametrize_targets</span>
+<span class="nd">@tvm</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">parametrize_targets</span>
 <span class="k">def</span> <span class="nf">test_function</span><span class="p">(</span><span class="n">target</span><span class="p">,</span> <span class="n">dev</span><span class="p">):</span>
     <span class="c1"># Test code goes here</span>
 </pre></div>
@@ -392,7 +392,7 @@ parametrizing over tuples of arguments, such as shown below.  In these
 cases, only the explicitly listed targets will run, but they will
 still have the appropriate <code class="docutils literal notranslate"><span class="pre">&#64;tvm.testing.requires_RUNTIME</span></code> mark
 applied to them.</p>
-<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@pytest.mark.parametrize</span><span class="p">(</span><span class="s1">&#39;target,impl&#39;</span><span class="p">,</span> <span class="p">[</span>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@pytest</span><span class="o">.</span><span class="n">mark</span><span class="o">.</span><span class="n">parametrize</span><span class="p">(</span><span class="s1">&#39;target,impl&#39;</span><span class="p">,</span> <span class="p">[</span>
      <span class="p">(</span><span class="s1">&#39;llvm&#39;</span><span class="p">,</span> <span class="n">cpu_implementation</span><span class="p">),</span>
      <span class="p">(</span><span class="s1">&#39;cuda&#39;</span><span class="p">,</span> <span class="n">gpu_implementation_small_batch</span><span class="p">),</span>
      <span class="p">(</span><span class="s1">&#39;cuda&#39;</span><span class="p">,</span> <span class="n">gpu_implementation_large_batch</span><span class="p">),</span>
diff --git a/docs/dev/how_to/relay_add_op.html b/docs/dev/how_to/relay_add_op.html
index f4de2826c..a4affb34b 100644
--- a/docs/dev/how_to/relay_add_op.html
+++ b/docs/dev/how_to/relay_add_op.html
@@ -347,7 +347,7 @@ The PR itself builds upon another PR which adds a <a class="reference external"
 operator would be an appropriate example of fields which might belong in an attribute node for a convolution operator.</p>
 <p>Attributes should be defined in a file within the folder <a class="reference external" href="https://github.com/apache/tvm/tree/main/include/tvm/relay/attrs">include/tvm/relay/attrs/</a>.</p>
 <p>Ultimately we want to create an operator whose interface can be seen clearly in the final python interface:</p>
-<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">cumprod</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">exclusive</span><span class="o">=</span><span clas [...]
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">cumprod</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">exclusive</span><span class="o">=</span><span clas [...]
     <span class="sd">&quot;&quot;&quot;Numpy style cumprod op. Return the cumulative inclusive product of the elements along</span>
 <span class="sd">    a given axis.</span>
 <span class="sd">    Parameters</span>
@@ -376,19 +376,19 @@ operator would be an appropriate example of fields which might belong in an attr
<p>A similar interface exists for <code class="docutils literal notranslate"><span class="pre">cumsum()</span></code>.</p>
<p>Therefore, when defining our attributes in <code class="docutils literal notranslate"><span class="pre">include/tvm/relay/attrs/transform.h</span></code>, we choose the axis,
 accumulation dtype, and exclusivity of the operation as appropriate fields for the struct.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="cm">/*! \brief Attributes used in cumsum and cumprod operator */</span>
-<span class="k">struct</span> <span class="nl">ScanopAttrs</span> <span class="p">:</span> <span class="k">public</span> <span class="n">tvm</span><span class="o">::</span><span class="n">AttrsNode</span><span class="o">&lt;</span><span class="n">ScanopAttrs</span><span class="o">&gt;</span> <span class="p">{</span>
-  <span class="n">Integer</span> <span class="n">axis</span><span class="p">;</span>
-  <span class="n">DataType</span> <span class="n">dtype</span><span class="p">;</span>
-  <span class="n">Bool</span> <span class="n">exclusive</span> <span class="o">=</span> <span class="n">Bool</span><span class="p">(</span><span class="nb">false</span><span class="p">);</span>
-  <span class="n">TVM_DECLARE_ATTRS</span><span class="p">(</span><span class="n">ScanopAttrs</span><span class="p">,</span> <span class="s">&quot;relay.attrs.ScanopAttrs&quot;</span><span class="p">)</span> <span class="p">{</span>
-    <span class="n">TVM_ATTR_FIELD</span><span class="p">(</span><span class="n">axis</span><span class="p">).</span><span class="n">describe</span><span class="p">(</span><span class="s">&quot;The axis to operate over&quot;</span><span class="p">).</span><span class="n">set_default</span><span class="p">(</span><span class="n">NullValue</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">&gt;</span><span class="p">());</span>
-    <span class="n">TVM_ATTR_FIELD</span><span class="p">(</span><span class="n">dtype</span><span class="p">).</span><span class="n">describe</span><span class="p">(</span><span class="s">&quot;Output data type&quot;</span><span class="p">).</span><span class="n">set_default</span><span class="p">(</span><span class="n">NullValue</span><span class="o">&lt;</span><span class="n">DataType</span><span class="o">&gt;</span><span class="p">());</span>
-    <span class="n">TVM_ATTR_FIELD</span><span class="p">(</span><span class="n">exclusive</span><span class="p">)</span>
-        <span class="p">.</span><span class="n">describe</span><span class="p">(</span><span class="s">&quot;The first element is not included&quot;</span><span class="p">)</span>
-        <span class="p">.</span><span class="n">set_default</span><span class="p">(</span><span class="n">Bool</span><span class="p">(</span><span class="nb">false</span><span class="p">));</span>
-  <span class="p">}</span>
-<span class="p">};</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="cm">/*! \brief Attributes used in cumsum and cumprod operator */</span><span class="w"></span>
+<span class="k">struct</span><span class="w"> </span><span class="nc">ScanopAttrs</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">tvm</span><span class="o">::</span><span class="n">AttrsNode</span><span class="o">&lt;</span><span class="n">ScanopAttrs</span><span class="o">&gt;</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">Integer</span><span class="w"> </span><span class="n">axis</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">DataType</span><span class="w"> </span><span class="n">dtype</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">Bool</span><span class="w"> </span><span class="n">exclusive</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Bool</span><span class="p">(</span><span class="nb">false</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="n">TVM_DECLARE_ATTRS</span><span class="p">(</span><span class="n">ScanopAttrs</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;relay.attrs.ScanopAttrs&quot;</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="n">TVM_ATTR_FIELD</span><span class="p">(</span><span class="n">axis</span><span class="p">).</span><span class="n">describe</span><span class="p">(</span><span class="s">&quot;The axis to operate over&quot;</span><span class="p">).</span><span class="n">set_default</span><span class="p">(</span><span class="n">NullValue</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">&gt;</span><span class="p">());</span><span class= [...]
+<span class="w">    </span><span class="n">TVM_ATTR_FIELD</span><span class="p">(</span><span class="n">dtype</span><span class="p">).</span><span class="n">describe</span><span class="p">(</span><span class="s">&quot;Output data type&quot;</span><span class="p">).</span><span class="n">set_default</span><span class="p">(</span><span class="n">NullValue</span><span class="o">&lt;</span><span class="n">DataType</span><span class="o">&gt;</span><span class="p">());</span><span class="w"></span>
+<span class="w">    </span><span class="n">TVM_ATTR_FIELD</span><span class="p">(</span><span class="n">exclusive</span><span class="p">)</span><span class="w"></span>
+<span class="w">        </span><span class="p">.</span><span class="n">describe</span><span class="p">(</span><span class="s">&quot;The first element is not included&quot;</span><span class="p">)</span><span class="w"></span>
+<span class="w">        </span><span class="p">.</span><span class="n">set_default</span><span class="p">(</span><span class="n">Bool</span><span class="p">(</span><span class="nb">false</span><span class="p">));</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+<span class="p">};</span><span class="w"></span>
 </pre></div>
 </div>
 </div>
@@ -406,36 +406,36 @@ relation for an operator can enforce all the necessary typing rules
 output type.</p>
<p>The type relation for the cumulative product and sum operators can be found in
 <code class="docutils literal notranslate"><span class="pre">src/relay/op/tensor/transform.cc</span></code>:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_REGISTER_NODE_TYPE</span><span class="p">(</span><span class="n">ScanopAttrs</span><span class="p">);</span>
-<span class="kt">bool</span> <span class="nf">ScanopRel</span><span class="p">(</span><span class="k">const</span> <span class="n">Array</span><span class="o">&lt;</span><span class="n">Type</span><span class="o">&gt;&amp;</span> <span class="n">types</span><span class="p">,</span> <span class="kt">int</span> <span class="n">num_inputs</span><span class="p">,</span> <span class="k">const</span> <span class="n">Attrs</span><span class="o">&amp;</span> <span class="n">attrs</span><span cla [...]
-    <span class="c1">// types: [data, output]</span>
-    <span class="n">ICHECK_EQ</span><span class="p">(</span><span class="n">types</span><span class="p">.</span><span class="n">size</span><span class="p">(),</span> <span class="mi">2</span><span class="p">)</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;Expects two types, one for the input and another for the output&quot;</span><span class="p">;</span>
-    <span class="k">const</span> <span class="k">auto</span><span class="o">*</span> <span class="n">data</span> <span class="o">=</span> <span class="n">types</span><span class="p">[</span><span class="mi">0</span><span class="p">].</span><span class="n">as</span><span class="o">&lt;</span><span class="n">TensorTypeNode</span><span class="o">&gt;</span><span class="p">();</span>
-    <span class="k">if</span> <span class="p">(</span><span class="n">data</span> <span class="o">==</span> <span class="k">nullptr</span><span class="p">)</span> <span class="p">{</span>
-        <span class="n">ICHECK</span><span class="p">(</span><span class="n">types</span><span class="p">[</span><span class="mi">0</span><span class="p">].</span><span class="n">as</span><span class="o">&lt;</span><span class="n">IncompleteTypeNode</span><span class="o">&gt;</span><span class="p">())</span>
-        <span class="o">&lt;&lt;</span> <span class="s">&quot;Scanop: expect input type to be TensorType but get &quot;</span> <span class="o">&lt;&lt;</span> <span class="n">types</span><span class="p">[</span><span class="mi">0</span><span class="p">];</span>
-        <span class="k">return</span> <span class="nb">false</span><span class="p">;</span>
-    <span class="p">}</span>
-
-    <span class="k">const</span> <span class="k">auto</span><span class="o">*</span> <span class="n">param</span> <span class="o">=</span> <span class="n">attrs</span><span class="p">.</span><span class="n">as</span><span class="o">&lt;</span><span class="n">ScanopAttrs</span><span class="o">&gt;</span><span class="p">();</span>
-
-    <span class="k">auto</span> <span class="n">dtype</span> <span class="o">=</span> <span class="n">param</span><span class="o">-&gt;</span><span class="n">dtype</span><span class="p">;</span>
-    <span class="k">if</span> <span class="p">(</span><span class="n">dtype</span><span class="p">.</span><span class="n">is_void</span><span class="p">())</span> <span class="p">{</span>
-        <span class="n">dtype</span> <span class="o">=</span> <span class="n">data</span><span class="o">-&gt;</span><span class="n">dtype</span><span class="p">;</span>
-    <span class="p">}</span>
-
-    <span class="k">if</span> <span class="p">(</span><span class="n">param</span><span class="o">-&gt;</span><span class="n">axis</span><span class="p">.</span><span class="n">defined</span><span class="p">())</span> <span class="p">{</span>
-        <span class="n">reporter</span><span class="o">-&gt;</span><span class="n">Assign</span><span class="p">(</span><span class="n">types</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">TensorType</span><span class="p">(</span><span class="n">data</span><span class="o">-&gt;</span><span class="n">shape</span><span class="p">,</span> <span class="n">dtype</span><span class="p">));</span>
-    <span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
-        <span class="k">auto</span> <span class="n">prod</span> <span class="o">=</span> <span class="n">data</span><span class="o">-&gt;</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">];</span>
-        <span class="k">for</span> <span class="p">(</span><span class="kt">size_t</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">1</span><span class="p">;</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="n">data</span><span class="o">-&gt;</span><span class="n">shape</span><span class="p">.</span><span class="n">size</span><span class="p">();</span> <span class="o">++</span><span class="n">i</span><span class="p">)</span> <span class="p [...]
-            <span class="n">prod</span> <span class="o">=</span> <span class="n">prod</span> <span class="o">*</span> <span class="n">data</span><span class="o">-&gt;</span><span class="n">shape</span><span class="p">[</span><span class="n">i</span><span class="p">];</span>
-        <span class="p">}</span>
-        <span class="n">reporter</span><span class="o">-&gt;</span><span class="n">Assign</span><span class="p">(</span><span class="n">types</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">TensorType</span><span class="p">({</span><span class="n">prod</span><span class="p">},</span> <span class="n">dtype</span><span class="p">));</span>
-    <span class="p">}</span>
-
-    <span class="k">return</span> <span class="nb">true</span><span class="p">;</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_REGISTER_NODE_TYPE</span><span class="p">(</span><span class="n">ScanopAttrs</span><span class="p">);</span><span class="w"></span>
+<span class="kt">bool</span><span class="w"> </span><span class="nf">ScanopRel</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Type</span><span class="o">&gt;&amp;</span><span class="w"> </span><span class="n">types</span><span class="p">,</span><span class="w"> </span><span class="kt">int</span><span class="w"> </span><span class="n">num_inputs</span><span class="p">,</span><span cla [...]
+<span class="w">    </span><span class="c1">// types: [data, output]</span>
+<span class="w">    </span><span class="n">ICHECK_EQ</span><span class="p">(</span><span class="n">types</span><span class="p">.</span><span class="n">size</span><span class="p">(),</span><span class="w"> </span><span class="mi">2</span><span class="p">)</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;Expects two types, one for the input and another for the output&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="k">const</span><span class="w"> </span><span class="k">auto</span><span class="o">*</span><span class="w"> </span><span class="n">data</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">types</span><span class="p">[</span><span class="mi">0</span><span class="p">].</span><span class="n">as</span><span class="o">&lt;</span><span class="n">TensorTypeNode</span><span class="o">&gt;</span><span class="p">();</ [...]
+<span class="w">    </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">data</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="k">nullptr</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">        </span><span class="n">ICHECK</span><span class="p">(</span><span class="n">types</span><span class="p">[</span><span class="mi">0</span><span class="p">].</span><span class="n">as</span><span class="o">&lt;</span><span class="n">IncompleteTypeNode</span><span class="o">&gt;</span><span class="p">())</span><span class="w"></span>
+<span class="w">        </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;Scanop: expect input type to be TensorType but get &quot;</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">types</span><span class="p">[</span><span class="mi">0</span><span class="p">];</span><span class="w"></span>
+<span class="w">        </span><span class="k">return</span><span class="w"> </span><span class="nb">false</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+
+<span class="w">    </span><span class="k">const</span><span class="w"> </span><span class="k">auto</span><span class="o">*</span><span class="w"> </span><span class="n">param</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">attrs</span><span class="p">.</span><span class="n">as</span><span class="o">&lt;</span><span class="n">ScanopAttrs</span><span class="o">&gt;</span><span class="p">();</span><span class="w"></span>
+
+<span class="w">    </span><span class="k">auto</span><span class="w"> </span><span class="n">dtype</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">param</span><span class="o">-&gt;</span><span class="n">dtype</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">dtype</span><span class="p">.</span><span class="n">is_void</span><span class="p">())</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">        </span><span class="n">dtype</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">data</span><span class="o">-&gt;</span><span class="n">dtype</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+
+<span class="w">    </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">param</span><span class="o">-&gt;</span><span class="n">axis</span><span class="p">.</span><span class="n">defined</span><span class="p">())</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">        </span><span class="n">reporter</span><span class="o">-&gt;</span><span class="n">Assign</span><span class="p">(</span><span class="n">types</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span><span class="w"> </span><span class="n">TensorType</span><span class="p">(</span><span class="n">data</span><span class="o">-&gt;</span><span class="n">shape</span><span class="p">,</span><span class="w"> </span><span class="n">dtype</span><span c [...]
+<span class="w">    </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">        </span><span class="k">auto</span><span class="w"> </span><span class="n">prod</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">data</span><span class="o">-&gt;</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">];</span><span class="w"></span>
+<span class="w">        </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="kt">size_t</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">1</span><span class="p">;</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">&lt;</span><span class="w"> </span><span class="n">data</span><span class="o">-&gt;</span><span clas [...]
+<span class="w">            </span><span class="n">prod</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">prod</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="n">data</span><span class="o">-&gt;</span><span class="n">shape</span><span class="p">[</span><span class="n">i</span><span class="p">];</span><span class="w"></span>
+<span class="w">        </span><span class="p">}</span><span class="w"></span>
+<span class="w">        </span><span class="n">reporter</span><span class="o">-&gt;</span><span class="n">Assign</span><span class="p">(</span><span class="n">types</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span><span class="w"> </span><span class="n">TensorType</span><span class="p">({</span><span class="n">prod</span><span class="p">},</span><span class="w"> </span><span class="n">dtype</span><span class="p">));</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+
+<span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="nb">true</span><span class="p">;</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 </div>
@@ -452,23 +452,23 @@ to specify the following information about an operator in Relay:</p>
 <li><p>Other annotations useful when optimizing the operation.</p></li>
 </ul>
 <p>Once again we add this to <code class="docutils literal notranslate"><span class="pre">src/relay/op/tensor/transform.cc</span></code>:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">RELAY_REGISTER_OP</span><span class="p">(</span><span class="s">&quot;cumsum&quot;</span><span class="p">)</span>
-    <span class="p">.</span><span class="n">describe</span><span class="p">(</span>
-        <span class="sa">R</span><span class="s">&quot;</span><span class="dl">doc(</span><span class="s">Return the cumulative sum of the elements along a given axis.</span><span class="dl">)doc</span><span class="s">&quot;</span> <span class="n">TVM_ADD_FILELINE</span><span class="p">)</span>
-    <span class="p">.</span><span class="n">set_num_inputs</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
-    <span class="p">.</span><span class="n">add_argument</span><span class="p">(</span><span class="s">&quot;data&quot;</span><span class="p">,</span> <span class="s">&quot;Tensor&quot;</span><span class="p">,</span> <span class="s">&quot;The input tensor.&quot;</span><span class="p">)</span>
-    <span class="p">.</span><span class="n">set_support_level</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
-    <span class="p">.</span><span class="n">add_type_rel</span><span class="p">(</span><span class="s">&quot;Cumsum&quot;</span><span class="p">,</span> <span class="n">ScanopRel</span><span class="p">)</span>
-    <span class="p">.</span><span class="n">set_attr</span><span class="o">&lt;</span><span class="n">TOpPattern</span><span class="o">&gt;</span><span class="p">(</span><span class="s">&quot;TOpPattern&quot;</span><span class="p">,</span> <span class="n">kOpaque</span><span class="p">);</span>
-
-<span class="n">RELAY_REGISTER_OP</span><span class="p">(</span><span class="s">&quot;cumprod&quot;</span><span class="p">)</span>
-    <span class="p">.</span><span class="n">describe</span><span class="p">(</span>
-        <span class="sa">R</span><span class="s">&quot;</span><span class="dl">doc(</span><span class="s">Return the cumulative product of the elements along a given axis.</span><span class="dl">)doc</span><span class="s">&quot;</span> <span class="n">TVM_ADD_FILELINE</span><span class="p">)</span>
-    <span class="p">.</span><span class="n">set_num_inputs</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
-    <span class="p">.</span><span class="n">add_argument</span><span class="p">(</span><span class="s">&quot;data&quot;</span><span class="p">,</span> <span class="s">&quot;Tensor&quot;</span><span class="p">,</span> <span class="s">&quot;The input tensor.&quot;</span><span class="p">)</span>
-    <span class="p">.</span><span class="n">set_support_level</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
-    <span class="p">.</span><span class="n">add_type_rel</span><span class="p">(</span><span class="s">&quot;Cumprod&quot;</span><span class="p">,</span> <span class="n">ScanopRel</span><span class="p">)</span>
-    <span class="p">.</span><span class="n">set_attr</span><span class="o">&lt;</span><span class="n">TOpPattern</span><span class="o">&gt;</span><span class="p">(</span><span class="s">&quot;TOpPattern&quot;</span><span class="p">,</span> <span class="n">kOpaque</span><span class="p">);</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">RELAY_REGISTER_OP</span><span class="p">(</span><span class="s">&quot;cumsum&quot;</span><span class="p">)</span><span class="w"></span>
+<span class="w">    </span><span class="p">.</span><span class="n">describe</span><span class="p">(</span><span class="w"></span>
+<span class="w">        </span><span class="sa">R</span><span class="s">&quot;</span><span class="dl">doc(</span><span class="s">Return the cumulative sum of the elements along a given axis.</span><span class="dl">)doc</span><span class="s">&quot;</span><span class="w"> </span><span class="n">TVM_ADD_FILELINE</span><span class="p">)</span><span class="w"></span>
+<span class="w">    </span><span class="p">.</span><span class="n">set_num_inputs</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="w"></span>
+<span class="w">    </span><span class="p">.</span><span class="n">add_argument</span><span class="p">(</span><span class="s">&quot;data&quot;</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;Tensor&quot;</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;The input tensor.&quot;</span><span class="p">)</span><span class="w"></span>
+<span class="w">    </span><span class="p">.</span><span class="n">set_support_level</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span><span class="w"></span>
+<span class="w">    </span><span class="p">.</span><span class="n">add_type_rel</span><span class="p">(</span><span class="s">&quot;Cumsum&quot;</span><span class="p">,</span><span class="w"> </span><span class="n">ScanopRel</span><span class="p">)</span><span class="w"></span>
+<span class="w">    </span><span class="p">.</span><span class="n">set_attr</span><span class="o">&lt;</span><span class="n">TOpPattern</span><span class="o">&gt;</span><span class="p">(</span><span class="s">&quot;TOpPattern&quot;</span><span class="p">,</span><span class="w"> </span><span class="n">kOpaque</span><span class="p">);</span><span class="w"></span>
+
+<span class="n">RELAY_REGISTER_OP</span><span class="p">(</span><span class="s">&quot;cumprod&quot;</span><span class="p">)</span><span class="w"></span>
+<span class="w">    </span><span class="p">.</span><span class="n">describe</span><span class="p">(</span><span class="w"></span>
+<span class="w">        </span><span class="sa">R</span><span class="s">&quot;</span><span class="dl">doc(</span><span class="s">Return the cumulative product of the elements along a given axis.</span><span class="dl">)doc</span><span class="s">&quot;</span><span class="w"> </span><span class="n">TVM_ADD_FILELINE</span><span class="p">)</span><span class="w"></span>
+<span class="w">    </span><span class="p">.</span><span class="n">set_num_inputs</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="w"></span>
+<span class="w">    </span><span class="p">.</span><span class="n">add_argument</span><span class="p">(</span><span class="s">&quot;data&quot;</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;Tensor&quot;</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;The input tensor.&quot;</span><span class="p">)</span><span class="w"></span>
+<span class="w">    </span><span class="p">.</span><span class="n">set_support_level</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span><span class="w"></span>
+<span class="w">    </span><span class="p">.</span><span class="n">add_type_rel</span><span class="p">(</span><span class="s">&quot;Cumprod&quot;</span><span class="p">,</span><span class="w"> </span><span class="n">ScanopRel</span><span class="p">)</span><span class="w"></span>
+<span class="w">    </span><span class="p">.</span><span class="n">set_attr</span><span class="o">&lt;</span><span class="n">TOpPattern</span><span class="o">&gt;</span><span class="p">(</span><span class="s">&quot;TOpPattern&quot;</span><span class="p">,</span><span class="w"> </span><span class="n">kOpaque</span><span class="p">);</span><span class="w"></span>
 </pre></div>
 </div>
 <p>In this case the <code class="docutils literal notranslate"><span class="pre">TOpPattern</span></code> is a hint to the compiler on the pattern of computation the operator does, which might be
@@ -530,7 +530,7 @@ add the following strategies:</p>
     <span class="p">)</span>
     <span class="k">return</span> <span class="n">strategy</span>
 
-<span class="nd">@cumsum_strategy.register</span><span class="p">([</span><span class="s2">&quot;cuda&quot;</span><span class="p">,</span> <span class="s2">&quot;gpu&quot;</span><span class="p">])</span>
+<span class="nd">@cumsum_strategy</span><span class="o">.</span><span class="n">register</span><span class="p">([</span><span class="s2">&quot;cuda&quot;</span><span class="p">,</span> <span class="s2">&quot;gpu&quot;</span><span class="p">])</span>
 <span class="k">def</span> <span class="nf">cumsum_strategy_cuda</span><span class="p">(</span><span class="n">attrs</span><span class="p">,</span> <span class="n">inputs</span><span class="p">,</span> <span class="n">out_type</span><span class="p">,</span> <span class="n">target</span><span class="p">):</span>
     <span class="sd">&quot;&quot;&quot;cumsum cuda strategy&quot;&quot;&quot;</span>
     <span class="n">strategy</span> <span class="o">=</span> <span class="n">_op</span><span class="o">.</span><span class="n">OpStrategy</span><span class="p">()</span>
@@ -542,7 +542,7 @@ add the following strategies:</p>
     <span class="k">return</span> <span class="n">strategy</span>
 
 
-<span class="nd">@cumprod_strategy.register</span><span class="p">([</span><span class="s2">&quot;cuda&quot;</span><span class="p">,</span> <span class="s2">&quot;gpu&quot;</span><span class="p">])</span>
+<span class="nd">@cumprod_strategy</span><span class="o">.</span><span class="n">register</span><span class="p">([</span><span class="s2">&quot;cuda&quot;</span><span class="p">,</span> <span class="s2">&quot;gpu&quot;</span><span class="p">])</span>
 <span class="k">def</span> <span class="nf">cumprod_strategy_cuda</span><span class="p">(</span><span class="n">attrs</span><span class="p">,</span> <span class="n">inputs</span><span class="p">,</span> <span class="n">out_type</span><span class="p">,</span> <span class="n">target</span><span class="p">):</span>
     <span class="sd">&quot;&quot;&quot;cumprod cuda strategy&quot;&quot;&quot;</span>
     <span class="n">strategy</span> <span class="o">=</span> <span class="n">_op</span><span class="o">.</span><span class="n">OpStrategy</span><span class="p">()</span>
@@ -557,24 +557,24 @@ add the following strategies:</p>
 <p>Where in each strategy we define the compute we wrote and the schedule to use within <code class="docutils literal notranslate"><span class="pre">add_implementation()</span></code>.
 We finally link the strategy and compute with the defined relay operator in <code class="docutils literal notranslate"><span class="pre">python/tvm/relay/op/_transform.py</span></code>:</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># cumsum</span>
-<span class="nd">@_reg.register_compute</span><span class="p">(</span><span class="s2">&quot;cumsum&quot;</span><span class="p">)</span>
+<span class="nd">@_reg</span><span class="o">.</span><span class="n">register_compute</span><span class="p">(</span><span class="s2">&quot;cumsum&quot;</span><span class="p">)</span>
 <span class="k">def</span> <span class="nf">compute_cumsum</span><span class="p">(</span><span class="n">attrs</span><span class="p">,</span> <span class="n">inputs</span><span class="p">,</span> <span class="n">output_type</span><span class="p">):</span>
     <span class="sd">&quot;&quot;&quot;Compute definition of cumsum&quot;&quot;&quot;</span>
     <span class="k">return</span> <span class="p">[</span><span class="n">topi</span><span class="o">.</span><span class="n">cumsum</span><span class="p">(</span><span class="n">inputs</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">attrs</span><span class="o">.</span><span class="n">axis</span><span class="p">,</span> <span class="n">attrs</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="n">a [...]
 
 
 <span class="n">_reg</span><span class="o">.</span><span class="n">register_strategy</span><span class="p">(</span><span class="s2">&quot;cumsum&quot;</span><span class="p">,</span> <span class="n">strategy</span><span class="o">.</span><span class="n">cumsum_strategy</span><span class="p">)</span>
-<span class="n">_reg</span><span class="o">.</span><span class="n">register_shape_func</span><span class="p">(</span><span class="s2">&quot;cumsum&quot;</span><span class="p">,</span> <span class="bp">False</span><span class="p">,</span> <span class="n">elemwise_shape_func</span><span class="p">)</span>
+<span class="n">_reg</span><span class="o">.</span><span class="n">register_shape_func</span><span class="p">(</span><span class="s2">&quot;cumsum&quot;</span><span class="p">,</span> <span class="kc">False</span><span class="p">,</span> <span class="n">elemwise_shape_func</span><span class="p">)</span>
 
 <span class="c1"># cumprod</span>
-<span class="nd">@_reg.register_compute</span><span class="p">(</span><span class="s2">&quot;cumprod&quot;</span><span class="p">)</span>
+<span class="nd">@_reg</span><span class="o">.</span><span class="n">register_compute</span><span class="p">(</span><span class="s2">&quot;cumprod&quot;</span><span class="p">)</span>
 <span class="k">def</span> <span class="nf">compute_cumprod</span><span class="p">(</span><span class="n">attrs</span><span class="p">,</span> <span class="n">inputs</span><span class="p">,</span> <span class="n">output_type</span><span class="p">):</span>
     <span class="sd">&quot;&quot;&quot;Compute definition of cumprod&quot;&quot;&quot;</span>
     <span class="k">return</span> <span class="p">[</span><span class="n">topi</span><span class="o">.</span><span class="n">cumprod</span><span class="p">(</span><span class="n">inputs</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">attrs</span><span class="o">.</span><span class="n">axis</span><span class="p">,</span> <span class="n">attrs</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="n"> [...]
 
 
 <span class="n">_reg</span><span class="o">.</span><span class="n">register_strategy</span><span class="p">(</span><span class="s2">&quot;cumprod&quot;</span><span class="p">,</span> <span class="n">strategy</span><span class="o">.</span><span class="n">cumprod_strategy</span><span class="p">)</span>
-<span class="n">_reg</span><span class="o">.</span><span class="n">register_shape_func</span><span class="p">(</span><span class="s2">&quot;cumprod&quot;</span><span class="p">,</span> <span class="bp">False</span><span class="p">,</span> <span class="n">elemwise_shape_func</span><span class="p">)</span>
+<span class="n">_reg</span><span class="o">.</span><span class="n">register_shape_func</span><span class="p">(</span><span class="s2">&quot;cumprod&quot;</span><span class="p">,</span> <span class="kc">False</span><span class="p">,</span> <span class="n">elemwise_shape_func</span><span class="p">)</span>
 </pre></div>
 </div>
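 <p>As a quick sanity check (a hypothetical snippet, not part of the files above), the operator can now be exercised end-to-end from Python once the compute, strategy, and type relation are all registered:</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import numpy as np
 import tvm
 from tvm import relay
 from tvm.contrib import graph_executor
 
 # Hypothetical end-to-end check, not part of the patch above.
 x = relay.var("x", shape=(2, 3), dtype="float32")
 mod = tvm.IRModule.from_expr(relay.Function([x], relay.cumsum(x, axis=1)))
 with tvm.transform.PassContext(opt_level=3):
     lib = relay.build(mod, target="llvm")
 
 m = graph_executor.GraphModule(lib["default"](tvm.cpu()))
 m.set_input("x", np.ones((2, 3), dtype="float32"))
 m.run()
 print(m.get_output(0).numpy())  # each row should read [1. 2. 3.]
 </pre></div>
 </div>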
 <p>The shape functions are used for determining output shape given a dynamically shaped tensor. In this
@@ -592,27 +592,27 @@ operator is intended).</p>
 are not supported, so it suffices to use <code class="docutils literal notranslate"><span class="pre">Op::Get</span></code> to fetch
 the operator’s information from the operator registry and pass in
 the arguments to the call node, as below. In <code class="docutils literal notranslate"><span class="pre">src/relay/op/tensor/transform.cc</span></code>:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span> <span class="nf">MakeCumsum</span><span class="p">(</span><span class="n">Expr</span> <span class="n">data</span><span class="p">,</span> <span class="n">Integer</span> <span class="n">axis</span><span class="p">,</span> <span class="n">DataType</span> <span class="n">dtype</span><span class="p">,</span> <span class="n">Bool</span> <span class="n">exclusive</span><span class="p">)< [...]
-    <span class="k">auto</span> <span class="n">attrs</span> <span class="o">=</span> <span class="n">make_object</span><span class="o">&lt;</span><span class="n">ScanopAttrs</span><span class="o">&gt;</span><span class="p">();</span>
-    <span class="n">attrs</span><span class="o">-&gt;</span><span class="n">dtype</span> <span class="o">=</span> <span class="n">dtype</span><span class="p">;</span>
-    <span class="n">attrs</span><span class="o">-&gt;</span><span class="n">axis</span> <span class="o">=</span> <span class="n">axis</span><span class="p">;</span>
-    <span class="n">attrs</span><span class="o">-&gt;</span><span class="n">exclusive</span> <span class="o">=</span> <span class="n">exclusive</span><span class="p">;</span>
-    <span class="k">static</span> <span class="k">const</span> <span class="n">Op</span><span class="o">&amp;</span> <span class="n">op</span> <span class="o">=</span> <span class="n">Op</span><span class="o">::</span><span class="n">Get</span><span class="p">(</span><span class="s">&quot;cumsum&quot;</span><span class="p">);</span>
-    <span class="k">return</span> <span class="n">Call</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="p">{</span><span class="n">data</span><span class="p">},</span> <span class="n">Attrs</span><span class="p">(</span><span class="n">attrs</span><span class="p">),</span> <span class="p">{});</span>
-<span class="p">}</span>
-
-<span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;relay.op._make.cumsum&quot;</span><span class="p">).</span><span class="n">set_body_typed</span><span class="p">(</span><span class="n">MakeCumsum</span><span class="p">);</span>
-
-<span class="n">Expr</span> <span class="nf">MakeCumprod</span><span class="p">(</span><span class="n">Expr</span> <span class="n">data</span><span class="p">,</span> <span class="n">Integer</span> <span class="n">axis</span><span class="p">,</span> <span class="n">DataType</span> <span class="n">dtype</span><span class="p">,</span> <span class="n">Bool</span> <span class="n">exclusive</span><span class="p">)</span> <span class="p">{</span>
-    <span class="k">auto</span> <span class="n">attrs</span> <span class="o">=</span> <span class="n">make_object</span><span class="o">&lt;</span><span class="n">ScanopAttrs</span><span class="o">&gt;</span><span class="p">();</span>
-    <span class="n">attrs</span><span class="o">-&gt;</span><span class="n">dtype</span> <span class="o">=</span> <span class="n">dtype</span><span class="p">;</span>
-    <span class="n">attrs</span><span class="o">-&gt;</span><span class="n">axis</span> <span class="o">=</span> <span class="n">axis</span><span class="p">;</span>
-    <span class="n">attrs</span><span class="o">-&gt;</span><span class="n">exclusive</span> <span class="o">=</span> <span class="n">exclusive</span><span class="p">;</span>
-    <span class="k">static</span> <span class="k">const</span> <span class="n">Op</span><span class="o">&amp;</span> <span class="n">op</span> <span class="o">=</span> <span class="n">Op</span><span class="o">::</span><span class="n">Get</span><span class="p">(</span><span class="s">&quot;cumprod&quot;</span><span class="p">);</span>
-    <span class="k">return</span> <span class="n">Call</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="p">{</span><span class="n">data</span><span class="p">},</span> <span class="n">Attrs</span><span class="p">(</span><span class="n">attrs</span><span class="p">),</span> <span class="p">{});</span>
-<span class="p">}</span>
-
-<span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;relay.op._make.cumsum&quot;</span><span class="p">).</span><span class="n">set_body_typed</span><span class="p">(</span><span class="n">MakeCumprod</span><span class="p">);</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span><span class="w"> </span><span class="nf">MakeCumsum</span><span class="p">(</span><span class="n">Expr</span><span class="w"> </span><span class="n">data</span><span class="p">,</span><span class="w"> </span><span class="n">Integer</span><span class="w"> </span><span class="n">axis</span><span class="p">,</span><span class="w"> </span><span class="n">DataType</span><span class="w">  [...]
+<span class="w">    </span><span class="k">auto</span><span class="w"> </span><span class="n">attrs</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">make_object</span><span class="o">&lt;</span><span class="n">ScanopAttrs</span><span class="o">&gt;</span><span class="p">();</span><span class="w"></span>
+<span class="w">    </span><span class="n">attrs</span><span class="o">-&gt;</span><span class="n">dtype</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">dtype</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="n">attrs</span><span class="o">-&gt;</span><span class="n">axis</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">axis</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="n">attrs</span><span class="o">-&gt;</span><span class="n">exclusive</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">exclusive</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="k">static</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">Op</span><span class="o">&amp;</span><span class="w"> </span><span class="n">op</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Op</span><span class="o">::</span><span class="n">Get</span><span class="p">(</span><span class="s">&quot;cumsum&quot;</span><span class="p">);</span><span class="w"></span>
+<span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="n">Call</span><span class="p">(</span><span class="n">op</span><span class="p">,</span><span class="w"> </span><span class="p">{</span><span class="n">data</span><span class="p">},</span><span class="w"> </span><span class="n">Attrs</span><span class="p">(</span><span class="n">attrs</span><span class="p">),</span><span class="w"> </span><span class="p">{});</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
+
+<span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;relay.op._make.cumsum&quot;</span><span class="p">).</span><span class="n">set_body_typed</span><span class="p">(</span><span class="n">MakeCumsum</span><span class="p">);</span><span class="w"></span>
+
+<span class="n">Expr</span><span class="w"> </span><span class="nf">MakeCumprod</span><span class="p">(</span><span class="n">Expr</span><span class="w"> </span><span class="n">data</span><span class="p">,</span><span class="w"> </span><span class="n">Integer</span><span class="w"> </span><span class="n">axis</span><span class="p">,</span><span class="w"> </span><span class="n">DataType</span><span class="w"> </span><span class="n">dtype</span><span class="p">,</span><span class="w"> </s [...]
+<span class="w">    </span><span class="k">auto</span><span class="w"> </span><span class="n">attrs</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">make_object</span><span class="o">&lt;</span><span class="n">ScanopAttrs</span><span class="o">&gt;</span><span class="p">();</span><span class="w"></span>
+<span class="w">    </span><span class="n">attrs</span><span class="o">-&gt;</span><span class="n">dtype</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">dtype</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="n">attrs</span><span class="o">-&gt;</span><span class="n">axis</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">axis</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="n">attrs</span><span class="o">-&gt;</span><span class="n">exclusive</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">exclusive</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="k">static</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">Op</span><span class="o">&amp;</span><span class="w"> </span><span class="n">op</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Op</span><span class="o">::</span><span class="n">Get</span><span class="p">(</span><span class="s">&quot;cumprod&quot;</span><span class="p">);</span><span class="w"></span>
+<span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="n">Call</span><span class="p">(</span><span class="n">op</span><span class="p">,</span><span class="w"> </span><span class="p">{</span><span class="n">data</span><span class="p">},</span><span class="w"> </span><span class="n">Attrs</span><span class="p">(</span><span class="n">attrs</span><span class="p">),</span><span class="w"> </span><span class="p">{});</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
+
+<span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;relay.op._make.cumsum&quot;</span><span class="p">).</span><span class="n">set_body_typed</span><span class="p">(</span><span class="n">MakeCumprod</span><span class="p">);</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Where <code class="docutils literal notranslate"><span class="pre">TVM_REGISTER_GLOBAL</span></code> exposes the <code class="docutils literal notranslate"><span class="pre">MakeCumsum</span></code> and <code class="docutils literal notranslate"><span class="pre">MakeCumprod</span></code> functions
@@ -624,10 +624,10 @@ in Python via <code class="docutils literal notranslate"><span class="pre">relay
 through <code class="docutils literal notranslate"><span class="pre">TVM_REGISTER_GLOBAL</span></code> should be wrapped in a separate
 Python function rather than called directly in Python. For our
 operators we expose this cleaner interface in <code class="docutils literal notranslate"><span class="pre">python/tvm/relay/op/transform.py</span></code>.</p>
-<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">cumsum</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">exclusive</span><span class="o">=</span><span class [...]
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">cumsum</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">exclusive</span><span class="o">=</span><span class [...]
     <span class="k">return</span> <span class="n">_make</span><span class="o">.</span><span class="n">cumsum</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">axis</span><span class="p">,</span> <span class="n">dtype</span><span class="p">,</span> <span class="n">exclusive</span><span class="p">)</span>
 
-<span class="k">def</span> <span class="nf">cumprod</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">exclusive</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
+<span class="k">def</span> <span class="nf">cumprod</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">exclusive</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
     <span class="k">return</span> <span class="n">_make</span><span class="o">.</span><span class="n">cumprod</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">axis</span><span class="p">,</span> <span class="n">dtype</span><span class="p">,</span> <span class="n">exclusive</span><span class="p">)</span>
 </pre></div>
 </div>
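 <p>These wrappers go through the FFI under the hood. As a hypothetical check, packed functions registered with <code class="docutils literal notranslate"><span class="pre">TVM_REGISTER_GLOBAL</span></code> are discoverable from Python by the exact names used at registration time:</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import tvm
 
 # Hypothetical check; assumes the C++ registration above is compiled
 # into the running TVM build.
 make_cumsum = tvm.get_global_func("relay.op._make.cumsum")
 print(make_cumsum)
 </pre></div>
 </div>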
@@ -726,11 +726,11 @@ interface for registering is slightly different.</p>
 <p>First, make sure <code class="docutils literal notranslate"><span class="pre">src/relay/transforms/pattern_utils.h</span></code> is included. It provides
 helper functions for creating nodes in the Relay AST. Then, define the
 gradient in a similar fashion as in the Python example:</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">tvm</span><span class="o">::</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Expr</span><span class="o">&gt;</span> <span class="n">MultiplyGrad</span><span class="p">(</span><span class="k">const</span> <span class="n">Expr</span><span class="o">&amp;</span> <span class="n">orig_call</span><span class="p">,</span> <span class="k">const</span> <span class="n">Expr</s [...]
-    <span class="k">const</span> <span class="n">Call</span><span class="o">&amp;</span> <span class="n">call</span> <span class="o">=</span> <span class="n">orig_call</span><span class="p">.</span><span class="n">Downcast</span><span class="o">&lt;</span><span class="n">Call</span><span class="o">&gt;</span><span class="p">();</span>
-    <span class="k">return</span> <span class="p">{</span> <span class="n">CollapseSumLike</span><span class="p">(</span><span class="n">Multiply</span><span class="p">(</span><span class="n">output_grad</span><span class="p">,</span> <span class="n">call</span><span class="p">.</span><span class="n">args</span><span class="p">[</span><span class="mi">1</span><span class="p">]),</span> <span class="n">call</span><span class="p">.</span><span class="n">args</span><span class="p">[</span>< [...]
-             <span class="n">CollapseSumLike</span><span class="p">(</span><span class="n">Multiply</span><span class="p">(</span><span class="n">output_grad</span><span class="p">,</span> <span class="n">call</span><span class="p">.</span><span class="n">args</span><span class="p">[</span><span class="mi">0</span><span class="p">]),</span> <span class="n">call</span><span class="p">.</span><span class="n">args</span><span class="p">[</span><span class="mi">1</span><span class="p">])</sp [...]
-<span class="p">}</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">tvm</span><span class="o">::</span><span class="n">Array</span><span class="o">&lt;</span><span class="n">Expr</span><span class="o">&gt;</span><span class="w"> </span><span class="n">MultiplyGrad</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">Expr</span><span class="o">&amp;</span><span class="w"> </span><span class="n">orig_call</span><span cl [...]
+<span class="w">    </span><span class="k">const</span><span class="w"> </span><span class="n">Call</span><span class="o">&amp;</span><span class="w"> </span><span class="n">call</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">orig_call</span><span class="p">.</span><span class="n">Downcast</span><span class="o">&lt;</span><span class="n">Call</span><span class="o">&gt;</span><span class="p">();</span><span class="w"></span>
+<span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">CollapseSumLike</span><span class="p">(</span><span class="n">Multiply</span><span class="p">(</span><span class="n">output_grad</span><span class="p">,</span><span class="w"> </span><span class="n">call</span><span class="p">.</span><span class="n">args</span><span class="p">[</span><span class="mi">1</span><span class="p">]),</span><span class= [...]
+<span class="w">             </span><span class="n">CollapseSumLike</span><span class="p">(</span><span class="n">Multiply</span><span class="p">(</span><span class="n">output_grad</span><span class="p">,</span><span class="w"> </span><span class="n">call</span><span class="p">.</span><span class="n">args</span><span class="p">[</span><span class="mi">0</span><span class="p">]),</span><span class="w"> </span><span class="n">call</span><span class="p">.</span><span class="n">args</span><s [...]
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Notice that in C++ we can’t use the same operator overloading that we have in
@@ -740,11 +740,11 @@ Python.</p>
 <p>Now, instead of using a Python decorator, we need to tack a <code class="docutils literal notranslate"><span class="pre">set_attr</span></code> call
 for “FPrimalGradient” onto the end of the base operator’s registration, in
 order to register the gradient.</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">RELAY_REGISTER_OP</span><span class="p">(</span><span class="s">&quot;multiply&quot;</span><span class="p">)</span>
-    <span class="c1">// ...</span>
-    <span class="c1">// Set other attributes</span>
-    <span class="c1">// ...</span>
-    <span class="p">.</span><span class="n">set_attr</span><span class="o">&lt;</span><span class="n">FPrimalGradient</span><span class="o">&gt;</span><span class="p">(</span><span class="s">&quot;FPrimalGradient&quot;</span><span class="p">,</span> <span class="n">MultiplyGrad</span><span class="p">);</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">RELAY_REGISTER_OP</span><span class="p">(</span><span class="s">&quot;multiply&quot;</span><span class="p">)</span><span class="w"></span>
+<span class="w">    </span><span class="c1">// ...</span>
+<span class="w">    </span><span class="c1">// Set other attributes</span>
+<span class="w">    </span><span class="c1">// ...</span>
+<span class="w">    </span><span class="p">.</span><span class="n">set_attr</span><span class="o">&lt;</span><span class="n">FPrimalGradient</span><span class="o">&gt;</span><span class="p">(</span><span class="s">&quot;FPrimalGradient&quot;</span><span class="p">,</span><span class="w"> </span><span class="n">MultiplyGrad</span><span class="p">);</span><span class="w"></span>
 </pre></div>
 </div>
 </div>
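 <p>For comparison, a sketch of the Python-decorator route mentioned above, assuming the <code class="docutils literal notranslate"><span class="pre">register_gradient</span></code> decorator from <code class="docutils literal notranslate"><span class="pre">tvm.relay.op</span></code>; stock TVM already ships a gradient for <code class="docutils literal notranslate"><span class="pre">multiply</span></code>, so this is illustrative only:</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span>from tvm import relay
 from tvm.relay.op import register_gradient
 
 # Illustrative sketch only: stock TVM already registers multiply's gradient.
 @register_gradient("multiply")
 def multiply_grad(orig, grad):
     """d(x*y)/dx = y and d(x*y)/dy = x, collapsed to each input's shape."""
     x, y = orig.args
     return [relay.collapse_sum_like(relay.multiply(grad, y), x),
             relay.collapse_sum_like(relay.multiply(grad, x), y)]
 </pre></div>
 </div>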
diff --git a/docs/dev/how_to/relay_add_pass.html b/docs/dev/how_to/relay_add_pass.html
index e2fb61558..07837e063 100644
--- a/docs/dev/how_to/relay_add_pass.html
+++ b/docs/dev/how_to/relay_add_pass.html
@@ -354,10 +354,10 @@ own vtable, which <code class="docutils literal notranslate"><span class="pre">V
 more control over dispatch. For example, if we wanted to define a
 <code class="docutils literal notranslate"><span class="pre">PrintVisitor</span></code> traverser that printed “Here” before every visit, we
 could override <code class="docutils literal notranslate"><span class="pre">VisitExpr</span></code>:</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span> <span class="n">PrintVisitor</span><span class="o">::</span><span class="n">VisitExpr</span><span class="p">(</span><span class="k">const</span> <span class="n">Expr</span><span class="o">&amp;</span> <span class="n">expr</span><span class="p">)</span> <span class="p">{</span>
-  <span class="n">std</span><span class="o">::</span><span class="n">cout</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;Here&quot;</span> <span class="o">&lt;&lt;</span> <span class="n">std</span><span class="o">::</span><span class="n">endl</span><span class="p">;</span>
-  <span class="n">ExprFunctor</span><span class="o">::</span><span class="n">VisitExpr</span><span class="p">(</span><span class="n">expr</span><span class="p">);</span>
-<span class="p">}</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span><span class="w"> </span><span class="nf">PrintVisitor::VisitExpr</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">Expr</span><span class="o">&amp;</span><span class="w"> </span><span class="n">expr</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">cout</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;Here&quot;</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">endl</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">ExprFunctor</span><span class="o">::</span><span class="n">VisitExpr</span><span class="p">(</span><span class="n">expr</span><span class="p">);</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
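 <p>The same hook can be sketched on the Python side, assuming the <code class="docutils literal notranslate"><span class="pre">ExprVisitor</span></code> helper from <code class="docutils literal notranslate"><span class="pre">tvm.relay.expr_functor</span></code>:</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span>from tvm.relay.expr_functor import ExprVisitor
 
 class PrintVisitor(ExprVisitor):
     # Hypothetical sketch: print "Here" before dispatching each visit,
     # mirroring the C++ override above.
     def visit(self, expr):
         print("Here")
         super().visit(expr)
 </pre></div>
 </div>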
 <p><code class="docutils literal notranslate"><span class="pre">ExprFunctor</span></code> itself is a very general class, which is why more often than
@@ -375,11 +375,11 @@ perform program analyses and collect information. With this class,
 implementations provided by this class simply visit all of the expression’s
 fields that are expressions. The default implementation for <code class="docutils literal notranslate"><span class="pre">IfNode</span></code> is
 shown below.</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span> <span class="n">ExprVisitor</span><span class="o">::</span><span class="n">VisitExpr_</span><span class="p">(</span><span class="k">const</span> <span class="n">IfNode</span><span class="o">*</span> <span class="n">op</span><span class="p">)</span> <span class="p">{</span>
-  <span class="n">this</span><span class="o">-&gt;</span><span class="n">VisitExpr</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">cond</span><span class="p">);</span>
-  <span class="n">this</span><span class="o">-&gt;</span><span class="n">VisitExpr</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">true_branch</span><span class="p">);</span>
-  <span class="n">this</span><span class="o">-&gt;</span><span class="n">VisitExpr</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">false_branch</span><span class="p">);</span>
-<span class="p">}</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span><span class="w"> </span><span class="nf">ExprVisitor::VisitExpr_</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">IfNode</span><span class="o">*</span><span class="w"> </span><span class="n">op</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">this</span><span class="o">-&gt;</span><span class="n">VisitExpr</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">cond</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="n">this</span><span class="o">-&gt;</span><span class="n">VisitExpr</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">true_branch</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="n">this</span><span class="o">-&gt;</span><span class="n">VisitExpr</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">false_branch</span><span class="p">);</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Note that we’re calling <code class="docutils literal notranslate"><span class="pre">VisitExpr</span></code> and not <code class="docutils literal notranslate"><span class="pre">VisitExpr_</span></code> here, so we can
@@ -387,20 +387,20 @@ use the vtable in <code class="docutils literal notranslate"><span class="pre">E
 <p>Now, if we wanted to write a class <code class="docutils literal notranslate"><span class="pre">CallChecker</span></code> that checks if any
 function calls appear in the program, we would only need to extend
 <code class="docutils literal notranslate"><span class="pre">ExprVisitor</span></code> and define the following <code class="docutils literal notranslate"><span class="pre">VisitExpr_</span></code> method:</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span> <span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span> <span class="n">CallNode</span><span class="o">*</span> <span class="n">n</span><span class="p">)</span> <span class="n">final</span> <span class="p">{</span>
-  <span class="n">result_</span> <span class="o">=</span> <span class="nb">true</span><span class="p">;</span>
-<span class="p">}</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span><span class="w"> </span><span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">CallNode</span><span class="o">*</span><span class="w"> </span><span class="n">n</span><span class="p">)</span><span class="w"> </span><span class="n">final</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">result_</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nb">true</span><span class="p">;</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>where <code class="docutils literal notranslate"><span class="pre">result_</span></code> is a field. In this case, we don’t need to further recurse
 on the fields of the <code class="docutils literal notranslate"><span class="pre">CallNode</span></code>, because <code class="docutils literal notranslate"><span class="pre">result_</span></code> is already true and we
 now know the original expression contains a call. To make this visitor
 usable, we would provide the following public method:</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="kt">bool</span> <span class="nf">Check</span><span class="p">(</span><span class="k">const</span> <span class="n">Expr</span><span class="o">&amp;</span> <span class="n">expr</span><span class="p">)</span> <span class="n">final</span> <span class="p">{</span>
-  <span class="n">result_</span> <span class="o">=</span> <span class="nb">false</span><span class="p">;</span>
-  <span class="n">VisitExpr</span><span class="p">(</span><span class="n">expr</span><span class="p">);</span>
-  <span class="k">return</span> <span class="n">result_</span><span class="p">;</span>
-<span class="p">}</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="kt">bool</span><span class="w"> </span><span class="nf">Check</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">Expr</span><span class="o">&amp;</span><span class="w"> </span><span class="n">expr</span><span class="p">)</span><span class="w"> </span><span class="n">final</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">result_</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nb">false</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">VisitExpr</span><span class="p">(</span><span class="n">expr</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="k">return</span><span class="w"> </span><span class="n">result_</span><span class="p">;</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
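 <p>Putting the two pieces together, a hypothetical Python analogue of the whole checker would read:</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span>from tvm.relay.expr_functor import ExprVisitor
 
 class CallChecker(ExprVisitor):
     """Hypothetical Python analogue of the C++ CallChecker above."""
 
     def __init__(self):
         super().__init__()
         self.result = False
 
     def visit_call(self, call):
         # Once a call is seen there is no need to recurse any deeper.
         self.result = True
 
     def check(self, expr):
         self.result = False
         self.visit(expr)
         return self.result
 </pre></div>
 </div>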
 <p>And that’s all we need. It is very common to define a public interface that
@@ -417,14 +417,14 @@ default <code class="docutils literal notranslate"><span class="pre">VisitExpr_<
 the expression’s fields that are expressions and set the fields to be the
 result of visiting them. The default implementation for <code class="docutils literal notranslate"><span class="pre">TupleGetItemNode</span></code>
 is shown below.</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span> <span class="n">ExprMutator</span><span class="o">::</span><span class="n">VisitExpr_</span><span class="p">(</span><span class="k">const</span> <span class="n">TupleGetItemNode</span><span class="o">*</span> <span class="n">g</span><span class="p">)</span> <span class="p">{</span>
-  <span class="k">auto</span> <span class="n">t</span> <span class="o">=</span> <span class="n">this</span><span class="o">-&gt;</span><span class="n">Mutate</span><span class="p">(</span><span class="n">g</span><span class="o">-&gt;</span><span class="n">tuple</span><span class="p">);</span>
-  <span class="k">if</span> <span class="p">(</span><span class="n">g</span><span class="o">-&gt;</span><span class="n">tuple</span> <span class="o">==</span> <span class="n">t</span><span class="p">)</span> <span class="p">{</span>
-    <span class="k">return</span> <span class="n">GetRef</span><span class="o">&lt;</span><span class="n">Expr</span><span class="o">&gt;</span><span class="p">(</span><span class="n">g</span><span class="p">);</span>
-  <span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
-    <span class="k">return</span> <span class="n">TupleGetItem</span><span class="p">(</span><span class="n">t</span><span class="p">,</span> <span class="n">g</span><span class="o">-&gt;</span><span class="n">index</span><span class="p">);</span>
-  <span class="p">}</span>
-<span class="p">}</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span><span class="w"> </span><span class="nf">ExprMutator::VisitExpr_</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">TupleGetItemNode</span><span class="o">*</span><span class="w"> </span><span class="n">g</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="k">auto</span><span class="w"> </span><span class="n">t</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">this</span><span class="o">-&gt;</span><span class="n">Mutate</span><span class="p">(</span><span class="n">g</span><span class="o">-&gt;</span><span class="n">tuple</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">g</span><span class="o">-&gt;</span><span class="n">tuple</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="n">t</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="n">GetRef</span><span class="o">&lt;</span><span class="n">Expr</span><span class="o">&gt;</span><span class="p">(</span><span class="n">g</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="n">TupleGetItem</span><span class="p">(</span><span class="n">t</span><span class="p">,</span><span class="w"> </span><span class="n">g</span><span class="o">-&gt;</span><span class="n">index</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>There are a few things to notice here. First, <code class="docutils literal notranslate"><span class="pre">Mutate</span></code> is an alias for
@@ -440,17 +440,17 @@ anything. Usually, when we want to cache results in a subclass of
 <p>Now, if we wanted to write a class <code class="docutils literal notranslate"><span class="pre">IfCollapser</span></code> that replaces every if
 statement with its true branch, we would override <code class="docutils literal notranslate"><span class="pre">VisitExpr_</span></code> for
 <code class="docutils literal notranslate"><span class="pre">IfNode</span></code>:</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span> <span class="n">ExprMutator</span><span class="o">::</span><span class="n">VisitExpr_</span><span class="p">(</span><span class="k">const</span> <span class="n">IfNode</span><span class="o">*</span> <span class="n">op</span><span class="p">)</span> <span class="p">{</span>
-  <span class="k">return</span> <span class="n">this</span><span class="o">-&gt;</span><span class="n">Mutate</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">true_branch</span><span class="p">);</span>
-<span class="p">}</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span><span class="w"> </span><span class="nf">ExprMutator::VisitExpr_</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">IfNode</span><span class="o">*</span><span class="w"> </span><span class="n">op</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="k">return</span><span class="w"> </span><span class="n">this</span><span class="o">-&gt;</span><span class="n">Mutate</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">true_branch</span><span class="p">);</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Note that the returned expression will not necessarily be an <code class="docutils literal notranslate"><span class="pre">IfNode</span></code>, and
 this is fine, because the return type is <code class="docutils literal notranslate"><span class="pre">Expr</span></code>. Now, we create the public
 interface:</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span> <span class="nf">CollapseIfs</span><span class="p">(</span><span class="k">const</span> <span class="n">Expr</span><span class="o">&amp;</span> <span class="n">expr</span><span class="p">)</span> <span class="n">final</span> <span class="p">{</span>
-  <span class="k">return</span> <span class="n">this</span><span class="o">-&gt;</span><span class="n">Mutate</span><span class="p">(</span><span class="n">expr</span><span class="p">);</span>
-<span class="p">}</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span><span class="w"> </span><span class="nf">CollapseIfs</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">Expr</span><span class="o">&amp;</span><span class="w"> </span><span class="n">expr</span><span class="p">)</span><span class="w"> </span><span class="n">final</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="k">return</span><span class="w"> </span><span class="n">this</span><span class="o">-&gt;</span><span class="n">Mutate</span><span class="p">(</span><span class="n">expr</span><span class="p">);</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
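 <p>The equivalent Python sketch, again hypothetical and using the <code class="docutils literal notranslate"><span class="pre">ExprMutator</span></code> helper, is compact:</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span>from tvm.relay.expr_functor import ExprMutator
 
 class IfCollapser(ExprMutator):
     # Hypothetical analogue of the C++ IfCollapser.
     def visit_if(self, op):
         # Replace the whole if-expression with its (mutated) true branch.
         return self.visit(op.true_branch)
 
 def collapse_ifs(expr):
     return IfCollapser().visit(expr)
 </pre></div>
 </div>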
 <p>With this mutator, we didn’t need to do any bookkeeping, but we still want to
@@ -476,31 +476,31 @@ define an expression to be constant if it is a <code class="docutils literal not
 <p>We use a <code class="docutils literal notranslate"><span class="pre">memo_</span></code> field to map from nodes to whether they are constant and
 to cache these results. Below are the <code class="docutils literal notranslate"><span class="pre">VisitExpr_</span></code> definitions in the
 <code class="docutils literal notranslate"><span class="pre">ConstantChecker</span></code>.</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span> <span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span> <span class="n">ConstantNode</span><span class="o">*</span> <span class="n">n</span><span class="p">)</span> <span class="n">final</span> <span class="p">{</span>
-  <span class="n">memo_</span><span class="p">[</span><span class="n">GetRef</span><span class="o">&lt;</span><span class="n">Constant</span><span class="o">&gt;</span><span class="p">(</span><span class="n">n</span><span class="p">)]</span> <span class="o">=</span> <span class="nb">true</span><span class="p">;</span>
-<span class="p">}</span>
-
-<span class="kt">void</span> <span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span> <span class="n">TupleNode</span><span class="o">*</span> <span class="n">n</span><span class="p">)</span> <span class="n">final</span> <span class="p">{</span>
-  <span class="kt">bool</span> <span class="n">result</span> <span class="o">=</span> <span class="nb">true</span><span class="p">;</span>
-  <span class="k">for</span> <span class="p">(</span><span class="k">const</span> <span class="k">auto</span><span class="o">&amp;</span> <span class="nl">field</span> <span class="p">:</span> <span class="n">n</span><span class="o">-&gt;</span><span class="n">fields</span><span class="p">)</span> <span class="p">{</span>
-    <span class="k">if</span> <span class="p">(</span><span class="o">!</span><span class="n">Check</span><span class="p">(</span><span class="n">field</span><span class="p">))</span> <span class="p">{</span>
-      <span class="n">result</span> <span class="o">=</span> <span class="nb">false</span><span class="p">;</span>
-      <span class="k">break</span><span class="p">;</span>
-    <span class="p">}</span>
-  <span class="p">}</span>
-  <span class="n">memo_</span><span class="p">[</span><span class="n">GetRef</span><span class="o">&lt;</span><span class="n">Tuple</span><span class="o">&gt;</span><span class="p">(</span><span class="n">n</span><span class="p">)]</span> <span class="o">=</span> <span class="n">result</span><span class="p">;</span>
-<span class="p">}</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span><span class="w"> </span><span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">ConstantNode</span><span class="o">*</span><span class="w"> </span><span class="n">n</span><span class="p">)</span><span class="w"> </span><span class="n">final</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">memo_</span><span class="p">[</span><span class="n">GetRef</span><span class="o">&lt;</span><span class="n">Constant</span><span class="o">&gt;</span><span class="p">(</span><span class="n">n</span><span class="p">)]</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nb">true</span><span class="p">;</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
+
+<span class="kt">void</span><span class="w"> </span><span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">TupleNode</span><span class="o">*</span><span class="w"> </span><span class="n">n</span><span class="p">)</span><span class="w"> </span><span class="n">final</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="kt">bool</span><span class="w"> </span><span class="n">result</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nb">true</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="k">auto</span><span class="o">&amp;</span><span class="w"> </span><span class="n">field</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="n">n</span><span class="o">-&gt;</span><span class="n">fields</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span cla [...]
+<span class="w">    </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="o">!</span><span class="n">Check</span><span class="p">(</span><span class="n">field</span><span class="p">))</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">      </span><span class="n">result</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nb">false</span><span class="p">;</span><span class="w"></span>
+<span class="w">      </span><span class="k">break</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+<span class="w">  </span><span class="n">memo_</span><span class="p">[</span><span class="n">GetRef</span><span class="o">&lt;</span><span class="n">Tuple</span><span class="o">&gt;</span><span class="p">(</span><span class="n">n</span><span class="p">)]</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">result</span><span class="p">;</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>The bookkeeping used to coordinate these definitions is a <code class="docutils literal notranslate"><span class="pre">Check</span></code> method
 that returns whether the given expression is considered constant.</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="kt">bool</span> <span class="nf">Check</span><span class="p">(</span><span class="k">const</span> <span class="n">Expr</span><span class="o">&amp;</span> <span class="n">expr</span><span class="p">)</span> <span class="p">{</span>
-  <span class="k">const</span> <span class="k">auto</span> <span class="n">it</span> <span class="o">=</span> <span class="n">memo_</span><span class="p">.</span><span class="n">find</span><span class="p">(</span><span class="n">expr</span><span class="p">);</span>
-  <span class="k">if</span> <span class="p">(</span><span class="n">it</span> <span class="o">!=</span> <span class="n">memo_</span><span class="p">.</span><span class="n">end</span><span class="p">())</span>
-    <span class="k">return</span> <span class="n">it</span><span class="o">-&gt;</span><span class="n">second</span><span class="p">;</span>
-  <span class="n">VisitExpr</span><span class="p">(</span><span class="n">expr</span><span class="p">);</span>
-  <span class="k">return</span> <span class="n">memo_</span><span class="p">[</span><span class="n">expr</span><span class="p">];</span>
-<span class="p">}</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="kt">bool</span><span class="w"> </span><span class="nf">Check</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">Expr</span><span class="o">&amp;</span><span class="w"> </span><span class="n">expr</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="k">const</span><span class="w"> </span><span class="k">auto</span><span class="w"> </span><span class="n">it</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">memo_</span><span class="p">.</span><span class="n">find</span><span class="p">(</span><span class="n">expr</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">it</span><span class="w"> </span><span class="o">!=</span><span class="w"> </span><span class="n">memo_</span><span class="p">.</span><span class="n">end</span><span class="p">())</span><span class="w"></span>
+<span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="n">it</span><span class="o">-&gt;</span><span class="n">second</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">VisitExpr</span><span class="p">(</span><span class="n">expr</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="k">return</span><span class="w"> </span><span class="n">memo_</span><span class="p">[</span><span class="n">expr</span><span class="p">];</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
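 <p>Putting the two fragments together, the whole checker can be summarized in one
 class. This is a hedged sketch rather than the exact source: the declaration of
 <code class="docutils literal notranslate"><span class="pre">memo_</span></code> is not shown above, so we assume it is keyed with TVM&#39;s
 <code class="docutils literal notranslate"><span class="pre">ObjectPtrHash</span></code> and <code class="docutils literal notranslate"><span class="pre">ObjectPtrEqual</span></code> functors.</p>
 <div class="highlight-c notranslate"><div class="highlight"><pre>class ConstantChecker : private ExprVisitor {
  public:
   // Returns whether expr is considered constant, caching the answer in memo_.
   bool Check(const Expr&amp; expr) {
     const auto it = memo_.find(expr);
     if (it != memo_.end()) return it-&gt;second;
     VisitExpr(expr);
     return memo_[expr];  // default-inserts false for unhandled node kinds
   }
 
  private:
   void VisitExpr_(const ConstantNode* n) final {
     memo_[GetRef&lt;Constant&gt;(n)] = true;
   }
 
   void VisitExpr_(const TupleNode* n) final {
     bool result = true;
     for (const auto&amp; field : n-&gt;fields) {
       if (!Check(field)) { result = false; break; }
     }
     memo_[GetRef&lt;Tuple&gt;(n)] = result;
   }
 
   // Assumed declaration: maps each visited node to its constant-ness.
   std::unordered_map&lt;Expr, bool, ObjectPtrHash, ObjectPtrEqual&gt; memo_;
 };
 </pre></div>
 </div>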
 <p>We don’t modify <code class="docutils literal notranslate"><span class="pre">memo_</span></code> for every node we encounter; instead we only modify
@@ -515,23 +515,23 @@ uses <code class="docutils literal notranslate"><span class="pre">ConstantChecke
 involved in constant folding: <code class="docutils literal notranslate"><span class="pre">LetNode</span></code>, <code class="docutils literal notranslate"><span class="pre">TupleGetItemNode</span></code>, and
 <code class="docutils literal notranslate"><span class="pre">CallNode</span></code>. In the following paragraphs, we explain the roles of each in
 the pass.</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span> <span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span> <span class="n">LetNode</span><span class="o">*</span> <span class="n">op</span><span class="p">)</span> <span class="n">final</span> <span class="p">{</span>
-  <span class="n">Expr</span> <span class="n">value</span> <span class="o">=</span> <span class="n">this</span><span class="o">-&gt;</span><span class="n">Mutate</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">value</span><span class="p">);</span>
-  <span class="k">if</span> <span class="p">(</span><span class="n">value</span><span class="p">.</span><span class="n">as</span><span class="o">&lt;</span><span class="n">ConstantNode</span><span class="o">&gt;</span><span class="p">())</span> <span class="p">{</span>
-    <span class="n">memo_</span><span class="p">[</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">var</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span><span class="p">;</span>
-    <span class="k">return</span> <span class="n">this</span><span class="o">-&gt;</span><span class="n">Mutate</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">body</span><span class="p">);</span>
-  <span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
-    <span class="n">Var</span> <span class="n">var</span> <span class="o">=</span> <span class="n">Downcast</span><span class="o">&lt;</span><span class="n">Var</span><span class="o">&gt;</span><span class="p">(</span><span class="n">this</span><span class="o">-&gt;</span><span class="n">Mutate</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">var</span><span class="p">));</span>
-    <span class="n">Expr</span> <span class="n">body</span> <span class="o">=</span> <span class="n">this</span><span class="o">-&gt;</span><span class="n">Mutate</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">body</span><span class="p">);</span>
-    <span class="k">if</span> <span class="p">(</span><span class="n">var</span><span class="p">.</span><span class="n">same_as</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">var</span><span class="p">)</span> <span class="o">&amp;&amp;</span>
-        <span class="n">value</span><span class="p">.</span><span class="n">same_as</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">value</span><span class="p">)</span> <span class="o">&amp;&amp;</span>
-        <span class="n">body</span><span class="p">.</span><span class="n">same_as</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">body</span><span class="p">))</span> <span class="p">{</span>
-      <span class="k">return</span> <span class="n">GetRef</span><span class="o">&lt;</span><span class="n">Expr</span><span class="o">&gt;</span><span class="p">(</span><span class="n">op</span><span class="p">);</span>
-    <span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
-      <span class="k">return</span> <span class="n">Let</span><span class="p">(</span><span class="n">var</span><span class="p">,</span> <span class="n">value</span><span class="p">,</span> <span class="n">body</span><span class="p">);</span>
-    <span class="p">}</span>
-  <span class="p">}</span>
-<span class="p">}</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span><span class="w"> </span><span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">LetNode</span><span class="o">*</span><span class="w"> </span><span class="n">op</span><span class="p">)</span><span class="w"> </span><span class="n">final</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">Expr</span><span class="w"> </span><span class="n">value</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">this</span><span class="o">-&gt;</span><span class="n">Mutate</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">value</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">value</span><span class="p">.</span><span class="n">as</span><span class="o">&lt;</span><span class="n">ConstantNode</span><span class="o">&gt;</span><span class="p">())</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="n">memo_</span><span class="p">[</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">var</span><span class="p">]</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">value</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="n">this</span><span class="o">-&gt;</span><span class="n">Mutate</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">body</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="n">Var</span><span class="w"> </span><span class="n">var</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Downcast</span><span class="o">&lt;</span><span class="n">Var</span><span class="o">&gt;</span><span class="p">(</span><span class="n">this</span><span class="o">-&gt;</span><span class="n">Mutate</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">var</ [...]
+<span class="w">    </span><span class="n">Expr</span><span class="w"> </span><span class="n">body</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">this</span><span class="o">-&gt;</span><span class="n">Mutate</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">body</span><span class="p">);</span><span class="w"></span>
+<span class="w">    </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">var</span><span class="p">.</span><span class="n">same_as</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">var</span><span class="p">)</span><span class="w"> </span><span class="o">&amp;&amp;</span><span class="w"></span>
+<span class="w">        </span><span class="n">value</span><span class="p">.</span><span class="n">same_as</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">value</span><span class="p">)</span><span class="w"> </span><span class="o">&amp;&amp;</span><span class="w"></span>
+<span class="w">        </span><span class="n">body</span><span class="p">.</span><span class="n">same_as</span><span class="p">(</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">body</span><span class="p">))</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">      </span><span class="k">return</span><span class="w"> </span><span class="n">GetRef</span><span class="o">&lt;</span><span class="n">Expr</span><span class="o">&gt;</span><span class="p">(</span><span class="n">op</span><span class="p">);</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">      </span><span class="k">return</span><span class="w"> </span><span class="n">Let</span><span class="p">(</span><span class="n">var</span><span class="p">,</span><span class="w"> </span><span class="n">value</span><span class="p">,</span><span class="w"> </span><span class="n">body</span><span class="p">);</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>In the <code class="docutils literal notranslate"><span class="pre">LetNode</span></code> case, we first attempt to const-fold the value being bound
@@ -539,45 +539,45 @@ in the expression. If we can, then we populate <code class="docutils literal not
 result of visiting the body, essentially propagating the bound value to its
 use sites in the body. Because <code class="docutils literal notranslate"><span class="pre">ExprMutator</span></code> memoizes <code class="docutils literal notranslate"><span class="pre">Mutate</span></code> results in <code class="docutils literal notranslate"><span class="pre">memo_</span></code>, seeding the entry for <code class="docutils literal notranslate"><span class="pre">op-&gt;var</span></code> makes every later occurrence of the variable fold to the constant. If we can’t const-fold the bound value, we mimic the
 default implementation.</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span> <span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span> <span class="n">TupleGetItemNode</span><span class="o">*</span> <span class="n">op</span><span class="p">)</span> <span class="n">final</span> <span class="p">{</span>
-  <span class="n">Expr</span> <span class="n">res</span> <span class="o">=</span> <span class="n">ExprMutator</span><span class="o">::</span><span class="n">VisitExpr_</span><span class="p">(</span><span class="n">op</span><span class="p">);</span>
-  <span class="n">op</span> <span class="o">=</span> <span class="n">res</span><span class="p">.</span><span class="n">as</span><span class="o">&lt;</span><span class="n">TupleGetItemNode</span><span class="o">&gt;</span><span class="p">();</span>
-  <span class="k">if</span> <span class="p">(</span><span class="k">const</span> <span class="k">auto</span><span class="o">*</span> <span class="n">tuple</span> <span class="o">=</span> <span class="n">op</span><span class="o">-&gt;</span><span class="n">tuple</span><span class="p">.</span><span class="n">as</span><span class="o">&lt;</span><span class="n">TupleNode</span><span class="o">&gt;</span><span class="p">())</span> <span class="p">{</span>
-    <span class="k">return</span> <span class="n">tuple</span><span class="o">-&gt;</span><span class="n">fields</span><span class="p">[</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">index</span><span class="p">];</span>
-  <span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
-    <span class="k">return</span> <span class="n">res</span><span class="p">;</span>
-  <span class="p">}</span>
-<span class="p">}</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span><span class="w"> </span><span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">TupleGetItemNode</span><span class="o">*</span><span class="w"> </span><span class="n">op</span><span class="p">)</span><span class="w"> </span><span class="n">final</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">Expr</span><span class="w"> </span><span class="n">res</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ExprMutator</span><span class="o">::</span><span class="n">VisitExpr_</span><span class="p">(</span><span class="n">op</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="n">op</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">res</span><span class="p">.</span><span class="n">as</span><span class="o">&lt;</span><span class="n">TupleGetItemNode</span><span class="o">&gt;</span><span class="p">();</span><span class="w"></span>
+<span class="w">  </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="k">auto</span><span class="o">*</span><span class="w"> </span><span class="n">tuple</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">op</span><span class="o">-&gt;</span><span class="n">tuple</span><span class="p">.</span><span class="n">as</span><span class="o">&lt;</span><span clas [...]
+<span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="n">tuple</span><span class="o">-&gt;</span><span class="n">fields</span><span class="p">[</span><span class="n">op</span><span class="o">-&gt;</span><span class="n">index</span><span class="p">];</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="n">res</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>In the <code class="docutils literal notranslate"><span class="pre">TupleGetItemNode</span></code> case, we check whether the <code class="docutils literal notranslate"><span class="pre">op-&gt;tuple</span></code> field is a
 <code class="docutils literal notranslate"><span class="pre">TupleNode</span></code>. If so, we replace the tuple get with the field of the tuple
 pointed to by <code class="docutils literal notranslate"><span class="pre">op-&gt;index</span></code>. The check is needed because
 <code class="docutils literal notranslate"><span class="pre">op-&gt;tuple</span></code> might merely evaluate to a tuple (for example, a call whose result is a tuple) without itself being a <code class="docutils literal notranslate"><span class="pre">TupleNode</span></code>.</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span> <span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span> <span class="n">CallNode</span><span class="o">*</span> <span class="n">call</span><span class="p">)</span> <span class="n">final</span> <span class="p">{</span>
-  <span class="k">static</span> <span class="k">auto</span> <span class="n">op_stateful</span> <span class="o">=</span> <span class="n">Op</span><span class="o">::</span><span class="n">GetAttrMap</span><span class="o">&lt;</span><span class="n">TOpIsStateful</span><span class="o">&gt;</span><span class="p">(</span><span class="s">&quot;TOpIsStateful&quot;</span><span class="p">);</span>
-  <span class="n">Expr</span> <span class="n">res</span> <span class="o">=</span> <span class="n">ExprMutator</span><span class="o">::</span><span class="n">VisitExpr_</span><span class="p">(</span><span class="n">call</span><span class="p">);</span>
-  <span class="n">call</span> <span class="o">=</span> <span class="n">res</span><span class="p">.</span><span class="n">as</span><span class="o">&lt;</span><span class="n">CallNode</span><span class="o">&gt;</span><span class="p">();</span>
-  <span class="c1">// We don&#39;t constant fold function with zero arguments.</span>
-  <span class="c1">// This is a heuristic that is useful.</span>
-  <span class="c1">// For example it is harmful to fold ones(shape=(4, 5)).</span>
-  <span class="k">if</span> <span class="p">(</span><span class="n">call</span><span class="o">-&gt;</span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">()</span> <span class="o">==</span> <span class="mi">0</span><span class="p">)</span> <span class="k">return</span> <span class="n">res</span><span class="p">;</span>
-  <span class="k">const</span> <span class="n">OpNode</span><span class="o">*</span> <span class="n">op</span> <span class="o">=</span> <span class="n">call</span><span class="o">-&gt;</span><span class="n">op</span><span class="p">.</span><span class="n">as</span><span class="o">&lt;</span><span class="n">OpNode</span><span class="o">&gt;</span><span class="p">();</span>
-  <span class="k">if</span> <span class="p">(</span><span class="n">op</span> <span class="o">==</span> <span class="n">nullptr</span><span class="p">)</span> <span class="k">return</span> <span class="n">res</span><span class="p">;</span>
-  <span class="c1">// skip stateful ops.</span>
-  <span class="k">if</span> <span class="p">(</span><span class="n">op_stateful</span><span class="p">.</span><span class="n">get</span><span class="p">(</span><span class="n">GetRef</span><span class="o">&lt;</span><span class="n">Op</span><span class="o">&gt;</span><span class="p">(</span><span class="n">op</span><span class="p">),</span> <span class="nb">false</span><span class="p">))</span> <span class="k">return</span> <span class="n">res</span><span class="p">;</span>
-  <span class="kt">bool</span> <span class="n">all_const_args</span> <span class="o">=</span> <span class="nb">true</span><span class="p">;</span>
-  <span class="k">for</span> <span class="p">(</span><span class="n">Expr</span> <span class="nl">arg</span> <span class="p">:</span> <span class="n">call</span><span class="o">-&gt;</span><span class="n">args</span><span class="p">)</span> <span class="p">{</span>
-    <span class="k">if</span> <span class="p">(</span><span class="o">!</span><span class="n">checker_</span><span class="p">.</span><span class="n">Check</span><span class="p">(</span><span class="n">arg</span><span class="p">))</span> <span class="p">{</span>
-      <span class="n">all_const_args</span> <span class="o">=</span> <span class="nb">false</span><span class="p">;</span>
-    <span class="p">}</span>
-  <span class="p">}</span>
-  <span class="k">if</span> <span class="p">(</span><span class="n">all_const_args</span><span class="p">)</span> <span class="p">{</span>
-    <span class="k">return</span> <span class="n">ConstEvaluate</span><span class="p">(</span><span class="n">res</span><span class="p">);</span>
-  <span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
-    <span class="k">return</span> <span class="n">res</span><span class="p">;</span>
-  <span class="p">}</span>
-<span class="p">}</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">Expr</span><span class="w"> </span><span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">CallNode</span><span class="o">*</span><span class="w"> </span><span class="n">call</span><span class="p">)</span><span class="w"> </span><span class="n">final</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="k">static</span><span class="w"> </span><span class="k">auto</span><span class="w"> </span><span class="n">op_stateful</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Op</span><span class="o">::</span><span class="n">GetAttrMap</span><span class="o">&lt;</span><span class="n">TOpIsStateful</span><span class="o">&gt;</span><span class="p">(</span><span class="s">&quot;TOpIsStateful&quot;</span><span class= [...]
+<span class="w">  </span><span class="n">Expr</span><span class="w"> </span><span class="n">res</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ExprMutator</span><span class="o">::</span><span class="n">VisitExpr_</span><span class="p">(</span><span class="n">call</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="n">call</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">res</span><span class="p">.</span><span class="n">as</span><span class="o">&lt;</span><span class="n">CallNode</span><span class="o">&gt;</span><span class="p">();</span><span class="w"></span>
+<span class="w">  </span><span class="c1">// We don&#39;t constant fold function with zero arguments.</span>
+<span class="w">  </span><span class="c1">// This is a heuristic that is useful.</span>
+<span class="w">  </span><span class="c1">// For example it is harmful to fold ones(shape=(4, 5)).</span>
+<span class="w">  </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">call</span><span class="o">-&gt;</span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">()</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="mi">0</span><span class="p">)</span><span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="n">res</span><span class [...]
+<span class="w">  </span><span class="k">const</span><span class="w"> </span><span class="n">OpNode</span><span class="o">*</span><span class="w"> </span><span class="n">op</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">call</span><span class="o">-&gt;</span><span class="n">op</span><span class="p">.</span><span class="n">as</span><span class="o">&lt;</span><span class="n">OpNode</span><span class="o">&gt;</span><span class="p">();</span><sp [...]
+<span class="w">  </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">op</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="n">nullptr</span><span class="p">)</span><span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="n">res</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="c1">// skip stateful ops.</span>
+<span class="w">  </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">op_stateful</span><span class="p">.</span><span class="n">get</span><span class="p">(</span><span class="n">GetRef</span><span class="o">&lt;</span><span class="n">Op</span><span class="o">&gt;</span><span class="p">(</span><span class="n">op</span><span class="p">),</span><span class="w"> </span><span class="nb">false</span><span class="p">))</span><span class="w"> </span><s [...]
+<span class="w">  </span><span class="kt">bool</span><span class="w"> </span><span class="n">all_const_args</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nb">true</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="n">Expr</span><span class="w"> </span><span class="n">arg</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="n">call</span><span class="o">-&gt;</span><span class="n">args</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="o">!</span><span class="n">checker_</span><span class="p">.</span><span class="n">Check</span><span class="p">(</span><span class="n">arg</span><span class="p">))</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">      </span><span class="n">all_const_args</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nb">false</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+<span class="w">  </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">all_const_args</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="n">ConstEvaluate</span><span class="p">(</span><span class="n">res</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="n">res</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>In the <code class="docutils literal notranslate"><span class="pre">CallNode</span></code> case, we first use the <code class="docutils literal notranslate"><span class="pre">VisitExpr_</span></code> of <code class="docutils literal notranslate"><span class="pre">ExprMutator</span></code>
@@ -599,17 +599,17 @@ class that takes an expression and internally creates and uses a
 <p><em>Note: please see the pass infrastructure documentation for more specific detail on this subject.</em></p>
 <p>With the AST traversers written, the pass can be registered to become a TVM
 API endpoint with the following code:</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">namespace</span> <span class="n">transform</span> <span class="p">{</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">namespace</span><span class="w"> </span><span class="n">transform</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
 
-<span class="n">Pass</span> <span class="n">FoldConstant</span><span class="p">()</span> <span class="p">{</span>
-  <span class="n">runtime</span><span class="o">::</span><span class="n">TypedPackedFunc</span><span class="o">&lt;</span><span class="n">Function</span><span class="p">(</span><span class="n">Function</span><span class="p">,</span> <span class="n">Module</span><span class="p">,</span> <span class="n">PassContext</span><span class="p">)</span><span class="o">&gt;</span> <span class="n">pass_func</span> <span class="o">=</span>
-    <span class="p">[</span><span class="o">=</span><span class="p">](</span><span class="n">Function</span> <span class="n">f</span><span class="p">,</span> <span class="n">Module</span> <span class="n">m</span><span class="p">,</span> <span class="n">PassContext</span> <span class="n">pc</span><span class="p">)</span> <span class="p">{</span>
-      <span class="k">return</span> <span class="n">Downcast</span><span class="o">&lt;</span><span class="n">Function</span><span class="o">&gt;</span><span class="p">(</span><span class="n">FoldConstant</span><span class="p">(</span><span class="n">f</span><span class="p">));</span>
-  <span class="p">};</span>
-  <span class="k">return</span> <span class="nf">CreateFunctionPass</span><span class="p">(</span><span class="n">pass_func</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="s">&quot;FoldConstant&quot;</span><span class="p">,</span> <span class="p">{});</span>
-<span class="p">}</span>
+<span class="n">Pass</span><span class="w"> </span><span class="nf">FoldConstant</span><span class="p">()</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">runtime</span><span class="o">::</span><span class="n">TypedPackedFunc</span><span class="o">&lt;</span><span class="n">Function</span><span class="p">(</span><span class="n">Function</span><span class="p">,</span><span class="w"> </span><span class="n">Module</span><span class="p">,</span><span class="w"> </span><span class="n">PassContext</span><span class="p">)</span><span class="o">&gt;</span><span class="w"> </span><span class="n">pass_func</ [...]
+<span class="w">    </span><span class="p">[</span><span class="o">=</span><span class="p">](</span><span class="n">Function</span><span class="w"> </span><span class="n">f</span><span class="p">,</span><span class="w"> </span><span class="n">Module</span><span class="w"> </span><span class="n">m</span><span class="p">,</span><span class="w"> </span><span class="n">PassContext</span><span class="w"> </span><span class="n">pc</span><span class="p">)</span><span class="w"> </span><span cla [...]
+<span class="w">      </span><span class="k">return</span><span class="w"> </span><span class="n">Downcast</span><span class="o">&lt;</span><span class="n">Function</span><span class="o">&gt;</span><span class="p">(</span><span class="n">FoldConstant</span><span class="p">(</span><span class="n">f</span><span class="p">));</span><span class="w"></span>
+<span class="w">  </span><span class="p">};</span><span class="w"></span>
+<span class="w">  </span><span class="k">return</span><span class="w"> </span><span class="n">CreateFunctionPass</span><span class="p">(</span><span class="n">pass_func</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;FoldConstant&quot;</span><span class="p">,</span><span class="w"> </span><span class="p">{});</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 
-<span class="p">}</span>  <span class="c1">// namespace transform</span>
+<span class="p">}</span><span class="w">  </span><span class="c1">// namespace transform</span>
 </pre></div>
 </div>
 <p>If the <code class="docutils literal notranslate"><span class="pre">Pass</span></code> object produced by the above code is given to the pass infrastructure,
@@ -629,8 +629,8 @@ error reporting and configuration options; <code class="docutils literal notrans
 this information but other passes may reference their <code class="docutils literal notranslate"><span class="pre">PassContext</span></code> objects.</p>
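 <p>Given the <code class="docutils literal notranslate"><span class="pre">Pass</span></code> object, a hedged sketch of what running it over a module from
 C++ could look like (usage assumed from TVM&#39;s public headers; the wrapper name is
 illustrative):</p>
 <div class="highlight-c notranslate"><div class="highlight"><pre>#include &lt;tvm/ir/module.h&gt;
 #include &lt;tvm/relay/transform.h&gt;
 
 tvm::IRModule RunFoldConstant(tvm::IRModule mod) {
   // Build the Pass object and apply it to every Relay function in the module.
   tvm::transform::Pass pass = tvm::relay::transform::FoldConstant();
   return pass(mod);
 }
 </pre></div>
 </div>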
 <p>The pass can now be invoked via the pass infrastructure, though it’s a good idea to
 also add a Python binding for the pass, as in this code snippet:</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;relay._transform.FoldConstant&quot;</span><span class="p">)</span>
-<span class="p">.</span><span class="n">set_body_typed</span><span class="p">(</span><span class="n">FoldConstant</span><span class="p">);</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;relay._transform.FoldConstant&quot;</span><span class="p">)</span><span class="w"></span>
+<span class="p">.</span><span class="n">set_body_typed</span><span class="p">(</span><span class="n">FoldConstant</span><span class="p">);</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Once <code class="docutils literal notranslate"><span class="pre">Pass</span></code> objects are defined in the above fashion, they can be invoked using the
diff --git a/docs/dev/how_to/relay_bring_your_own_codegen.html b/docs/dev/how_to/relay_bring_your_own_codegen.html
index 175bce4e8..23abc24d4 100644
--- a/docs/dev/how_to/relay_bring_your_own_codegen.html
+++ b/docs/dev/how_to/relay_bring_your_own_codegen.html
@@ -365,12 +365,12 @@
 </pre></div>
 </div>
 <p>Our goal is to generate the following compilable code to execute the subgraph:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="cp">#include</span> <span class="cpf">&lt;tvm/runtime/c_runtime_api.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;tvm/runtime/packed_func.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;dlpack/dlpack.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;cstdint&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;cstring&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;iostream&gt;</span><span class="cp"></span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/runtime/c_runtime_api.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/runtime/packed_func.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;dlpack/dlpack.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;cstdint&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;cstring&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;iostream&gt;</span><span class="cp"></span>
 
 <span class="cp">#define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_)           \</span>
 <span class="cp">  extern &quot;C&quot; void p_ID_(float* a, float* b, float* out) { \</span>
@@ -390,31 +390,31 @@
 <span class="cp">  }</span>
 
 <span class="c1">// Note 1</span>
-<span class="n">GCC_BINARY_OP_2D</span><span class="p">(</span><span class="n">gcc_0_0</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">10</span><span class="p">);</span>
-<span class="n">GCC_BINARY_OP_2D</span><span class="p">(</span><span class="n">gcc_0_1</span><span class="p">,</span> <span class="o">-</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">10</span><span class="p">);</span>
-<span class="n">GCC_BINARY_OP_2D</span><span class="p">(</span><span class="n">gcc_0_2</span><span class="p">,</span> <span class="o">+</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">10</span><span class="p">);</span>
+<span class="n">GCC_BINARY_OP_2D</span><span class="p">(</span><span class="n">gcc_0_0</span><span class="p">,</span><span class="w"> </span><span class="o">*</span><span class="p">,</span><span class="w"> </span><span class="mi">10</span><span class="p">,</span><span class="w"> </span><span class="mi">10</span><span class="p">);</span><span class="w"></span>
+<span class="n">GCC_BINARY_OP_2D</span><span class="p">(</span><span class="n">gcc_0_1</span><span class="p">,</span><span class="w"> </span><span class="o">-</span><span class="p">,</span><span class="w"> </span><span class="mi">10</span><span class="p">,</span><span class="w"> </span><span class="mi">10</span><span class="p">);</span><span class="w"></span>
+<span class="n">GCC_BINARY_OP_2D</span><span class="p">(</span><span class="n">gcc_0_2</span><span class="p">,</span><span class="w"> </span><span class="o">+</span><span class="p">,</span><span class="w"> </span><span class="mi">10</span><span class="p">,</span><span class="w"> </span><span class="mi">10</span><span class="p">);</span><span class="w"></span>
 
 <span class="c1">// Note 2</span>
-<span class="k">extern</span> <span class="s">&quot;C&quot;</span> <span class="kt">void</span> <span class="n">gcc_0_</span><span class="p">(</span><span class="kt">float</span><span class="o">*</span> <span class="n">gcc_input0</span><span class="p">,</span> <span class="kt">float</span><span class="o">*</span> <span class="n">gcc_input1</span><span class="p">,</span>
-                       <span class="kt">float</span><span class="o">*</span> <span class="n">gcc_input2</span><span class="p">,</span> <span class="kt">float</span><span class="o">*</span> <span class="n">gcc_input3</span><span class="p">,</span> <span class="kt">float</span><span class="o">*</span> <span class="n">out</span><span class="p">)</span> <span class="p">{</span>
-  <span class="kt">float</span><span class="o">*</span> <span class="n">buf_0</span> <span class="o">=</span> <span class="p">(</span><span class="kt">float</span><span class="o">*</span><span class="p">)</span><span class="n">malloc</span><span class="p">(</span><span class="mi">4</span> <span class="o">*</span> <span class="mi">100</span><span class="p">);</span>
-  <span class="kt">float</span><span class="o">*</span> <span class="n">buf_1</span> <span class="o">=</span> <span class="p">(</span><span class="kt">float</span><span class="o">*</span><span class="p">)</span><span class="n">malloc</span><span class="p">(</span><span class="mi">4</span> <span class="o">*</span> <span class="mi">100</span><span class="p">);</span>
-  <span class="n">gcc_0_2</span><span class="p">(</span><span class="n">gcc_input0</span><span class="p">,</span> <span class="n">gcc_input1</span><span class="p">,</span> <span class="n">buf_0</span><span class="p">);</span>
-  <span class="n">gcc_0_1</span><span class="p">(</span><span class="n">buf_0</span><span class="p">,</span> <span class="n">gcc_input2</span><span class="p">,</span> <span class="n">buf_1</span><span class="p">);</span>
-  <span class="n">gcc_0_0</span><span class="p">(</span><span class="n">buf_1</span><span class="p">,</span> <span class="n">gcc_input3</span><span class="p">,</span> <span class="n">out</span><span class="p">);</span>
-  <span class="n">free</span><span class="p">(</span><span class="n">buf_0</span><span class="p">);</span>
-  <span class="n">free</span><span class="p">(</span><span class="n">buf_1</span><span class="p">);</span>
-<span class="p">}</span>
+<span class="k">extern</span><span class="w"> </span><span class="s">&quot;C&quot;</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">gcc_0_</span><span class="p">(</span><span class="kt">float</span><span class="o">*</span><span class="w"> </span><span class="n">gcc_input0</span><span class="p">,</span><span class="w"> </span><span class="kt">float</span><span class="o">*</span><span class="w"> </span><span class="n">gcc_input1</span><span  [...]
+<span class="w">                       </span><span class="kt">float</span><span class="o">*</span><span class="w"> </span><span class="n">gcc_input2</span><span class="p">,</span><span class="w"> </span><span class="kt">float</span><span class="o">*</span><span class="w"> </span><span class="n">gcc_input3</span><span class="p">,</span><span class="w"> </span><span class="kt">float</span><span class="o">*</span><span class="w"> </span><span class="n">out</span><span class="p">)</span><sp [...]
+<span class="w">  </span><span class="kt">float</span><span class="o">*</span><span class="w"> </span><span class="n">buf_0</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">(</span><span class="kt">float</span><span class="o">*</span><span class="p">)</span><span class="n">malloc</span><span class="p">(</span><span class="mi">4</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">100</span><span class= [...]
+<span class="w">  </span><span class="kt">float</span><span class="o">*</span><span class="w"> </span><span class="n">buf_1</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">(</span><span class="kt">float</span><span class="o">*</span><span class="p">)</span><span class="n">malloc</span><span class="p">(</span><span class="mi">4</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">100</span><span class= [...]
+<span class="w">  </span><span class="n">gcc_0_2</span><span class="p">(</span><span class="n">gcc_input0</span><span class="p">,</span><span class="w"> </span><span class="n">gcc_input1</span><span class="p">,</span><span class="w"> </span><span class="n">buf_0</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="n">gcc_0_1</span><span class="p">(</span><span class="n">buf_0</span><span class="p">,</span><span class="w"> </span><span class="n">gcc_input2</span><span class="p">,</span><span class="w"> </span><span class="n">buf_1</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="n">gcc_0_0</span><span class="p">(</span><span class="n">buf_1</span><span class="p">,</span><span class="w"> </span><span class="n">gcc_input3</span><span class="p">,</span><span class="w"> </span><span class="n">out</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="n">free</span><span class="p">(</span><span class="n">buf_0</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="n">free</span><span class="p">(</span><span class="n">buf_1</span><span class="p">);</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 
 <span class="c1">// Note 3</span>
-<span class="k">extern</span> <span class="s">&quot;C&quot;</span> <span class="kt">int</span> <span class="n">gcc_0_wrapper</span><span class="p">(</span><span class="n">DLTensor</span><span class="o">*</span> <span class="n">arg0</span><span class="p">,</span> <span class="n">DLTensor</span><span class="o">*</span> <span class="n">arg1</span><span class="p">,</span> <span class="n">DLTensor</span><span class="o">*</span> <span class="n">arg2</span><span class="p">,</span>
-                             <span class="n">DLTensor</span><span class="o">*</span> <span class="n">arg3</span><span class="p">,</span> <span class="n">DLTensor</span><span class="o">*</span> <span class="n">out</span><span class="p">)</span> <span class="p">{</span>
-  <span class="n">gcc_0_</span><span class="p">(</span><span class="k">static_cast</span><span class="o">&lt;</span><span class="kt">float</span><span class="o">*&gt;</span><span class="p">(</span><span class="n">arg0</span><span class="o">-&gt;</span><span class="n">data</span><span class="p">),</span> <span class="k">static_cast</span><span class="o">&lt;</span><span class="kt">float</span><span class="o">*&gt;</span><span class="p">(</span><span class="n">arg1</span><span class="o">-& [...]
-         <span class="k">static_cast</span><span class="o">&lt;</span><span class="kt">float</span><span class="o">*&gt;</span><span class="p">(</span><span class="n">arg2</span><span class="o">-&gt;</span><span class="n">data</span><span class="p">),</span> <span class="k">static_cast</span><span class="o">&lt;</span><span class="kt">float</span><span class="o">*&gt;</span><span class="p">(</span><span class="n">arg3</span><span class="o">-&gt;</span><span class="n">data</span><span cla [...]
-         <span class="k">static_cast</span><span class="o">&lt;</span><span class="kt">float</span><span class="o">*&gt;</span><span class="p">(</span><span class="n">out</span><span class="o">-&gt;</span><span class="n">data</span><span class="p">));</span>
-  <span class="k">return</span> <span class="mi">0</span><span class="p">;</span>
-<span class="p">}</span>
-<span class="n">TVM_DLL_EXPORT_TYPED_FUNC</span><span class="p">(</span><span class="n">gcc_0</span><span class="p">,</span> <span class="n">gcc_0_wrapper</span><span class="p">);</span>
+<span class="k">extern</span><span class="w"> </span><span class="s">&quot;C&quot;</span><span class="w"> </span><span class="kt">int</span><span class="w"> </span><span class="n">gcc_0_wrapper</span><span class="p">(</span><span class="n">DLTensor</span><span class="o">*</span><span class="w"> </span><span class="n">arg0</span><span class="p">,</span><span class="w"> </span><span class="n">DLTensor</span><span class="o">*</span><span class="w"> </span><span class="n">arg1</span><span cl [...]
+<span class="w">                             </span><span class="n">DLTensor</span><span class="o">*</span><span class="w"> </span><span class="n">arg3</span><span class="p">,</span><span class="w"> </span><span class="n">DLTensor</span><span class="o">*</span><span class="w"> </span><span class="n">out</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">gcc_0_</span><span class="p">(</span><span class="k">static_cast</span><span class="o">&lt;</span><span class="kt">float</span><span class="o">*&gt;</span><span class="p">(</span><span class="n">arg0</span><span class="o">-&gt;</span><span class="n">data</span><span class="p">),</span><span class="w"> </span><span class="k">static_cast</span><span class="o">&lt;</span><span class="kt">float</span><span class="o">*&gt;</span><span class="p">(</span [...]
+<span class="w">         </span><span class="k">static_cast</span><span class="o">&lt;</span><span class="kt">float</span><span class="o">*&gt;</span><span class="p">(</span><span class="n">arg2</span><span class="o">-&gt;</span><span class="n">data</span><span class="p">),</span><span class="w"> </span><span class="k">static_cast</span><span class="o">&lt;</span><span class="kt">float</span><span class="o">*&gt;</span><span class="p">(</span><span class="n">arg3</span><span class="o">-& [...]
+<span class="w">         </span><span class="k">static_cast</span><span class="o">&lt;</span><span class="kt">float</span><span class="o">*&gt;</span><span class="p">(</span><span class="n">out</span><span class="o">-&gt;</span><span class="n">data</span><span class="p">));</span><span class="w"></span>
+<span class="w">  </span><span class="k">return</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
+<span class="n">TVM_DLL_EXPORT_TYPED_FUNC</span><span class="p">(</span><span class="n">gcc_0</span><span class="p">,</span><span class="w"> </span><span class="n">gcc_0_wrapper</span><span class="p">);</span><span class="w"></span>
 </pre></div>
 </div>
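 <p>Before walking through the notes, here is a hedged, purely illustrative driver
 for the generated entry point (it assumes the code above is linked in and reuses
 the 10x10 buffer shapes):</p>
 <div class="highlight-c++ notranslate"><div class="highlight"><pre>#include &lt;cstdio&gt;
 
 // Assumed to be provided by the generated code above.
 extern &quot;C&quot; void gcc_0_(float* a, float* b, float* c, float* d, float* out);
 
 int main() {
   float a[100], b[100], c[100], d[100], out[100];
   for (int i = 0; i &lt; 100; ++i) { a[i] = b[i] = c[i] = d[i] = 1.0f; }
   gcc_0_(a, b, c, d, out);      // computes ((a + b) - c) * d elementwise
   std::printf(&quot;%f\n&quot;, out[0]);  // expect 1.000000
   return 0;
 }
 </pre></div>
 </div>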
 <p>Here we highlight the notes marked in the above code:</p>
@@ -437,47 +437,47 @@
 <div class="section" id="implement-codegenc">
 <h3>Implement CodegenC<a class="headerlink" href="#implement-codegenc" title="Permalink to this headline">¶</a></h3>
 <p>In <code class="docutils literal notranslate"><span class="pre">src/relay/backend/contrib/codegen_c/codegen.cc</span></code>, we first create a codegen class skeleton under the <code class="docutils literal notranslate"><span class="pre">tvm.relay.contrib</span></code> namespace:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="cp">#include</span> <span class="cpf">&lt;tvm/relay/expr_functor.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;tvm/relay/transform.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;tvm/relay/type.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;tvm/runtime/module.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;tvm/runtime/object.h&gt;</span><span class="cp"></span>
-
-<span class="cp">#include</span> <span class="cpf">&lt;fstream&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;sstream&gt;</span><span class="cp"></span>
-
-<span class="cp">#include</span> <span class="cpf">&quot;codegen_c.h&quot;</span><span class="cp"></span>
-
-<span class="k">namespace</span> <span class="n">tvm</span> <span class="p">{</span>
-<span class="k">namespace</span> <span class="n">relay</span> <span class="p">{</span>
-<span class="k">namespace</span> <span class="n">contrib</span> <span class="p">{</span>
-
-<span class="k">class</span> <span class="nc">CodegenC</span> <span class="o">:</span> <span class="k">public</span> <span class="n">ExprVisitor</span><span class="p">,</span> <span class="k">public</span> <span class="n">CodegenCBase</span> <span class="p">{</span>
-  <span class="k">public</span><span class="o">:</span>
-    <span class="k">explicit</span> <span class="n">CodegenC</span><span class="p">(</span><span class="k">const</span> <span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span> <span class="n">id</span><span class="p">)</span> <span class="p">{</span> <span class="k">this</span><span class="o">-&gt;</span><span class="n">ext_func_id_</span> <span class="o">=</span> <span class="n">id</span><span class="p">;</span> <span class="p">}</span>
-
-    <span class="kt">void</span> <span class="n">VisitExpr_</span><span class="p">(</span><span class="k">const</span> <span class="n">VarNode</span><span class="o">*</span> <span class="n">node</span><span class="p">)</span> <span class="p">{</span> <span class="p">;</span> <span class="p">}</span>
-    <span class="kt">void</span> <span class="n">VisitExpr_</span><span class="p">(</span><span class="k">const</span> <span class="n">CallNode</span><span class="o">*</span> <span class="n">call</span><span class="p">)</span> <span class="k">final</span> <span class="p">{</span> <span class="p">;</span> <span class="p">}</span>
-    <span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">JIT</span><span class="p">()</span> <span class="p">{</span> <span class="p">;</span> <span class="p">}</span>
-
-  <span class="k">private</span><span class="o">:</span>
-    <span class="cm">/*! \brief The function id that represents a C source function. */</span>
-    <span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">ext_func_id_</span> <span class="o">=</span> <span class="s">&quot;&quot;</span><span class="p">;</span>
-    <span class="cm">/*! \brief The index of a wrapped C function. */</span>
-    <span class="kt">int</span> <span class="n">func_idx</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-    <span class="cm">/*! \brief The index of allocated buffers. */</span>
-    <span class="kt">int</span> <span class="n">buf_idx_</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-    <span class="cm">/*! \brief The arguments of a C compiler compatible function. */</span>
-    <span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&gt;</span> <span class="n">ext_func_args_</span><span class="p">;</span>
-    <span class="cm">/*! \brief The statements of a C compiler compatible function. */</span>
-    <span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&gt;</span> <span class="n">ext_func_body</span><span class="p">;</span>
-    <span class="cm">/*! \brief The declaration statements of a C compiler compatible function. */</span>
-    <span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&gt;</span> <span class="n">func_decl_</span><span class="p">;</span>
-    <span class="cm">/*! \brief The declaration statements of buffers. */</span>
-    <span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&gt;</span> <span class="n">buf_decl_</span><span class="p">;</span>
-    <span class="cm">/*! \brief The name and index pairs for output. */</span>
-    <span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">pair</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="p">,</span> <span class="kt">int</span><span class="o">&gt;&gt;</span> <span class="n">out_</span><span class="p">;</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/relay/expr_functor.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/relay/transform.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/relay/type.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/runtime/module.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/runtime/object.h&gt;</span><span class="cp"></span>
+
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;fstream&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;sstream&gt;</span><span class="cp"></span>
+
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&quot;codegen_c.h&quot;</span><span class="cp"></span>
+
+<span class="k">namespace</span><span class="w"> </span><span class="nn">tvm</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="k">namespace</span><span class="w"> </span><span class="nn">relay</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="k">namespace</span><span class="w"> </span><span class="nn">contrib</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+
+<span class="k">class</span><span class="w"> </span><span class="nc">CodegenC</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">ExprVisitor</span><span class="p">,</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">CodegenCBase</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="k">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">    </span><span class="k">explicit</span><span class="w"> </span><span class="n">CodegenC</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span><span class="w"> </span><span class="n">id</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="k">this</span><span class="o">-&gt;</ [...]
+
+<span class="w">    </span><span class="kt">void</span><span class="w"> </span><span class="n">VisitExpr_</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">VarNode</span><span class="o">*</span><span class="w"> </span><span class="n">node</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="p">;</span><span class="w"> </span><span class="p">}</span><span class="w"></span>
+<span class="w">    </span><span class="kt">void</span><span class="w"> </span><span class="n">VisitExpr_</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">CallNode</span><span class="o">*</span><span class="w"> </span><span class="n">call</span><span class="p">)</span><span class="w"> </span><span class="k">final</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="p">;</span><span class="w"> </span><s [...]
+<span class="w">    </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">JIT</span><span class="p">()</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="p">;</span><span class="w"> </span><span class="p">}</span><span class="w"></span>
+
+<span class="w">  </span><span class="k">private</span><span class="o">:</span><span class="w"></span>
+<span class="w">    </span><span class="cm">/*! \brief The function id that represents a C source function. */</span><span class="w"></span>
+<span class="w">    </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">ext_func_id_</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="cm">/*! \brief The index of a wrapped C function. */</span><span class="w"></span>
+<span class="w">    </span><span class="kt">int</span><span class="w"> </span><span class="n">func_idx</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="cm">/*! \brief The index of allocated buffers. */</span><span class="w"></span>
+<span class="w">    </span><span class="kt">int</span><span class="w"> </span><span class="n">buf_idx_</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="cm">/*! \brief The arguments of a C compiler compatible function. */</span><span class="w"></span>
+<span class="w">    </span><span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&gt;</span><span class="w"> </span><span class="n">ext_func_args_</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="cm">/*! \brief The statements of a C compiler compatible function. */</span><span class="w"></span>
+<span class="w">    </span><span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&gt;</span><span class="w"> </span><span class="n">ext_func_body</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="cm">/*! \brief The declaration statements of a C compiler compatible function. */</span><span class="w"></span>
+<span class="w">    </span><span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&gt;</span><span class="w"> </span><span class="n">func_decl_</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="cm">/*! \brief The declaration statements of buffers. */</span><span class="w"></span>
+<span class="w">    </span><span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&gt;</span><span class="w"> </span><span class="n">buf_decl_</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="cm">/*! \brief The name and index pairs for output. */</span><span class="w"></span>
+<span class="w">    </span><span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">pair</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="p">,</span><span class="w"> </span><span class="kt">int</span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="n">out_</span><span class="p"> [...]
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>The <code class="docutils literal notranslate"><span class="pre">CodegenC</span></code> class inherits two classes: <code class="docutils literal notranslate"><span class="pre">ExprVisitor</span></code> provides abilities to traverse subgraphs and collects the required information and generate subgraph functions such as <code class="docutils literal notranslate"><span class="pre">gcc_0_</span></code>; <code class="docutils literal notranslate"><span class="pre">CodegenCBase</span></co [...]
@@ -487,52 +487,52 @@
 <p><strong>1. Generate the function declaration</strong></p>
 <p>Example Result: <code class="docutils literal notranslate"><span class="pre">GCC_BINARY_OP_2D(gcc_0_0,</span> <span class="pre">*,</span> <span class="pre">10,</span> <span class="pre">10);</span></code></p>
 <p>To generate the function declaration, as shown above, we need 1) a function name (e.g., <code class="docutils literal notranslate"><span class="pre">gcc_0_0</span></code>), 2) the type of operator (e.g., <code class="docutils literal notranslate"><span class="pre">*</span></code>), and 3) the input tensor shape (e.g., <code class="docutils literal notranslate"><span class="pre">(10,</span> <span class="pre">10)</span></code>). Fortunately, this information can be obtained easily from  [...]
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">std</span><span class="o">::</span><span class="n">ostringstream</span> <span class="n">macro_stream</span><span class="p">;</span>
-<span class="n">std</span><span class="o">::</span><span class="n">ostringstream</span> <span class="n">decl_stream</span><span class="p">;</span>
-<span class="n">std</span><span class="o">::</span><span class="n">ostringstream</span> <span class="n">buf_stream</span><span class="p">;</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">std</span><span class="o">::</span><span class="n">ostringstream</span><span class="w"> </span><span class="n">macro_stream</span><span class="p">;</span><span class="w"></span>
+<span class="n">std</span><span class="o">::</span><span class="n">ostringstream</span><span class="w"> </span><span class="n">decl_stream</span><span class="p">;</span><span class="w"></span>
+<span class="n">std</span><span class="o">::</span><span class="n">ostringstream</span><span class="w"> </span><span class="n">buf_stream</span><span class="p">;</span><span class="w"></span>
 
 <span class="c1">// Generate a unique function name you like.</span>
-<span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">func_name</span> <span class="o">=</span> <span class="n">ext_func_id_</span> <span class="o">+</span> <span class="s">&quot;_&quot;</span> <span class="o">+</span> <span class="n">std</span><span class="o">::</span><span class="n">to_string</span><span class="p">(</span><span class="n">func_idx</span><span class="o">++</span><span class="p">);</span>
+<span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">func_name</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ext_func_id_</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="s">&quot;_&quot;</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">to [...]
 
 <span class="c1">// Make function declaration string.</span>
-<span class="n">macro_stream</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;CSOURCE_BINARY_OP_&quot;</span> <span class="o">&lt;&lt;</span> <span class="n">call</span><span class="o">-&gt;</span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">()</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;D(&quot;</span> <span class="o">&lt;&lt;</span> <span class="n">func_name</span> <span class="o">&lt;&lt;</span> <span class= [...]
+<span class="n">macro_stream</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;CSOURCE_BINARY_OP_&quot;</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">call</span><span class="o">-&gt;</span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">()</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span cla [...]
 
 <span class="c1">// Check the operator type.</span>
-<span class="k">if</span> <span class="p">(</span><span class="n">IsOp</span><span class="p">(</span><span class="n">call</span><span class="p">,</span> <span class="s">&quot;add&quot;</span><span class="p">))</span> <span class="p">{</span>
-  <span class="n">macro_stream</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;+&quot;</span><span class="p">;</span>
-<span class="p">}</span> <span class="k">else</span> <span class="k">if</span> <span class="p">(</span><span class="n">IsOp</span><span class="p">(</span><span class="n">call</span><span class="p">,</span> <span class="s">&quot;subtract&quot;</span><span class="p">))</span> <span class="p">{</span>
-  <span class="n">macro_stream</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;-&quot;</span><span class="p">;</span>
-<span class="p">}</span> <span class="k">else</span> <span class="k">if</span> <span class="p">(</span><span class="n">IsOp</span><span class="p">(</span><span class="n">call</span><span class="p">,</span> <span class="s">&quot;multiply&quot;</span><span class="p">))</span> <span class="p">{</span>
-  <span class="n">macro_stream</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;*&quot;</span><span class="p">;</span>
-<span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
-  <span class="n">LOG</span><span class="p">(</span><span class="n">FATAL</span><span class="p">)</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;Unrecognized op&quot;</span><span class="p">;</span>
-<span class="p">}</span>
+<span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">IsOp</span><span class="p">(</span><span class="n">call</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;add&quot;</span><span class="p">))</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">macro_stream</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;+&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">IsOp</span><span class="p">(</span><span class="n">call</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;subtract&quot;</span><span class="p">))</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">macro_stream</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;-&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">IsOp</span><span class="p">(</span><span class="n">call</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;multiply&quot;</span><span class="p">))</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">macro_stream</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;*&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">LOG</span><span class="p">(</span><span class="n">FATAL</span><span class="p">)</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;Unrecognized op&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 
 <span class="c1">// Extract the input tensor shape.</span>
-<span class="k">auto</span> <span class="n">in_shape</span> <span class="o">=</span> <span class="n">GetShape</span><span class="p">(</span><span class="n">call</span><span class="o">-&gt;</span><span class="n">args</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">-&gt;</span><span class="n">checked_type</span><span class="p">());</span>
-<span class="k">for</span> <span class="p">(</span><span class="kt">size_t</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="n">in_shape</span><span class="p">.</span><span class="n">size</span><span class="p">();</span> <span class="o">++</span><span class="n">i</span><span class="p">)</span> <span class="p">{</span>
-  <span class="n">macro_stream</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;, &quot;</span> <span class="o">&lt;&lt;</span> <span class="n">in_shape</span><span class="p">[</span><span class="n">i</span><span class="p">];</span>
-<span class="p">}</span>
-<span class="n">macro_stream</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;);&quot;</span><span class="p">;</span>
-<span class="n">func_decl_</span><span class="p">.</span><span class="n">push_back</span><span class="p">(</span><span class="n">macro_stream</span><span class="p">.</span><span class="n">str</span><span class="p">());</span>
+<span class="k">auto</span><span class="w"> </span><span class="n">in_shape</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">GetShape</span><span class="p">(</span><span class="n">call</span><span class="o">-&gt;</span><span class="n">args</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">-&gt;</span><span class="n">checked_type</span><span class="p">());</span><span class="w"></span>
+<span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="kt">size_t</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">&lt;</span><span class="w"> </span><span class="n">in_shape</span><span class="p">.</span><span class="n">size</span><span class="p [...]
+<span class="w">  </span><span class="n">macro_stream</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;, &quot;</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">in_shape</span><span class="p">[</span><span class="n">i</span><span class="p">];</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
+<span class="n">macro_stream</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;);&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="n">func_decl_</span><span class="p">.</span><span class="n">push_back</span><span class="p">(</span><span class="n">macro_stream</span><span class="p">.</span><span class="n">str</span><span class="p">());</span><span class="w"></span>
 </pre></div>
 </div>
 <p>As can be seen, we push the generated code to the class member variable <code class="docutils literal notranslate"><span class="pre">func_decl_</span></code>. This means that after we finish traversing the entire subgraph, we will have collected all required function declarations, and the only thing left to do is to have them compiled by GCC. The rest of the implementation of <code class="docutils literal notranslate"><span class="pre">VisitExpr_(const</span> <span class="pre">CallNode*</span> <span class [...]
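 <p>For reference, the emitted declaration only becomes executable code once the macro is expanded. A minimal sketch of what a <code class="docutils literal notranslate"><span class="pre">CSOURCE_BINARY_OP_2D</span></code> macro could expand to, assuming an element-wise binary operator over a 2-D float tensor, is:</p>
 <div class="highlight-c++ notranslate"><div class="highlight"><pre>// Sketch of the macro behind the generated declaration; the version shipped
 // with the codegen may differ in details.
 #define CSOURCE_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_)  \
   extern "C" void p_ID_(float* a, float* b, float* out) {     \
     for (int64_t i = 0; i &lt; p_DIM1_; ++i) {                \
       for (int64_t j = 0; j &lt; p_DIM2_; ++j) {              \
         int64_t k = i * p_DIM2_ + j;                          \
         out[k] = a[k] p_OP_ b[k];                             \
       }                                                       \
     }                                                         \
   }
 
 // CSOURCE_BINARY_OP_2D(gcc_0_0, *, 10, 10); thus defines gcc_0_0, an
 // element-wise 10x10 multiplication over two float buffers.
 </pre></div>
 </div>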
 <p><strong>2. Generate the function call</strong></p>
 <p>Example Result: <code class="docutils literal notranslate"><span class="pre">gcc_0_0(buf_1,</span> <span class="pre">gcc_input3,</span> <span class="pre">out);</span></code></p>
 <p>After generating the function declaration, we need to generate a function call with proper inputs and outputs. To know which inputs or buffers to pass when calling this function, we have to visit its arguments:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="kt">bool</span> <span class="n">first</span> <span class="o">=</span> <span class="nb">true</span><span class="p">;</span>
-<span class="n">decl_stream</span> <span class="o">&lt;&lt;</span> <span class="n">func_name</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;(&quot;</span><span class="p">;</span>
-<span class="k">for</span> <span class="p">(</span><span class="kt">size_t</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="n">call</span><span class="o">-&gt;</span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">();</span> <span class="o">++</span><span class="n">i</span><span class="p">)</span> <span class="p">{</span>
-  <span class="n">VisitExpr</span><span class="p">(</span><span class="n">call</span><span class="o">-&gt;</span><span class="n">args</span><span class="p">[</span><span class="n">i</span><span class="p">]);</span> <span class="c1">// Note 1</span>
-  <span class="k">for</span> <span class="p">(</span><span class="k">auto</span> <span class="nl">out</span> <span class="p">:</span> <span class="n">out_</span><span class="p">)</span> <span class="p">{</span>
-    <span class="k">if</span> <span class="p">(</span><span class="o">!</span><span class="n">first</span><span class="p">)</span> <span class="p">{</span>
-      <span class="n">decl_stream</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;, &quot;</span><span class="p">;</span>
-    <span class="p">}</span>
-    <span class="n">first</span> <span class="o">=</span> <span class="nb">false</span><span class="p">;</span>
-    <span class="n">decl_stream</span> <span class="o">&lt;&lt;</span> <span class="n">out</span><span class="p">.</span><span class="n">first</span><span class="p">;</span>
-  <span class="p">}</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="kt">bool</span><span class="w"> </span><span class="n">first</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nb">true</span><span class="p">;</span><span class="w"></span>
+<span class="n">decl_stream</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">func_name</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;(&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="kt">size_t</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">&lt;</span><span class="w"> </span><span class="n">call</span><span class="o">-&gt;</span><span class="n">args</span><span class="p [...]
+<span class="w">  </span><span class="n">VisitExpr</span><span class="p">(</span><span class="n">call</span><span class="o">-&gt;</span><span class="n">args</span><span class="p">[</span><span class="n">i</span><span class="p">]);</span><span class="w"> </span><span class="c1">// Note 1</span>
+<span class="w">  </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">out</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="n">out_</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="o">!</span><span class="n">first</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">      </span><span class="n">decl_stream</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;, &quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+<span class="w">    </span><span class="n">first</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nb">false</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="n">decl_stream</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">out</span><span class="p">.</span><span class="n">first</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 <span class="c1">// Note 2</span>
 </pre></div>
 </div>
@@ -551,34 +551,34 @@
 <p>Example Result: <code class="docutils literal notranslate"><span class="pre">float*</span> <span class="pre">buf_0</span> <span class="pre">=</span> <span class="pre">(float*)malloc(4</span> <span class="pre">*</span> <span class="pre">100);</span></code></p>
 <p>As mentioned in the previous step, in addition to the subgraph input and output tensors, we may also need buffers to keep the intermediate results. To generate the buffer, we extract the shape information to determine the buffer type and size:</p>
 <div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="c1">// This example only supports single output.</span>
-<span class="k">auto</span> <span class="n">type_node</span> <span class="o">=</span> <span class="n">call</span><span class="o">-&gt;</span><span class="n">checked_type</span><span class="p">().</span><span class="n">as</span><span class="o">&lt;</span><span class="n">TensorTypeNode</span><span class="o">&gt;</span><span class="p">();</span>
-<span class="n">ICHECK</span><span class="p">(</span><span class="n">type_node</span> <span class="o">!=</span> <span class="k">nullptr</span> <span class="o">&amp;&amp;</span> <span class="n">runtime</span><span class="o">::</span><span class="n">TypeMatch</span><span class="p">(</span><span class="n">type_node</span><span class="o">-&gt;</span><span class="n">dtype</span><span class="p">,</span> <span class="n">kDLFloat</span><span class="p">,</span> <span class="mi">32</span><span cla [...]
-      <span class="o">&lt;&lt;</span> <span class="s">&quot;Only support single output tensor with float type&quot;</span><span class="p">;</span>
+<span class="k">auto</span><span class="w"> </span><span class="n">type_node</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">call</span><span class="o">-&gt;</span><span class="n">checked_type</span><span class="p">().</span><span class="n">as</span><span class="o">&lt;</span><span class="n">TensorTypeNode</span><span class="o">&gt;</span><span class="p">();</span><span class="w"></span>
+<span class="n">ICHECK</span><span class="p">(</span><span class="n">type_node</span><span class="w"> </span><span class="o">!=</span><span class="w"> </span><span class="k">nullptr</span><span class="w"> </span><span class="o">&amp;&amp;</span><span class="w"> </span><span class="n">runtime</span><span class="o">::</span><span class="n">TypeMatch</span><span class="p">(</span><span class="n">type_node</span><span class="o">-&gt;</span><span class="n">dtype</span><span class="p">,</span> [...]
+<span class="w">      </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;Only support single output tensor with float type&quot;</span><span class="p">;</span><span class="w"></span>
 
 <span class="c1">// Generate a unique buffer name.</span>
-<span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">out</span> <span class="o">=</span> <span class="s">&quot;buf_&quot;</span> <span class="o">+</span> <span class="n">std</span><span class="o">::</span><span class="n">to_string</span><span class="p">(</span><span class="n">buf_idx_</span><span class="o">++</span><span class="p">);</span>
+<span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">out</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;buf_&quot;</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">to_string</span><span class="p">(</span><span class="n">buf_idx_</span><span class="o">++</span><span class="p"> [...]
 
 <span class="c1">// Extract the shape to be the buffer size.</span>
-<span class="k">auto</span> <span class="n">out_shape</span> <span class="o">=</span> <span class="n">GetShape</span><span class="p">(</span><span class="n">call</span><span class="o">-&gt;</span><span class="n">checked_type</span><span class="p">());</span>
-<span class="kt">int</span> <span class="n">out_size</span> <span class="o">=</span> <span class="mi">1</span><span class="p">;</span>
-<span class="k">for</span> <span class="p">(</span><span class="kt">size_t</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="n">out_shape</span><span class="p">.</span><span class="n">size</span><span class="p">();</span> <span class="o">++</span><span class="n">i</span><span class="p">)</span> <span class="p">{</span>
-  <span class="n">out_size</span> <span class="o">*=</span> <span class="n">out_shape</span><span class="p">[</span><span class="n">i</span><span class="p">];</span>
-<span class="p">}</span>
+<span class="k">auto</span><span class="w"> </span><span class="n">out_shape</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">GetShape</span><span class="p">(</span><span class="n">call</span><span class="o">-&gt;</span><span class="n">checked_type</span><span class="p">());</span><span class="w"></span>
+<span class="kt">int</span><span class="w"> </span><span class="n">out_size</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">1</span><span class="p">;</span><span class="w"></span>
+<span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="kt">size_t</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">&lt;</span><span class="w"> </span><span class="n">out_shape</span><span class="p">.</span><span class="n">size</span><span class=" [...]
+<span class="w">  </span><span class="n">out_size</span><span class="w"> </span><span class="o">*=</span><span class="w"> </span><span class="n">out_shape</span><span class="p">[</span><span class="n">i</span><span class="p">];</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 
 <span class="c1">// Make the buffer allocation and push to the buffer declarations.</span>
-<span class="n">buf_stream</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;float* &quot;</span> <span class="o">&lt;&lt;</span> <span class="n">out</span> <span class="o">&lt;&lt;</span> <span class="s">&quot; = (float*)std::malloc(4 * &quot;</span> <span class="o">&lt;&lt;</span> <span class="n">out_size</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;);&quot;</span><span class="p">;</span>
-<span class="n">buf_decl_</span><span class="p">.</span><span class="n">push_back</span><span class="p">(</span><span class="n">buf_stream</span><span class="p">.</span><span class="n">str</span><span class="p">());</span>
+<span class="n">buf_stream</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;float* &quot;</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">out</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot; = (float*)std::malloc(4 * &quot;</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class=" [...]
+<span class="n">buf_decl_</span><span class="p">.</span><span class="n">push_back</span><span class="p">(</span><span class="n">buf_stream</span><span class="p">.</span><span class="n">str</span><span class="p">());</span><span class="w"></span>
 </pre></div>
 </div>
 <p>After we have allocated the output buffer, we can now close the function call string and push the generated function call to the class variable <code class="docutils literal notranslate"><span class="pre">ext_func_body</span></code>.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">decl_stream</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;, &quot;</span> <span class="o">&lt;&lt;</span> <span class="n">out</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;);&quot;</span><span class="p">;</span>
-<span class="n">ext_func_body</span><span class="p">.</span><span class="n">push_back</span><span class="p">(</span><span class="n">decl_stream</span><span class="p">.</span><span class="n">str</span><span class="p">());</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">decl_stream</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;, &quot;</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">out</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;);&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="n">ext_func_body</span><span class="p">.</span><span class="n">push_back</span><span class="p">(</span><span class="n">decl_stream</span><span class="p">.</span><span class="n">str</span><span class="p">());</span><span class="w"></span>
 </pre></div>
 </div>
 <p><strong>4. Update output buffer</strong></p>
 <p>To let the next node, which accepts the output of the current call node as its input, know which buffer it should take, we need to update the class variable <code class="docutils literal notranslate"><span class="pre">out_</span></code> before leaving this visit function:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">out_</span><span class="p">.</span><span class="n">clear</span><span class="p">();</span>
-<span class="n">out_</span><span class="p">.</span><span class="n">push_back</span><span class="p">({</span><span class="n">out</span><span class="p">,</span> <span class="n">out_size</span><span class="p">});</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">out_</span><span class="p">.</span><span class="n">clear</span><span class="p">();</span><span class="w"></span>
+<span class="n">out_</span><span class="p">.</span><span class="n">push_back</span><span class="p">({</span><span class="n">out</span><span class="p">,</span><span class="w"> </span><span class="n">out_size</span><span class="p">});</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Congratulations! We have finished the most difficult function in this class. In the next two sections, we only need to fill in some minor missing parts of this function.</p>
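 <p>Putting the four steps together, a condensed sketch of the complete <code class="docutils literal notranslate"><span class="pre">VisitExpr_(const CallNode*)</span></code> implementation, assuming the member variables and helper functions introduced above, reads roughly as follows:</p>
 <div class="highlight-c++ notranslate"><div class="highlight"><pre>// Condensed sketch of the whole visitor; error handling and the remaining
 // operator cases are elided.
 void VisitExpr_(const CallNode* call) final {
   std::ostringstream macro_stream;
   std::ostringstream decl_stream;
   std::ostringstream buf_stream;
 
   // Step 1: emit the macro declaration for this operator.
   std::string func_name = ext_func_id_ + "_" + std::to_string(func_idx++);
   macro_stream &lt;&lt; "CSOURCE_BINARY_OP_" &lt;&lt; call-&gt;args.size() &lt;&lt; "D("
                &lt;&lt; func_name &lt;&lt; ", ";
   if (IsOp(call, "add")) {
     macro_stream &lt;&lt; "+";
   } else if (IsOp(call, "multiply")) {
     macro_stream &lt;&lt; "*";
   }
   auto in_shape = GetShape(call-&gt;args[0]-&gt;checked_type());
   for (size_t i = 0; i &lt; in_shape.size(); ++i) {
     macro_stream &lt;&lt; ", " &lt;&lt; in_shape[i];
   }
   macro_stream &lt;&lt; ");";
   func_decl_.push_back(macro_stream.str());
 
   // Step 2: open the function call and collect argument buffer names.
   bool first = true;
   decl_stream &lt;&lt; func_name &lt;&lt; "(";
   for (size_t i = 0; i &lt; call-&gt;args.size(); ++i) {
     VisitExpr(call-&gt;args[i]);
     for (auto out : out_) {
       if (!first) decl_stream &lt;&lt; ", ";
       first = false;
       decl_stream &lt;&lt; out.first;
     }
   }
 
   // Step 3: allocate the output buffer from the inferred shape.
   std::string out = "buf_" + std::to_string(buf_idx_++);
   auto out_shape = GetShape(call-&gt;checked_type());
   int out_size = 1;
   for (size_t i = 0; i &lt; out_shape.size(); ++i) out_size *= out_shape[i];
   buf_stream &lt;&lt; "float* " &lt;&lt; out &lt;&lt; " = (float*)std::malloc(4 * "
              &lt;&lt; out_size &lt;&lt; ");";
   buf_decl_.push_back(buf_stream.str());
 
   // Close the call with the output buffer and record the statement.
   decl_stream &lt;&lt; ", " &lt;&lt; out &lt;&lt; ");";
   ext_func_body.push_back(decl_stream.str());
 
   // Step 4: advertise the output buffer to the parent node.
   out_.clear();
   out_.push_back({out, out_size});
 }
 </pre></div>
 </div>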
@@ -587,11 +587,11 @@
 <h4>Code Generation for Input Variables<a class="headerlink" href="#code-generation-for-input-variables" title="Permalink to this headline">¶</a></h4>
 <p>Recall that we collected the input buffer information by visiting the arguments of a call node (2nd step in the previous section), and handled the case when its argument is another call node (4th step). In this section, we demonstrate how to handle other nodes by taking <code class="docutils literal notranslate"><span class="pre">VarNode</span></code> as an example.</p>
 <p><code class="docutils literal notranslate"><span class="pre">VarNode</span></code> represents input tensors in a model. The only but important information it has is a name hint (e.g., <code class="docutils literal notranslate"><span class="pre">data</span></code>, <code class="docutils literal notranslate"><span class="pre">weight</span></code>, etc). When visiting a <code class="docutils literal notranslate"><span class="pre">VarNode</span></code>, we simply update class variable <co [...]
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span> <span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span> <span class="n">VarNode</span><span class="o">*</span> <span class="n">node</span><span class="p">)</span> <span class="p">{</span>
-  <span class="n">ext_func_args_</span><span class="p">.</span><span class="n">push_back</span><span class="p">(</span><span class="n">node</span><span class="o">-&gt;</span><span class="n">name_hint</span><span class="p">());</span>
-  <span class="n">out_</span><span class="p">.</span><span class="n">clear</span><span class="p">();</span>
-  <span class="n">out_</span><span class="p">.</span><span class="n">push_back</span><span class="p">({</span><span class="n">node</span><span class="o">-&gt;</span><span class="n">name_hint</span><span class="p">(),</span> <span class="mi">0</span><span class="p">});</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span><span class="w"> </span><span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">VarNode</span><span class="o">*</span><span class="w"> </span><span class="n">node</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">ext_func_args_</span><span class="p">.</span><span class="n">push_back</span><span class="p">(</span><span class="n">node</span><span class="o">-&gt;</span><span class="n">name_hint</span><span class="p">());</span><span class="w"></span>
+<span class="w">  </span><span class="n">out_</span><span class="p">.</span><span class="n">clear</span><span class="p">();</span><span class="w"></span>
+<span class="w">  </span><span class="n">out_</span><span class="p">.</span><span class="n">push_back</span><span class="p">({</span><span class="n">node</span><span class="o">-&gt;</span><span class="n">name_hint</span><span class="p">(),</span><span class="w"> </span><span class="mi">0</span><span class="p">});</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Note that in this example we assume the subgraph we are offloading has only call nodes and variable nodes. If your subgraphs contain other types of nodes, such as <code class="docutils literal notranslate"><span class="pre">TupleNode</span></code>, then you also need to visit them and forward the output buffer information, as the sketch below illustrates.</p>
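 <p>A minimal sketch of such a visitor for <code class="docutils literal notranslate"><span class="pre">TupleGetItemNode</span></code> (hypothetical here, since the subgraphs in this example never produce tuples) would visit the tuple expression and keep only the output entry of the selected field:</p>
 <div class="highlight-c++ notranslate"><div class="highlight"><pre>// Hypothetical sketch: forward the buffer recorded for the selected tuple field.
 void VisitExpr_(const TupleGetItemNode* op) final {
   VisitExpr(op-&gt;tuple);
   ICHECK_GT(out_.size(), static_cast&lt;size_t&gt;(op-&gt;index));
   out_ = {out_[op-&gt;index]};
 }
 </pre></div>
 </div>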
@@ -599,11 +599,11 @@
 <div class="section" id="code-emitting">
 <h4>Code Emitting<a class="headerlink" href="#code-emitting" title="Permalink to this headline">¶</a></h4>
 <p>The final part of this codegen class is a <code class="docutils literal notranslate"><span class="pre">JIT</span></code> function that emits a C function for the subgraph and uses the C code we just generated as the function body. Remember, in addition to the subgraph function we generated in the previous sections, we also need a wrapper function with unified arguments for the TVM runtime to invoke and pass data. Fortunately, the base class we inherited already provides an implementation [...]
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">JitImpl</span><span class="p">(</span><span class="s">&quot;gcc_0&quot;</span> <span class="cm">/* Subgraph symbol (ID) */</span><span class="p">,</span>
-        <span class="p">{</span><span class="s">&quot;gcc_input0&quot;</span><span class="p">,</span> <span class="s">&quot;gcc_input1&quot;</span><span class="p">,</span> <span class="s">&quot;gcc_input2&quot;</span><span class="p">,</span> <span class="s">&quot;gcc_input3&quot;</span><span class="p">}</span> <span class="cm">/* Input arguments */</span><span class="p">,</span>
-        <span class="p">{</span><span class="s">&quot;float *buf_0 = (float*)malloc(4 * 20)&quot;</span><span class="p">,</span> <span class="p">...}</span> <span class="cm">/* Buffer allocations */</span><span class="p">,</span>
-        <span class="p">{</span><span class="s">&quot;gcc_0_2(gcc_input0, gcc_input1, buf_0);&quot;</span><span class="p">}</span> <span class="cm">/* Function body */</span><span class="p">,</span>
-        <span class="p">{</span><span class="s">&quot;out&quot;</span><span class="p">}</span> <span class="cm">/* Output */</span><span class="p">);</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">JitImpl</span><span class="p">(</span><span class="s">&quot;gcc_0&quot;</span><span class="w"> </span><span class="cm">/* Subgraph symbol (ID) */</span><span class="p">,</span><span class="w"></span>
+<span class="w">        </span><span class="p">{</span><span class="s">&quot;gcc_input0&quot;</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;gcc_input1&quot;</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;gcc_input2&quot;</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;gcc_input3&quot;</span><span class="p">}</span><span class="w"> </span><span class="cm">/* Input arguments */</span><span class="p">,</s [...]
+<span class="w">        </span><span class="p">{</span><span class="s">&quot;float *buf_0 = (float*)malloc(4 * 20)&quot;</span><span class="p">,</span><span class="w"> </span><span class="p">...}</span><span class="w"> </span><span class="cm">/* Buffer allocations */</span><span class="p">,</span><span class="w"></span>
+<span class="w">        </span><span class="p">{</span><span class="s">&quot;gcc_0_2(gcc_input0, gcc_input1, buf_0);&quot;</span><span class="p">}</span><span class="w"> </span><span class="cm">/* Function body */</span><span class="p">,</span><span class="w"></span>
+<span class="w">        </span><span class="p">{</span><span class="s">&quot;out&quot;</span><span class="p">}</span><span class="w"> </span><span class="cm">/* Output */</span><span class="p">);</span><span class="w"></span>
 </pre></div>
 </div>
 <p>The above call will generate three functions (one from the TVM wrapper macro):</p>
@@ -613,13 +613,13 @@
 <li><p>The TVM runtime compatible function <code class="docutils literal notranslate"><span class="pre">gcc_0</span></code> with TVM unified function arguments that unpacks TVM packed tensors and invokes <code class="docutils literal notranslate"><span class="pre">gcc_0__wrapper_</span></code>.</p></li>
 </ol>
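 <p>As a rough illustration only (the exact code is produced by <code class="docutils literal notranslate"><span class="pre">JitImpl</span></code>; the argument counts below are assumptions for a two-input subgraph), the three functions have shapes like:</p>
 <div class="highlight-c++ notranslate"><div class="highlight"><pre>// 1. The subgraph function generated in the previous sections.
 extern "C" void gcc_0_(float* gcc_input0, float* gcc_input1, float* gcc_out) {
   // ... buffer allocations and operator calls ...
 }
 
 // 2. A wrapper that casts DLTensor handles to raw data pointers.
 extern "C" int gcc_0__wrapper_(DLTensor* arg0, DLTensor* arg1, DLTensor* out) {
   gcc_0_(static_cast&lt;float*&gt;(arg0-&gt;data), static_cast&lt;float*&gt;(arg1-&gt;data),
          static_cast&lt;float*&gt;(out-&gt;data));
   return 0;
 }
 
 // 3. The TVM wrapper macro generates the runtime-compatible entry gcc_0.
 TVM_DLL_EXPORT_TYPED_FUNC(gcc_0, gcc_0__wrapper_);
 </pre></div>
 </div>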
 <p>Accordingly, the only thing we need in the <code class="docutils literal notranslate"><span class="pre">JIT</span></code> implementation is to pass all the subgraph function code we generated to <code class="docutils literal notranslate"><span class="pre">JitImpl</span></code>:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">JIT</span><span class="p">()</span> <span class="p">{</span>
-  <span class="c1">// Write function macros</span>
-  <span class="k">for</span> <span class="p">(</span><span class="k">auto</span> <span class="nl">decl</span> <span class="p">:</span> <span class="n">func_decl_</span><span class="p">)</span> <span class="p">{</span>
-    <span class="n">code_stream_</span> <span class="o">&lt;&lt;</span> <span class="n">decl</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span>
-  <span class="p">}</span>
-  <span class="k">return</span> <span class="n">JitImpl</span><span class="p">(</span><span class="n">ext_func_id_</span><span class="p">,</span> <span class="n">ext_func_args_</span><span class="p">,</span> <span class="n">buf_decl_</span><span class="p">,</span> <span class="n">ext_func_body</span><span class="p">,</span> <span class="n">out_</span><span class="p">);</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="nf">JIT</span><span class="p">()</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="c1">// Write function macros</span>
+<span class="w">  </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">decl</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="n">func_decl_</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="n">code_stream_</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">decl</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+<span class="w">  </span><span class="k">return</span><span class="w"> </span><span class="n">JitImpl</span><span class="p">(</span><span class="n">ext_func_id_</span><span class="p">,</span><span class="w"> </span><span class="n">ext_func_args_</span><span class="p">,</span><span class="w"> </span><span class="n">buf_decl_</span><span class="p">,</span><span class="w"> </span><span class="n">ext_func_body</span><span class="p">,</span><span class="w"> </span><span class="n">out_</span>< [...]
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>All the variables (<code class="docutils literal notranslate"><span class="pre">ext_func_id_</span></code>, etc.) we passed are class variables that were filled in when we traversed the subgraph.</p>
@@ -628,50 +628,50 @@
 <div class="section" id="implement-csourcecodegen">
 <h3>Implement CSourceCodegen<a class="headerlink" href="#implement-csourcecodegen" title="Permalink to this headline">¶</a></h3>
 <p>Again, let’s create a class skeleton and implement the required functions. Note that it inherits <code class="docutils literal notranslate"><span class="pre">CSourceModuleCodegenBase</span></code>.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">class</span> <span class="nc">CSourceCodegen</span> <span class="o">:</span> <span class="k">public</span> <span class="n">CSourceModuleCodegenBase</span> <span class="p">{</span>
- <span class="k">public</span><span class="o">:</span>
-  <span class="c1">// Pass a subgraph function, and generate the C code.</span>
-  <span class="kt">void</span> <span class="n">GenCFunc</span><span class="p">(</span><span class="k">const</span> <span class="n">Function</span><span class="o">&amp;</span> <span class="n">func</span><span class="p">)</span> <span class="p">{</span> <span class="p">;</span> <span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">CSourceCodegen</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">CSourceModuleCodegenBase</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w"> </span><span class="k">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">  </span><span class="c1">// Pass a subgraph function, and generate the C code.</span>
+<span class="w">  </span><span class="kt">void</span><span class="w"> </span><span class="n">GenCFunc</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">Function</span><span class="o">&amp;</span><span class="w"> </span><span class="n">func</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="p">;</span><span class="w"> </span><span class="p">}</span><span class="w"></span>
 
-  <span class="c1">// Use GenCFunc to generate the C code and wrap it as a C source module.</span>
-  <span class="n">runtime</span><span class="o">::</span><span class="n">Module</span> <span class="n">CreateCSourceModule</span><span class="p">(</span><span class="k">const</span> <span class="n">NodeRef</span><span class="o">&amp;</span> <span class="n">ref</span><span class="p">)</span> <span class="k">override</span> <span class="p">{</span> <span class="p">;</span> <span class="p">}</span>
+<span class="w">  </span><span class="c1">// Use GenCFunc to generate the C code and wrap it as a C source module.</span>
+<span class="w">  </span><span class="n">runtime</span><span class="o">::</span><span class="n">Module</span><span class="w"> </span><span class="n">CreateCSourceModule</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">NodeRef</span><span class="o">&amp;</span><span class="w"> </span><span class="n">ref</span><span class="p">)</span><span class="w"> </span><span class="k">override</span><span class="w"> </span><span class="p">{</span><span  [...]
 
- <span class="k">private</span><span class="o">:</span>
-  <span class="n">std</span><span class="o">::</span><span class="n">ostringstream</span> <span class="n">code_stream_</span><span class="p">;</span>
-<span class="p">};</span>
+<span class="w"> </span><span class="k">private</span><span class="o">:</span><span class="w"></span>
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">ostringstream</span><span class="w"> </span><span class="n">code_stream_</span><span class="p">;</span><span class="w"></span>
+<span class="p">};</span><span class="w"></span>
 </pre></div>
 </div>
 <div class="section" id="implement-gencfunc">
 <h4>Implement GenCFunc<a class="headerlink" href="#implement-gencfunc" title="Permalink to this headline">¶</a></h4>
 <p><code class="docutils literal notranslate"><span class="pre">GenCFunc</span></code> simply uses the <code class="docutils literal notranslate"><span class="pre">CodegenC</span></code> we just implemented to traverse a Relay function (subgraph) and obtains the generated C code. The builtin function <code class="docutils literal notranslate"><span class="pre">GetExtSymbol</span></code> retrieves a unique symbol name (e.g., <code class="docutils literal notranslate"><span class="pre">gcc [...]
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span> <span class="nf">GenCFunc</span><span class="p">(</span><span class="k">const</span> <span class="n">Function</span><span class="o">&amp;</span> <span class="n">func</span><span class="p">)</span> <span class="p">{</span>
-  <span class="n">ICHECK</span><span class="p">(</span><span class="n">func</span><span class="p">.</span><span class="n">defined</span><span class="p">())</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;Input error: expect a Relay function.&quot;</span><span class="p">;</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span><span class="w"> </span><span class="nf">GenCFunc</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">Function</span><span class="o">&amp;</span><span class="w"> </span><span class="n">func</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">ICHECK</span><span class="p">(</span><span class="n">func</span><span class="p">.</span><span class="n">defined</span><span class="p">())</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;Input error: expect a Relay function.&quot;</span><span class="p">;</span><span class="w"></span>
 
-  <span class="c1">// Record the external symbol for runtime lookup.</span>
-  <span class="k">auto</span> <span class="n">sid</span> <span class="o">=</span> <span class="n">GetExtSymbol</span><span class="p">(</span><span class="n">func</span><span class="p">);</span>
+<span class="w">  </span><span class="c1">// Record the external symbol for runtime lookup.</span>
+<span class="w">  </span><span class="k">auto</span><span class="w"> </span><span class="n">sid</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">GetExtSymbol</span><span class="p">(</span><span class="n">func</span><span class="p">);</span><span class="w"></span>
 
-  <span class="n">CodeGenC</span> <span class="n">builder</span><span class="p">(</span><span class="n">sid</span><span class="p">);</span>
-  <span class="n">builder</span><span class="p">.</span><span class="n">VisitExpr</span><span class="p">(</span><span class="n">func</span><span class="o">-&gt;</span><span class="n">body</span><span class="p">);</span>
-  <span class="n">code_stream_</span> <span class="o">&lt;&lt;</span> <span class="n">builder</span><span class="p">.</span><span class="n">JIT</span><span class="p">();</span>
-<span class="p">}</span>
+<span class="w">  </span><span class="n">CodeGenC</span><span class="w"> </span><span class="n">builder</span><span class="p">(</span><span class="n">sid</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="n">builder</span><span class="p">.</span><span class="n">VisitExpr</span><span class="p">(</span><span class="n">func</span><span class="o">-&gt;</span><span class="n">body</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="n">code_stream_</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">builder</span><span class="p">.</span><span class="n">JIT</span><span class="p">();</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 </div>
 <div class="section" id="implement-createcsourcemodule">
 <h4>Implement CreateCSourceModule<a class="headerlink" href="#implement-createcsourcemodule" title="Permalink to this headline">¶</a></h4>
 <p>This function creates a runtime module for the external library. In this example, we create a CSourceModule that can be directly compiled and linked together with a TVM-generated DSOModule. After you have implemented <code class="docutils literal notranslate"><span class="pre">CodegenC</span></code>, implementing this function is relatively straightforward:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">runtime</span><span class="o">::</span><span class="n">Module</span> <span class="n">CreateCSourceModule</span><span class="p">(</span><span class="k">const</span> <span class="n">NodeRef</span><span class="o">&amp;</span> <span class="n">ref</span><span class="p">)</span> <span class="k">override</span> <span class="p">{</span>
-  <span class="c1">// Create headers</span>
-  <span class="n">code_stream_</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;#include &lt;cstdint&gt;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span>
-  <span class="n">code_stream_</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;#include &lt;iostream&gt;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span>
-  <span class="n">code_stream_</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;#include &lt;cstdlib&gt;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span>
-  <span class="n">code_stream_</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;#include &lt;stdio.h&gt;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span>
-  <span class="n">code_stream_</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;#include &lt;cstring&gt;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span>
-  <span class="n">code_stream_</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;#include &lt;tvm/runtime/c_runtime_api.h&gt;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span>
-  <span class="n">code_stream_</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;#include &lt;dlpack/dlpack.h&gt;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span>
-
-  <span class="c1">// Append some common macro for operator definition.</span>
-  <span class="k">const</span> <span class="kt">char</span><span class="o">*</span> <span class="n">operator_macro</span> <span class="o">=</span> <span class="sa">R</span><span class="s">&quot;</span><span class="dl">op_macro(</span><span class="s"></span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">runtime</span><span class="o">::</span><span class="n">Module</span><span class="w"> </span><span class="nf">CreateCSourceModule</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">NodeRef</span><span class="o">&amp;</span><span class="w"> </span><span class="n">ref</span><span class="p">)</span><span class="w"> </span><span class="k">override</spa [...]
+<span class="w">  </span><span class="c1">// Create headers</span>
+<span class="w">  </span><span class="n">code_stream_</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;#include &lt;cstdint&gt;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">code_stream_</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;#include &lt;iostream&gt;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">code_stream_</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;#include &lt;cstdlib&gt;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">code_stream_</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;#include &lt;stdio.h&gt;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">code_stream_</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;#include &lt;cstring&gt;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">code_stream_</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;#include &lt;tvm/runtime/c_runtime_api.h&gt;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">code_stream_</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;#include &lt;dlpack/dlpack.h&gt;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span><span class="w"></span>
+
+<span class="w">  </span><span class="c1">// Append some common macro for operator definition.</span>
+<span class="w">  </span><span class="k">const</span><span class="w"> </span><span class="kt">char</span><span class="o">*</span><span class="w"> </span><span class="n">operator_macro</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="sa">R</span><span class="s">&quot;</span><span class="dl">op_macro(</span><span class="s"></span>
 <span class="s">  #define CSOURCE_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_)       \</span>
 <span class="s">    extern &quot;C&quot; void p_ID_(float* a, float* b, float* out) { \</span>
 <span class="s">      for (int64_t i = 0; i &lt; p_DIM1_; ++i) {               \</span>
@@ -688,28 +688,28 @@
 <span class="s">        }                                                       \</span>
 <span class="s">      }                                                         \</span>
 <span class="s">    }</span>
-<span class="s">  </span><span class="dl">)op_macro</span><span class="s">&quot;</span><span class="p">;</span>
-
-  <span class="n">code_stream_</span> <span class="o">&lt;&lt;</span> <span class="n">operator_macro</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;</span><span class="se">\n\n</span><span class="s">&quot;</span><span class="p">;</span>
-
-  <span class="c1">// Generate C code for the subgraph.</span>
-  <span class="k">if</span> <span class="p">(</span><span class="n">ref</span><span class="o">-&gt;</span><span class="n">IsInstance</span><span class="o">&lt;</span><span class="n">FunctionNode</span><span class="o">&gt;</span><span class="p">())</span> <span class="p">{</span>
-    <span class="n">GenCFunc</span><span class="p">(</span><span class="n">Downcast</span><span class="o">&lt;</span><span class="n">Function</span><span class="o">&gt;</span><span class="p">(</span><span class="n">ref</span><span class="p">));</span>
-  <span class="p">}</span> <span class="k">else</span> <span class="k">if</span> <span class="p">(</span><span class="n">ref</span><span class="o">-&gt;</span><span class="n">IsInstance</span><span class="o">&lt;</span><span class="n">relay</span><span class="o">::</span><span class="n">ModuleNode</span><span class="o">&gt;</span><span class="p">())</span> <span class="p">{</span>
-    <span class="n">relay</span><span class="o">::</span><span class="n">Module</span> <span class="n">mod</span> <span class="o">=</span> <span class="n">Downcast</span><span class="o">&lt;</span><span class="n">relay</span><span class="o">::</span><span class="n">Module</span><span class="o">&gt;</span><span class="p">(</span><span class="n">ref</span><span class="p">);</span>
-    <span class="k">for</span> <span class="p">(</span><span class="k">const</span> <span class="k">auto</span><span class="o">&amp;</span> <span class="nl">it</span> <span class="p">:</span> <span class="n">mod</span><span class="o">-&gt;</span><span class="n">functions</span><span class="p">)</span> <span class="p">{</span>
-      <span class="n">GenCFunc</span><span class="p">(</span><span class="n">Downcast</span><span class="o">&lt;</span><span class="n">Function</span><span class="o">&gt;</span><span class="p">(</span><span class="n">it</span><span class="p">.</span><span class="n">second</span><span class="p">));</span>
-    <span class="p">}</span>
-  <span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
-    <span class="n">LOG</span><span class="p">(</span><span class="n">FATAL</span><span class="p">)</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;The input ref is expected to be a Relay function or module&quot;</span>
-               <span class="o">&lt;&lt;</span> <span class="s">&quot;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span>
-  <span class="p">}</span>
-
-  <span class="c1">// Create a CSourceModule</span>
-  <span class="k">const</span> <span class="k">auto</span><span class="o">*</span> <span class="n">pf</span> <span class="o">=</span> <span class="n">runtime</span><span class="o">::</span><span class="n">Registry</span><span class="o">::</span><span class="n">Get</span><span class="p">(</span><span class="s">&quot;module.csource_module_create&quot;</span><span class="p">);</span>
-  <span class="n">ICHECK</span><span class="p">(</span><span class="n">pf</span> <span class="o">!=</span> <span class="k">nullptr</span><span class="p">)</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;Cannot find csource module to create the external runtime module&quot;</span><span class="p">;</span>
-  <span class="k">return</span> <span class="p">(</span><span class="o">*</span><span class="n">pf</span><span class="p">)(</span><span class="n">code_stream_</span><span class="p">.</span><span class="n">str</span><span class="p">(),</span> <span class="s">&quot;cc&quot;</span><span class="p">);</span>
-<span class="p">}</span>
+<span class="s">  </span><span class="dl">)op_macro</span><span class="s">&quot;</span><span class="p">;</span><span class="w"></span>
+
+<span class="w">  </span><span class="n">code_stream_</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">operator_macro</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;</span><span class="se">\n\n</span><span class="s">&quot;</span><span class="p">;</span><span class="w"></span>
+
+<span class="w">  </span><span class="c1">// Generate C code for the subgraph.</span>
+<span class="w">  </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">ref</span><span class="o">-&gt;</span><span class="n">IsInstance</span><span class="o">&lt;</span><span class="n">FunctionNode</span><span class="o">&gt;</span><span class="p">())</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="n">GenCFunc</span><span class="p">(</span><span class="n">Downcast</span><span class="o">&lt;</span><span class="n">Function</span><span class="o">&gt;</span><span class="p">(</span><span class="n">ref</span><span class="p">));</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">ref</span><span class="o">-&gt;</span><span class="n">IsInstance</span><span class="o">&lt;</span><span class="n">relay</span><span class="o">::</span><span class="n">ModuleNode</span><span class="o">&gt;</span><span class="p">())</span><span class="w"> </span><span class="p">{< [...]
+<span class="w">    </span><span class="n">relay</span><span class="o">::</span><span class="n">Module</span><span class="w"> </span><span class="n">mod</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Downcast</span><span class="o">&lt;</span><span class="n">relay</span><span class="o">::</span><span class="n">Module</span><span class="o">&gt;</span><span class="p">(</span><span class="n">ref</span><span class="p">);</span><span class="w"></span>
+<span class="w">    </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="k">auto</span><span class="o">&amp;</span><span class="w"> </span><span class="n">it</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="n">mod</span><span class="o">-&gt;</span><span class="n">functions</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span [...]
+<span class="w">      </span><span class="n">GenCFunc</span><span class="p">(</span><span class="n">Downcast</span><span class="o">&lt;</span><span class="n">Function</span><span class="o">&gt;</span><span class="p">(</span><span class="n">it</span><span class="p">.</span><span class="n">second</span><span class="p">));</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="n">LOG</span><span class="p">(</span><span class="n">FATAL</span><span class="p">)</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;The input ref is expected to be a Relay function or module&quot;</span><span class="w"></span>
+<span class="w">               </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+
+<span class="w">  </span><span class="c1">// Create a CSourceModule</span>
+<span class="w">  </span><span class="k">const</span><span class="w"> </span><span class="k">auto</span><span class="o">*</span><span class="w"> </span><span class="n">pf</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">runtime</span><span class="o">::</span><span class="n">Registry</span><span class="o">::</span><span class="n">Get</span><span class="p">(</span><span class="s">&quot;module.csource_module_create&quot;</span><span class="p">);< [...]
+<span class="w">  </span><span class="n">ICHECK</span><span class="p">(</span><span class="n">pf</span><span class="w"> </span><span class="o">!=</span><span class="w"> </span><span class="k">nullptr</span><span class="p">)</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;Cannot find csource module to create the external runtime module&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="k">return</span><span class="w"> </span><span class="p">(</span><span class="o">*</span><span class="n">pf</span><span class="p">)(</span><span class="n">code_stream_</span><span class="p">.</span><span class="n">str</span><span class="p">(),</span><span class="w"> </span><span class="s">&quot;cc&quot;</span><span class="p">);</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 </div>
@@ -717,26 +717,26 @@
 <div class="section" id="register-your-codegen">
 <h3>Register Your Codegen<a class="headerlink" href="#register-your-codegen" title="Permalink to this headline">¶</a></h3>
 <p>The last step is registering your codegen with the TVM backend. We first implement a simple function to invoke our codegen and generate a runtime module.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">runtime</span><span class="o">::</span><span class="n">Module</span> <span class="n">CCompiler</span><span class="p">(</span><span class="k">const</span> <span class="n">NodeRef</span><span class="o">&amp;</span> <span class="n">ref</span><span class="p">)</span> <span class="p">{</span>
-  <span class="n">CSourceCodegen</span> <span class="n">csource</span><span class="p">;</span>
-  <span class="k">return</span> <span class="n">csource</span><span class="p">.</span><span class="n">CreateCSourceModule</span><span class="p">(</span><span class="n">ref</span><span class="p">);</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">runtime</span><span class="o">::</span><span class="n">Module</span><span class="w"> </span><span class="nf">CCompiler</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">NodeRef</span><span class="o">&amp;</span><span class="w"> </span><span class="n">ref</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w" [...]
+<span class="w">  </span><span class="n">CSourceCodegen</span><span class="w"> </span><span class="n">csource</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="k">return</span><span class="w"> </span><span class="n">csource</span><span class="p">.</span><span class="n">CreateCSourceModule</span><span class="p">(</span><span class="n">ref</span><span class="p">);</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Finally, we register this function with the TVM backend:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;relay.ext.ccompiler&quot;</span><span class="p">).</span><span class="n">set_body_typed</span><span class="p">(</span><span class="n">CCompiler</span><span class="p">);</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;relay.ext.ccompiler&quot;</span><span class="p">).</span><span class="n">set_body_typed</span><span class="p">(</span><span class="n">CCompiler</span><span class="p">);</span><span class="w"></span>
 </pre></div>
 </div>
 <p>where <code class="docutils literal notranslate"><span class="pre">ccompiler</span></code> is a custom tag that lets TVM know this is the codegen it should use to generate and offload subgraphs whenever a subgraph is annotated with <code class="docutils literal notranslate"><span class="pre">ccompiler</span></code>.</p>
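 <p>As a quick illustration, once registered, the codegen can be looked up and invoked through the global registry like any other packed function. The snippet below is only a hedged sketch, not part of the original flow; <code class="docutils literal notranslate"><span class="pre">ref</span></code> stands for a subgraph annotated with our tag:</p>
 <div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span>// Hedged sketch: fetch the codegen registered above and build a module.
 // `ref` is assumed to be a Relay function/module annotated with "ccompiler".
 const auto* ext_gen = tvm::runtime::Registry::Get("relay.ext.ccompiler");
 ICHECK(ext_gen != nullptr) &lt;&lt; "relay.ext.ccompiler is not registered";
 tvm::runtime::Module mod = (*ext_gen)(ref);  // Equivalent to CCompiler(ref).
 </pre></div>
 </div>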
 <p>In addition, it is good practice to set up a CMake configuration flag so that your compiler is included only for the users who need it. We first create a cmake file: <code class="docutils literal notranslate"><span class="pre">cmake/modules/contrib/CODEGENC.cmake</span></code>:</p>
 <div class="highlight-cmake notranslate"><div class="highlight"><pre><span></span><span class="nb">if</span><span class="p">(</span><span class="s">USE_CODEGENC</span><span class="p">)</span>
-  <span class="nb">file</span><span class="p">(</span><span class="s">GLOB</span> <span class="s">CSOURCE_RELAY_CONTRIB_SRC</span> <span class="s">src/relay/backend/contrib/codegen_c/codegen.cc</span><span class="p">)</span>
-  <span class="nb">list</span><span class="p">(</span><span class="s">APPEND</span> <span class="s">COMPILER_SRCS</span> <span class="o">${</span><span class="nv">CSOURCE_RELAY_CONTRIB_SRC</span><span class="o">}</span><span class="p">)</span>
+<span class="w">  </span><span class="nb">file</span><span class="p">(</span><span class="s">GLOB</span><span class="w"> </span><span class="s">CSOURCE_RELAY_CONTRIB_SRC</span><span class="w"> </span><span class="s">src/relay/backend/contrib/codegen_c/codegen.cc</span><span class="p">)</span>
+<span class="w">  </span><span class="nb">list</span><span class="p">(</span><span class="s">APPEND</span><span class="w"> </span><span class="s">COMPILER_SRCS</span><span class="w"> </span><span class="o">${</span><span class="nv">CSOURCE_RELAY_CONTRIB_SRC</span><span class="o">}</span><span class="p">)</span>
 <span class="nb">endif</span><span class="p">(</span><span class="s">USE_CODEGENC</span><span class="p">)</span>
 </pre></div>
 </div>
 <p>Users can then configure whether to include your compiler when configuring TVM in <code class="docutils literal notranslate"><span class="pre">config.cmake</span></code>:</p>
-<div class="highlight-cmake notranslate"><div class="highlight"><pre><span></span><span class="nb">set</span><span class="p">(</span><span class="s">USE_CODEGENC</span> <span class="s">ON</span><span class="p">)</span>
+<div class="highlight-cmake notranslate"><div class="highlight"><pre><span></span><span class="nb">set</span><span class="p">(</span><span class="s">USE_CODEGENC</span><span class="w"> </span><span class="s">ON</span><span class="p">)</span>
 </pre></div>
 </div>
 </div>
@@ -769,14 +769,14 @@
 </div>
 <p>The <code class="docutils literal notranslate"><span class="pre">input</span></code> keyword declares an input tensor with its ID and shape, while the other statements describe computations in <code class="docutils literal notranslate"><span class="pre">&lt;op&gt;</span> <span class="pre">&lt;output</span> <span class="pre">ID&gt;</span> <span class="pre">inputs:</span> <span class="pre">[input</span> <span class="pre">ID]</span> <span class="pre">shape:</span> <span class="pre">[sha [...]
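 <p>For instance, a statement such as the one below (a hedged illustration added here for clarity) declares an <code class="docutils literal notranslate"><span class="pre">add</span></code> operator whose output ID is 4, whose inputs are the tensors with IDs 0 and 1, and whose output shape is 10x10:</p>
 <div class="highlight-none notranslate"><div class="highlight"><pre><span></span>add 4 inputs: 0 1 shape: 10 10
 </pre></div>
 </div>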
 <p>In this section, our goal is to implement the following customized TVM runtime module to execute ExampleJSON graphs.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">runtime</span><span class="o">::</span><span class="n">Module</span> <span class="n">ExampleJsonCompiler</span><span class="p">(</span><span class="k">const</span> <span class="n">NodeRef</span><span class="o">&amp;</span> <span class="n">ref</span><span class="p">)</span> <span class="p">{</span>
-    <span class="n">ExampleJsonCodeGen</span> <span class="n">codegen</span><span class="p">(</span><span class="n">ref</span><span class="p">);</span>
-    <span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">code</span> <span class="o">=</span> <span class="n">codegen</span><span class="p">.</span><span class="n">gen</span><span class="p">();</span> <span class="c1">// Note 1</span>
-    <span class="k">const</span> <span class="k">auto</span><span class="o">*</span> <span class="n">pf</span> <span class="o">=</span> <span class="n">runtime</span><span class="o">::</span><span class="n">Registry</span><span class="o">::</span><span class="n">Get</span><span class="p">(</span><span class="s">&quot;module.examplejson_module_create&quot;</span><span class="p">);</span> <span class="c1">// Note 2</span>
-    <span class="n">ICHECK</span><span class="p">(</span><span class="n">pf</span> <span class="o">!=</span> <span class="k">nullptr</span><span class="p">)</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;Cannot find ExampleJson module to create the external runtime module&quot;</span><span class="p">;</span>
-    <span class="k">return</span> <span class="p">(</span><span class="o">*</span><span class="n">pf</span><span class="p">)(</span><span class="n">code</span><span class="p">);</span>
-<span class="p">}</span>
-<span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;relay.ext.examplejsoncompiler&quot;</span><span class="p">).</span><span class="n">set_body_typed</span><span class="p">(</span><span class="n">ExampleJsonCompiler</span><span class="p">);</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">runtime</span><span class="o">::</span><span class="n">Module</span><span class="w"> </span><span class="nf">ExampleJsonCompiler</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">NodeRef</span><span class="o">&amp;</span><span class="w"> </span><span class="n">ref</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span [...]
+<span class="w">    </span><span class="n">ExampleJsonCodeGen</span><span class="w"> </span><span class="n">codegen</span><span class="p">(</span><span class="n">ref</span><span class="p">);</span><span class="w"></span>
+<span class="w">    </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">code</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">codegen</span><span class="p">.</span><span class="n">gen</span><span class="p">();</span><span class="w"> </span><span class="c1">// Note 1</span>
+<span class="w">    </span><span class="k">const</span><span class="w"> </span><span class="k">auto</span><span class="o">*</span><span class="w"> </span><span class="n">pf</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">runtime</span><span class="o">::</span><span class="n">Registry</span><span class="o">::</span><span class="n">Get</span><span class="p">(</span><span class="s">&quot;module.examplejson_module_create&quot;</span><span class=" [...]
+<span class="w">    </span><span class="n">ICHECK</span><span class="p">(</span><span class="n">pf</span><span class="w"> </span><span class="o">!=</span><span class="w"> </span><span class="k">nullptr</span><span class="p">)</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;Cannot find ExampleJson module to create the external runtime module&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="p">(</span><span class="o">*</span><span class="n">pf</span><span class="p">)(</span><span class="n">code</span><span class="p">);</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
+<span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;relay.ext.examplejsoncompiler&quot;</span><span class="p">).</span><span class="n">set_body_typed</span><span class="p">(</span><span class="n">ExampleJsonCompiler</span><span class="p">);</span><span class="w"></span>
 </pre></div>
 </div>
 <p><strong>Note 1</strong>: We will implement a customized codegen later to generate an ExampleJSON code string from a subgraph.</p>
@@ -785,47 +785,47 @@
 <div class="section" id="implement-examplejsoncodegen">
 <h3>Implement ExampleJsonCodeGen<a class="headerlink" href="#implement-examplejsoncodegen" title="Permalink to this headline">¶</a></h3>
 <p>Similar to the C codegen, we also derive <code class="docutils literal notranslate"><span class="pre">ExampleJsonCodeGen</span></code> from <code class="docutils literal notranslate"><span class="pre">ExprVisitor</span></code> to make use of the visitor pattern for traversing subgraphs. On the other hand, we do not have to inherit <code class="docutils literal notranslate"><span class="pre">CodegenCBase</span></code> because we do not need the TVM C++ wrappers. The codegen class is implemente [...]
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="cp">#include</span> <span class="cpf">&lt;tvm/relay/expr_functor.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;tvm/relay/transform.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;tvm/relay/type.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;tvm/runtime/module.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;tvm/runtime/object.h&gt;</span><span class="cp"></span>
-
-<span class="cp">#include</span> <span class="cpf">&lt;fstream&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;sstream&gt;</span><span class="cp"></span>
-
-<span class="k">namespace</span> <span class="n">tvm</span> <span class="p">{</span>
-<span class="k">namespace</span> <span class="n">relay</span> <span class="p">{</span>
-<span class="k">namespace</span> <span class="n">contrib</span> <span class="p">{</span>
-
-<span class="k">class</span> <span class="nc">ExampleJsonCodeGen</span> <span class="o">:</span> <span class="k">public</span> <span class="n">ExprVisitor</span> <span class="p">{</span>
-  <span class="k">public</span><span class="o">:</span>
-    <span class="k">explicit</span> <span class="n">ExampleJsonCodeGen</span><span class="p">();</span>
-
-    <span class="c1">// Note 1</span>
-    <span class="kt">void</span> <span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span> <span class="n">VarNode</span><span class="o">*</span> <span class="n">node</span><span class="p">)</span> <span class="p">{</span> <span class="cm">/* Skip in this example. */</span> <span class="p">}</span>
-    <span class="kt">void</span> <span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span> <span class="n">CallNode</span><span class="o">*</span> <span class="n">call</span><span class="p">)</span> <span class="k">final</span> <span class="p">{</span> <span class="cm">/* Skip in this example. */</span> <span class="p">}</span>
-
-    <span class="c1">// Note 2</span>
-    <span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">gen</span><span class="p">(</span><span class="n">NodeRef</span><span class="o">&amp;</span> <span class="n">ref</span><span class="p">)</span> <span class="p">{</span>
-        <span class="k">this</span><span class="o">-&gt;</span><span class="n">code</span> <span class="o">=</span> <span class="s">&quot;&quot;</span><span class="p">;</span>
-        <span class="k">if</span> <span class="p">(</span><span class="n">ref</span><span class="o">-&gt;</span><span class="n">IsInstance</span><span class="o">&lt;</span><span class="n">FunctionNode</span><span class="o">&gt;</span><span class="p">())</span> <span class="p">{</span>
-            <span class="k">this</span><span class="o">-&gt;</span><span class="n">visit</span><span class="p">(</span><span class="n">Downcast</span><span class="o">&lt;</span><span class="n">Function</span><span class="o">&gt;</span><span class="p">(</span><span class="n">ref</span><span class="p">));</span>
-        <span class="p">}</span> <span class="k">else</span> <span class="k">if</span> <span class="p">(</span><span class="n">ref</span><span class="o">-&gt;</span><span class="n">IsInstance</span><span class="o">&lt;</span><span class="n">relay</span><span class="o">::</span><span class="n">ModuleNode</span><span class="o">&gt;</span><span class="p">())</span> <span class="p">{</span>
-            <span class="n">relay</span><span class="o">::</span><span class="n">Module</span> <span class="n">mod</span> <span class="o">=</span> <span class="n">Downcast</span><span class="o">&lt;</span><span class="n">relay</span><span class="o">::</span><span class="n">Module</span><span class="o">&gt;</span><span class="p">(</span><span class="n">ref</span><span class="p">);</span>
-            <span class="k">for</span> <span class="p">(</span><span class="k">const</span> <span class="k">auto</span><span class="o">&amp;</span> <span class="nl">it</span> <span class="p">:</span> <span class="n">mod</span><span class="o">-&gt;</span><span class="n">functions</span><span class="p">)</span> <span class="p">{</span>
-                <span class="k">this</span><span class="o">-&gt;</span><span class="n">visit</span><span class="p">(</span><span class="n">Downcast</span><span class="o">&lt;</span><span class="n">Function</span><span class="o">&gt;</span><span class="p">(</span><span class="n">it</span><span class="p">.</span><span class="n">second</span><span class="p">));</span>
-            <span class="p">}</span>
-        <span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
-            <span class="n">LOG</span><span class="p">(</span><span class="n">FATAL</span><span class="p">)</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;The input ref is expected to be a Relay function or module&quot;</span><span class="p">;</span>
-        <span class="p">}</span>
-        <span class="k">return</span> <span class="k">this</span><span class="o">-&gt;</span><span class="n">code</span><span class="p">;</span>
-    <span class="p">}</span>
-
-  <span class="k">private</span><span class="o">:</span>
-      <span class="cm">/*! \brief The function id that represents a C source function. */</span>
-     <span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">code</span><span class="p">;</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/relay/expr_functor.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/relay/transform.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/relay/type.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/runtime/module.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/runtime/object.h&gt;</span><span class="cp"></span>
+
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;fstream&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;sstream&gt;</span><span class="cp"></span>
+
+<span class="k">namespace</span><span class="w"> </span><span class="nn">tvm</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="k">namespace</span><span class="w"> </span><span class="nn">relay</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="k">namespace</span><span class="w"> </span><span class="nn">contrib</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+
+<span class="k">class</span><span class="w"> </span><span class="nc">ExampleJsonCodeGen</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">ExprVisitor</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="k">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">    </span><span class="k">explicit</span><span class="w"> </span><span class="n">ExampleJsonCodeGen</span><span class="p">();</span><span class="w"></span>
+
+<span class="w">    </span><span class="c1">// Note 1</span>
+<span class="w">    </span><span class="kt">void</span><span class="w"> </span><span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">VarNode</span><span class="o">*</span><span class="w"> </span><span class="n">node</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="cm">/* Skip in this example. */</span><span class="w"> </span><span class="p">}</span><sp [...]
+<span class="w">    </span><span class="kt">void</span><span class="w"> </span><span class="nf">VisitExpr_</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">CallNode</span><span class="o">*</span><span class="w"> </span><span class="n">call</span><span class="p">)</span><span class="w"> </span><span class="k">final</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="cm">/* Skip in this example. */</spa [...]
+
+<span class="w">    </span><span class="c1">// Note 2</span>
+<span class="w">    </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="nf">gen</span><span class="p">(</span><span class="n">NodeRef</span><span class="o">&amp;</span><span class="w"> </span><span class="n">ref</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">        </span><span class="k">this</span><span class="o">-&gt;</span><span class="n">code</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">        </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">ref</span><span class="o">-&gt;</span><span class="n">IsInstance</span><span class="o">&lt;</span><span class="n">FunctionNode</span><span class="o">&gt;</span><span class="p">())</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">            </span><span class="k">this</span><span class="o">-&gt;</span><span class="n">visit</span><span class="p">(</span><span class="n">Downcast</span><span class="o">&lt;</span><span class="n">Function</span><span class="o">&gt;</span><span class="p">(</span><span class="n">ref</span><span class="p">));</span><span class="w"></span>
+<span class="w">        </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">ref</span><span class="o">-&gt;</span><span class="n">IsInstance</span><span class="o">&lt;</span><span class="n">relay</span><span class="o">::</span><span class="n">ModuleNode</span><span class="o">&gt;</span><span class="p">())</span><span class="w"> </span><span class= [...]
+<span class="w">            </span><span class="n">relay</span><span class="o">::</span><span class="n">Module</span><span class="w"> </span><span class="n">mod</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Downcast</span><span class="o">&lt;</span><span class="n">relay</span><span class="o">::</span><span class="n">Module</span><span class="o">&gt;</span><span class="p">(</span><span class="n">ref</span><span class="p">);</span><span class [...]
+<span class="w">            </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="k">auto</span><span class="o">&amp;</span><span class="w"> </span><span class="n">it</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="n">mod</span><span class="o">-&gt;</span><span class="n">functions</span><span class="p">)</span><span class="w"> </span><span class="p">{</sp [...]
+<span class="w">                </span><span class="k">this</span><span class="o">-&gt;</span><span class="n">visit</span><span class="p">(</span><span class="n">Downcast</span><span class="o">&lt;</span><span class="n">Function</span><span class="o">&gt;</span><span class="p">(</span><span class="n">it</span><span class="p">.</span><span class="n">second</span><span class="p">));</span><span class="w"></span>
+<span class="w">            </span><span class="p">}</span><span class="w"></span>
+<span class="w">        </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">            </span><span class="n">LOG</span><span class="p">(</span><span class="n">FATAL</span><span class="p">)</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;The input ref is expected to be a Relay function or module&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">        </span><span class="p">}</span><span class="w"></span>
+<span class="w">        </span><span class="k">return</span><span class="w"> </span><span class="k">this</span><span class="o">-&gt;</span><span class="n">code</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+
+<span class="w">  </span><span class="k">private</span><span class="o">:</span><span class="w"></span>
+<span class="w">      </span><span class="cm">/*! \brief The function id that represents a C source function. */</span><span class="w"></span>
+<span class="w">     </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">code</span><span class="p">;</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p><strong>Note 1</strong>: We again implement corresponding visitor functions to generate ExampleJSON code and store it in a class variable <code class="docutils literal notranslate"><span class="pre">code</span></code> (we skip the visitor function implementations in this example, as the concepts are basically the same as for the C codegen). After finishing the graph visit, we should have an ExampleJSON graph in <code class="docutils literal notranslate"><span class="pre">code</span></code>.</p>
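 <p>To make the skipped part concrete, a call-node visitor could look roughly like the hedged sketch below. This is purely illustrative and not code from TVM: the counter <code class="docutils literal notranslate"><span class="pre">next_id_</span></code> and the helper <code class="docutils literal notranslate"><span class="pre">GetShapeString</span></code> are hypothetical names introduced only for this example:</p>
 <div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span>// Hedged sketch of a visitor that appends one ExampleJSON statement per
 // operator to `code`. `next_id_` (an int member) and `GetShapeString`
 // (a shape-to-string helper) are hypothetical, not part of this guide.
 void VisitExpr_(const CallNode* call) final {
   // Emit the statements for the arguments before the current operator.
   for (const auto&amp; arg : call-&gt;args) {
     this-&gt;VisitExpr(arg);
   }
   std::ostringstream stmt;
   stmt &lt;&lt; call-&gt;op.as&lt;OpNode&gt;()-&gt;name &lt;&lt; " " &lt;&lt; next_id_++
        &lt;&lt; " inputs: ..." &lt;&lt; " shape: " &lt;&lt; GetShapeString(call-&gt;checked_type());
   this-&gt;code += stmt.str() + "\n";
 }
 </pre></div>
 </div>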
@@ -836,57 +836,57 @@
 <h3>Implement a Customized Runtime<a class="headerlink" href="#implement-a-customized-runtime" title="Permalink to this headline">¶</a></h3>
 <p>In this section, we will implement a customized TVM runtime step by step and register it with the TVM runtime. The customized runtime should be located at <code class="docutils literal notranslate"><span class="pre">src/runtime/contrib/&lt;your-runtime-name&gt;/</span></code>. In our example, we name our runtime “example_ext_runtime”.</p>
 <p>Again, we first define a customized runtime class as follows. The class has to be derived from TVM's <code class="docutils literal notranslate"><span class="pre">ModuleNode</span></code> in order to be compatible with other TVM runtime modules.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="cp">#include</span> <span class="cpf">&lt;dmlc/logging.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;tvm/runtime/c_runtime_api.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;tvm/runtime/memory.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;tvm/runtime/module.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;tvm/runtime/ndarray.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;tvm/runtime/object.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;tvm/runtime/packed_func.h&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;tvm/runtime/registry.h&gt;</span><span class="cp"></span>
-
-<span class="cp">#include</span> <span class="cpf">&lt;fstream&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;cmath&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;map&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;sstream&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;string&gt;</span><span class="cp"></span>
-<span class="cp">#include</span> <span class="cpf">&lt;vector&gt;</span><span class="cp"></span>
-
-<span class="k">namespace</span> <span class="n">tvm</span> <span class="p">{</span>
-<span class="k">namespace</span> <span class="n">runtime</span> <span class="p">{</span>
-<span class="k">class</span> <span class="nc">ExampleJsonModule</span> <span class="o">:</span> <span class="k">public</span> <span class="n">ModuleNode</span> <span class="p">{</span>
- <span class="k">public</span><span class="o">:</span>
-  <span class="k">explicit</span> <span class="n">ExampleJsonModule</span><span class="p">(</span><span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">graph_json</span><span class="p">);</span>
-
-  <span class="n">PackedFunc</span> <span class="nf">GetFunction</span><span class="p">(</span><span class="k">const</span> <span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span> <span class="n">name</span><span class="p">,</span>
-                         <span class="k">const</span> <span class="n">ObjectPtr</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">&gt;&amp;</span> <span class="n">sptr_to_self</span><span class="p">)</span> <span class="k">final</span><span class="p">;</span>
-
-  <span class="k">const</span> <span class="kt">char</span><span class="o">*</span> <span class="nf">type_key</span><span class="p">()</span> <span class="k">const</span> <span class="p">{</span> <span class="k">return</span> <span class="s">&quot;examplejson&quot;</span><span class="p">;</span> <span class="p">}</span>
-
-  <span class="kt">void</span> <span class="nf">SaveToBinary</span><span class="p">(</span><span class="n">dmlc</span><span class="o">::</span><span class="n">Stream</span><span class="o">*</span> <span class="n">stream</span><span class="p">)</span> <span class="k">final</span><span class="p">;</span>
-
-  <span class="k">static</span> <span class="n">Module</span> <span class="nf">LoadFromBinary</span><span class="p">(</span><span class="kt">void</span><span class="o">*</span> <span class="n">strm</span><span class="p">);</span>
-
-  <span class="k">static</span> <span class="n">Module</span> <span class="nf">Create</span><span class="p">(</span><span class="k">const</span> <span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span> <span class="n">path</span><span class="p">);</span>
-
-  <span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">GetSource</span><span class="p">(</span><span class="k">const</span> <span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span> <span class="n">format</span> <span class="o">=</span> <span class="s">&quot;&quot;</span><span class="p">);</span>
-
-  <span class="kt">void</span> <span class="nf">Run</span><span class="p">(</span><span class="kt">int</span> <span class="n">id</span><span class="p">,</span> <span class="k">const</span> <span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="kt">int</span><span class="o">&gt;&amp;</span> <span class="n">inputs</span><span class="p">,</span> <span class="kt">int</span> <span class="n">output</span><span class="p">);</span>
-
-  <span class="kt">void</span> <span class="nf">ParseJson</span><span class="p">(</span><span class="k">const</span> <span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span> <span class="n">json</span><span class="p">);</span>
-
- <span class="k">private</span><span class="o">:</span>
-  <span class="cm">/* \brief The json string that represents a computational graph. */</span>
-  <span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">graph_json_</span><span class="p">;</span>
-  <span class="cm">/* \brief The subgraph that being processed. */</span>
-  <span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">curr_subgraph_</span><span class="p">;</span>
-  <span class="cm">/*! \brief A simple graph from subgraph id to node entries. */</span>
-  <span class="n">std</span><span class="o">::</span><span class="n">map</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="p">,</span> <span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">NodeEntry</span><span class="o">&gt;</span> <span class="o">&gt;</span> <span class="n">graph_</span><span class="p">;</span>
-  <span class="cm">/* \brief A simple pool to contain the tensor for each node in the graph. */</span>
-  <span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">NDArray</span><span class="o">&gt;</span> <span class="n">data_entry_</span><span class="p">;</span>
-  <span class="cm">/* \brief A mapping from node id to op name. */</span>
-  <span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&gt;</span> <span class="n">op_id_</span><span class="p">;</span>
-<span class="p">};</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;dmlc/logging.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/runtime/c_runtime_api.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/runtime/memory.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/runtime/module.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/runtime/ndarray.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/runtime/object.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/runtime/packed_func.h&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm/runtime/registry.h&gt;</span><span class="cp"></span>
+
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;fstream&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;cmath&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;map&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;sstream&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;string&gt;</span><span class="cp"></span>
+<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;vector&gt;</span><span class="cp"></span>
+
+<span class="k">namespace</span><span class="w"> </span><span class="nn">tvm</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="k">namespace</span><span class="w"> </span><span class="nn">runtime</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="k">class</span><span class="w"> </span><span class="nc">ExampleJsonModule</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">public</span><span class="w"> </span><span class="n">ModuleNode</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w"> </span><span class="k">public</span><span class="o">:</span><span class="w"></span>
+<span class="w">  </span><span class="k">explicit</span><span class="w"> </span><span class="n">ExampleJsonModule</span><span class="p">(</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">graph_json</span><span class="p">);</span><span class="w"></span>
+
+<span class="w">  </span><span class="n">PackedFunc</span><span class="w"> </span><span class="nf">GetFunction</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span><span class="w"> </span><span class="n">name</span><span class="p">,</span><span class="w"></span>
+<span class="w">                         </span><span class="k">const</span><span class="w"> </span><span class="n">ObjectPtr</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">&gt;&amp;</span><span class="w"> </span><span class="n">sptr_to_self</span><span class="p">)</span><span class="w"> </span><span class="k">final</span><span class="p">;</span><span class="w"></span>
+
+<span class="w">  </span><span class="k">const</span><span class="w"> </span><span class="kt">char</span><span class="o">*</span><span class="w"> </span><span class="nf">type_key</span><span class="p">()</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="s">&quot;examplejson&quot;</span><span class="p">;</span><span class="w"> </span><span class [...]
+
+<span class="w">  </span><span class="kt">void</span><span class="w"> </span><span class="nf">SaveToBinary</span><span class="p">(</span><span class="n">dmlc</span><span class="o">::</span><span class="n">Stream</span><span class="o">*</span><span class="w"> </span><span class="n">stream</span><span class="p">)</span><span class="w"> </span><span class="k">final</span><span class="p">;</span><span class="w"></span>
+
+<span class="w">  </span><span class="k">static</span><span class="w"> </span><span class="n">Module</span><span class="w"> </span><span class="nf">LoadFromBinary</span><span class="p">(</span><span class="kt">void</span><span class="o">*</span><span class="w"> </span><span class="n">strm</span><span class="p">);</span><span class="w"></span>
+
+<span class="w">  </span><span class="k">static</span><span class="w"> </span><span class="n">Module</span><span class="w"> </span><span class="nf">Create</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span><span class="w"> </span><span class="n">path</span><span class="p">);</span><span class="w"></span>
+
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="nf">GetSource</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span><span class="w"> </span><span class="n">format</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot [...]
+
+<span class="w">  </span><span class="kt">void</span><span class="w"> </span><span class="nf">Run</span><span class="p">(</span><span class="kt">int</span><span class="w"> </span><span class="n">id</span><span class="p">,</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="kt">int</span><span class="o">&gt;&amp;</span><span class="w"> </spa [...]
+
+<span class="w">  </span><span class="kt">void</span><span class="w"> </span><span class="nf">ParseJson</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span><span class="w"> </span><span class="n">json</span><span class="p">);</span><span class="w"></span>
+
+<span class="w"> </span><span class="k">private</span><span class="o">:</span><span class="w"></span>
+<span class="w">  </span><span class="cm">/* \brief The json string that represents a computational graph. */</span><span class="w"></span>
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">graph_json_</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="cm">/* \brief The subgraph that being processed. */</span><span class="w"></span>
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">curr_subgraph_</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="cm">/*! \brief A simple graph from subgraph id to node entries. */</span><span class="w"></span>
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">map</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="p">,</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">NodeEntry</span><span class="o">&gt;</span><span class="w"> </span><span class="o">&gt;</span><span class="w"> < [...]
+<span class="w">  </span><span class="cm">/* \brief A simple pool to contain the tensor for each node in the graph. */</span><span class="w"></span>
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">NDArray</span><span class="o">&gt;</span><span class="w"> </span><span class="n">data_entry_</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="cm">/* \brief A mapping from node id to op name. */</span><span class="w"></span>
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&gt;</span><span class="w"> </span><span class="n">op_id_</span><span class="p">;</span><span class="w"></span>
+<span class="p">};</span><span class="w"></span>
 </pre></div>
 </div>
 <p>In particular, there are some functions derived from <code class="docutils literal notranslate"><span class="pre">ModuleNode</span></code> that we must implement in <code class="docutils literal notranslate"><span class="pre">ExampleJsonModule</span></code>:</p>
@@ -899,67 +899,67 @@
 <p>Other functions and class variables will be introduced along with the implementation of the above must-have functions.</p>
 <div class="section" id="implement-constructor">
 <h4>Implement Constructor<a class="headerlink" href="#implement-constructor" title="Permalink to this headline">¶</a></h4>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">explicit</span> <span class="nf">ExampleJsonModule</span><span class="p">(</span><span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">graph_json</span><span class="p">)</span> <span class="p">{</span>
-  <span class="k">this</span><span class="o">-&gt;</span><span class="n">graph_json_</span> <span class="o">=</span> <span class="n">graph_json</span><span class="p">;</span>
-  <span class="n">ParseJson</span><span class="p">(</span><span class="k">this</span><span class="o">-&gt;</span><span class="n">graph_json_</span><span class="p">);</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">explicit</span><span class="w"> </span><span class="n">ExampleJsonModule</span><span class="p">(</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">graph_json</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="k">this</span><span class="o">-&gt;</span><span class="n">graph_json_</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">graph_json</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">ParseJson</span><span class="p">(</span><span class="k">this</span><span class="o">-&gt;</span><span class="n">graph_json_</span><span class="p">);</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>Then, we implement <code class="docutils literal notranslate"><span class="pre">ParseJson</span></code> to parse a subgraph in ExampleJSON format and construct a graph in memory for later use. Since we do not support subgraphs with branches in this example, we simply use an array to store every node of a subgraph in order.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span> <span class="nf">ParseJson</span><span class="p">(</span><span class="k">const</span> <span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span> <span class="n">json</span><span class="p">)</span> <span class="p">{</span>
-  <span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">line</span><span class="p">;</span>
-  <span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">curr_subgraph</span><span class="p">;</span>
-  <span class="n">std</span><span class="o">::</span><span class="n">stringstream</span> <span class="n">ss</span><span class="p">(</span><span class="n">json</span><span class="p">);</span>
-
-  <span class="k">while</span> <span class="p">(</span><span class="n">std</span><span class="o">::</span><span class="n">getline</span><span class="p">(</span><span class="n">ss</span><span class="p">,</span> <span class="n">line</span><span class="p">,</span> <span class="sc">&#39;\n&#39;</span><span class="p">))</span> <span class="p">{</span>
-    <span class="n">std</span><span class="o">::</span><span class="n">stringstream</span> <span class="n">ss2</span><span class="p">(</span><span class="n">line</span><span class="p">);</span>
-    <span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">token</span><span class="p">;</span>
-    <span class="kt">int</span> <span class="n">id</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-
-    <span class="n">ss2</span> <span class="o">&gt;&gt;</span> <span class="n">token</span><span class="p">;</span>
-    <span class="k">if</span> <span class="p">(</span><span class="n">token</span><span class="p">.</span><span class="n">find</span><span class="p">(</span><span class="s">&quot;subgraph_&quot;</span><span class="p">)</span> <span class="o">!=</span> <span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">::</span><span class="n">npos</span><span class="p">)</span> <span class="p">{</span>
-      <span class="n">curr_subgraph</span> <span class="o">=</span> <span class="n">token</span><span class="p">;</span>
-      <span class="k">continue</span><span class="p">;</span>
-    <span class="p">}</span>
-
-    <span class="n">ss2</span> <span class="o">&gt;&gt;</span> <span class="n">id</span><span class="p">;</span>
-    <span class="k">if</span> <span class="p">(</span><span class="n">op_id_</span><span class="p">.</span><span class="n">size</span><span class="p">()</span> <span class="o">&lt;=</span> <span class="k">static_cast</span><span class="o">&lt;</span><span class="kt">size_t</span><span class="o">&gt;</span><span class="p">(</span><span class="n">id</span><span class="p">))</span> <span class="p">{</span>
-      <span class="n">op_id_</span><span class="p">.</span><span class="n">resize</span><span class="p">(</span><span class="n">id</span> <span class="o">+</span> <span class="mi">1</span><span class="p">);</span>
-      <span class="n">data_entry_</span><span class="p">.</span><span class="n">resize</span><span class="p">(</span><span class="n">id</span> <span class="o">+</span> <span class="mi">1</span><span class="p">);</span>
-    <span class="p">}</span>
-
-    <span class="kt">int64_t</span> <span class="n">total_elements</span> <span class="o">=</span> <span class="mi">1</span><span class="p">;</span>
-    <span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="kt">int64_t</span><span class="o">&gt;</span> <span class="n">shape</span><span class="p">;</span>
-    <span class="k">if</span> <span class="p">(</span><span class="n">token</span> <span class="o">==</span> <span class="s">&quot;input&quot;</span><span class="p">)</span> <span class="p">{</span>
-      <span class="kt">int64_t</span> <span class="n">size</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-      <span class="k">while</span> <span class="p">(</span><span class="n">ss2</span> <span class="o">&gt;&gt;</span> <span class="n">size</span><span class="p">)</span> <span class="p">{</span>
-        <span class="n">total_elements</span> <span class="o">*=</span> <span class="n">size</span><span class="p">;</span>
-        <span class="n">shape</span><span class="p">.</span><span class="n">push_back</span><span class="p">(</span><span class="n">size</span><span class="p">);</span>
-      <span class="p">}</span>
-    <span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
-      <span class="n">op_id_</span><span class="p">[</span><span class="n">id</span><span class="p">]</span> <span class="o">=</span> <span class="n">token</span><span class="p">;</span> <span class="c1">// Note 1</span>
-      <span class="kt">bool</span> <span class="n">shape_data</span> <span class="o">=</span> <span class="nb">false</span><span class="p">;</span>
-      <span class="n">NodeEntry</span> <span class="n">entry</span><span class="p">;</span>
-      <span class="k">while</span> <span class="p">(</span><span class="n">ss2</span> <span class="o">&gt;&gt;</span> <span class="n">token</span><span class="p">)</span> <span class="p">{</span>
-        <span class="k">if</span> <span class="p">(</span><span class="n">token</span> <span class="o">==</span> <span class="s">&quot;shape:&quot;</span><span class="p">)</span> <span class="p">{</span>
-          <span class="n">shape_data</span> <span class="o">=</span> <span class="nb">true</span><span class="p">;</span>
-        <span class="p">}</span> <span class="k">else</span> <span class="k">if</span> <span class="p">(</span><span class="n">shape_data</span><span class="p">)</span> <span class="p">{</span>
-          <span class="n">total_elements</span> <span class="o">*=</span> <span class="n">std</span><span class="o">::</span><span class="n">stoll</span><span class="p">(</span><span class="n">token</span><span class="p">);</span>
-          <span class="n">shape</span><span class="p">.</span><span class="n">push_back</span><span class="p">(</span><span class="n">std</span><span class="o">::</span><span class="n">stoll</span><span class="p">(</span><span class="n">token</span><span class="p">));</span>
-        <span class="p">}</span> <span class="k">else</span> <span class="k">if</span> <span class="p">(</span><span class="n">token</span> <span class="o">!=</span> <span class="s">&quot;inputs:&quot;</span><span class="p">)</span> <span class="p">{</span>
-          <span class="n">entry</span><span class="p">.</span><span class="n">inputs</span><span class="p">.</span><span class="n">push_back</span><span class="p">(</span><span class="n">std</span><span class="o">::</span><span class="n">stoi</span><span class="p">(</span><span class="n">token</span><span class="p">));</span>
-        <span class="p">}</span>
-      <span class="p">}</span>
-      <span class="n">entry</span><span class="p">.</span><span class="n">id</span> <span class="o">=</span> <span class="n">id</span><span class="p">;</span>
-      <span class="n">entry</span><span class="p">.</span><span class="n">output</span> <span class="o">=</span> <span class="n">id</span><span class="p">;</span>
-      <span class="n">graph_</span><span class="p">[</span><span class="n">curr_subgraph</span><span class="p">].</span><span class="n">push_back</span><span class="p">(</span><span class="n">entry</span><span class="p">);</span> <span class="c1">// Note 2</span>
-    <span class="p">}</span>
-    <span class="n">DLDevice</span> <span class="n">dev</span><span class="p">;</span>
-    <span class="n">dev</span><span class="p">.</span><span class="n">device_type</span> <span class="o">=</span> <span class="k">static_cast</span><span class="o">&lt;</span><span class="n">DLDeviceType</span><span class="o">&gt;</span><span class="p">(</span><span class="mi">1</span><span class="p">);</span>
-    <span class="n">dev</span><span class="p">.</span><span class="n">device_id</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-    <span class="n">data_entry_</span><span class="p">[</span><span class="n">id</span><span class="p">]</span> <span class="o">=</span> <span class="n">NDArray</span><span class="o">::</span><span class="n">Empty</span><span class="p">(</span><span class="n">shape</span><span class="p">,</span> <span class="n">DLDataType</span><span class="p">{</span><span class="n">kDLFloat</span><span class="p">,</span> <span class="mi">32</span><span class="p">,</span> <span class="mi">1</span><span  [...]
-  <span class="p">}</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span><span class="w"> </span><span class="nf">ParseJson</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span><span class="w"> </span><span class="n">json</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">line</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">curr_subgraph</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">stringstream</span><span class="w"> </span><span class="n">ss</span><span class="p">(</span><span class="n">json</span><span class="p">);</span><span class="w"></span>
+
+<span class="w">  </span><span class="k">while</span><span class="w"> </span><span class="p">(</span><span class="n">std</span><span class="o">::</span><span class="n">getline</span><span class="p">(</span><span class="n">ss</span><span class="p">,</span><span class="w"> </span><span class="n">line</span><span class="p">,</span><span class="w"> </span><span class="sc">&#39;\n&#39;</span><span class="p">))</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="n">std</span><span class="o">::</span><span class="n">stringstream</span><span class="w"> </span><span class="n">ss2</span><span class="p">(</span><span class="n">line</span><span class="p">);</span><span class="w"></span>
+<span class="w">    </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">token</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="kt">int</span><span class="w"> </span><span class="n">id</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"></span>
+
+<span class="w">    </span><span class="n">ss2</span><span class="w"> </span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="n">token</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">token</span><span class="p">.</span><span class="n">find</span><span class="p">(</span><span class="s">&quot;subgraph_&quot;</span><span class="p">)</span><span class="w"> </span><span class="o">!=</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">::</span><span class="n">npos</span><span class="p"> [...]
+<span class="w">      </span><span class="n">curr_subgraph</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">token</span><span class="p">;</span><span class="w"></span>
+<span class="w">      </span><span class="k">continue</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+
+<span class="w">    </span><span class="n">ss2</span><span class="w"> </span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="n">id</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">op_id_</span><span class="p">.</span><span class="n">size</span><span class="p">()</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="k">static_cast</span><span class="o">&lt;</span><span class="kt">size_t</span><span class="o">&gt;</span><span class="p">(</span><span class="n">id</span><span class="p">))</span><span class="w"> </ [...]
+<span class="w">      </span><span class="n">op_id_</span><span class="p">.</span><span class="n">resize</span><span class="p">(</span><span class="n">id</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="mi">1</span><span class="p">);</span><span class="w"></span>
+<span class="w">      </span><span class="n">data_entry_</span><span class="p">.</span><span class="n">resize</span><span class="p">(</span><span class="n">id</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="mi">1</span><span class="p">);</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+
+<span class="w">    </span><span class="kt">int64_t</span><span class="w"> </span><span class="n">total_elements</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">1</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="kt">int64_t</span><span class="o">&gt;</span><span class="w"> </span><span class="n">shape</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">token</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="s">&quot;input&quot;</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">      </span><span class="kt">int64_t</span><span class="w"> </span><span class="n">size</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"></span>
+<span class="w">      </span><span class="k">while</span><span class="w"> </span><span class="p">(</span><span class="n">ss2</span><span class="w"> </span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="n">size</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">        </span><span class="n">total_elements</span><span class="w"> </span><span class="o">*=</span><span class="w"> </span><span class="n">size</span><span class="p">;</span><span class="w"></span>
+<span class="w">        </span><span class="n">shape</span><span class="p">.</span><span class="n">push_back</span><span class="p">(</span><span class="n">size</span><span class="p">);</span><span class="w"></span>
+<span class="w">      </span><span class="p">}</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">      </span><span class="n">op_id_</span><span class="p">[</span><span class="n">id</span><span class="p">]</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">token</span><span class="p">;</span><span class="w"> </span><span class="c1">// Note 1</span>
+<span class="w">      </span><span class="kt">bool</span><span class="w"> </span><span class="n">shape_data</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nb">false</span><span class="p">;</span><span class="w"></span>
+<span class="w">      </span><span class="n">NodeEntry</span><span class="w"> </span><span class="n">entry</span><span class="p">;</span><span class="w"></span>
+<span class="w">      </span><span class="k">while</span><span class="w"> </span><span class="p">(</span><span class="n">ss2</span><span class="w"> </span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="n">token</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">        </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">token</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="s">&quot;shape:&quot;</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">          </span><span class="n">shape_data</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nb">true</span><span class="p">;</span><span class="w"></span>
+<span class="w">        </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">shape_data</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">          </span><span class="n">total_elements</span><span class="w"> </span><span class="o">*=</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">stoll</span><span class="p">(</span><span class="n">token</span><span class="p">);</span><span class="w"></span>
+<span class="w">          </span><span class="n">shape</span><span class="p">.</span><span class="n">push_back</span><span class="p">(</span><span class="n">std</span><span class="o">::</span><span class="n">stoll</span><span class="p">(</span><span class="n">token</span><span class="p">));</span><span class="w"></span>
+<span class="w">        </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">token</span><span class="w"> </span><span class="o">!=</span><span class="w"> </span><span class="s">&quot;inputs:&quot;</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">          </span><span class="n">entry</span><span class="p">.</span><span class="n">inputs</span><span class="p">.</span><span class="n">push_back</span><span class="p">(</span><span class="n">std</span><span class="o">::</span><span class="n">stoi</span><span class="p">(</span><span class="n">token</span><span class="p">));</span><span class="w"></span>
+<span class="w">        </span><span class="p">}</span><span class="w"></span>
+<span class="w">      </span><span class="p">}</span><span class="w"></span>
+<span class="w">      </span><span class="n">entry</span><span class="p">.</span><span class="n">id</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">id</span><span class="p">;</span><span class="w"></span>
+<span class="w">      </span><span class="n">entry</span><span class="p">.</span><span class="n">output</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">id</span><span class="p">;</span><span class="w"></span>
+<span class="w">      </span><span class="n">graph_</span><span class="p">[</span><span class="n">curr_subgraph</span><span class="p">].</span><span class="n">push_back</span><span class="p">(</span><span class="n">entry</span><span class="p">);</span><span class="w"> </span><span class="c1">// Note 2</span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+<span class="w">    </span><span class="n">DLDevice</span><span class="w"> </span><span class="n">dev</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="n">dev</span><span class="p">.</span><span class="n">device_type</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">static_cast</span><span class="o">&lt;</span><span class="n">DLDeviceType</span><span class="o">&gt;</span><span class="p">(</span><span class="mi">1</span><span class="p">);</span><span class="w"></span>
+<span class="w">    </span><span class="n">dev</span><span class="p">.</span><span class="n">device_id</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="n">data_entry_</span><span class="p">[</span><span class="n">id</span><span class="p">]</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">NDArray</span><span class="o">::</span><span class="n">Empty</span><span class="p">(</span><span class="n">shape</span><span class="p">,</span><span class="w"> </span><span class="n">DLDataType</span><span class="p">{</span><span class="n">kDLFloat</span><span class="p" [...]
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p><strong>Note 1</strong>: We use a class variable <code class="docutils literal notranslate"><span class="pre">op_id_</span></code> to map from a subgraph node ID to the operator name (e.g., <code class="docutils literal notranslate"><span class="pre">add</span></code>) so that we can invoke the corresponding operator function at runtime.</p>
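 <p>For reference, here is a minimal ExampleJSON input that this parser accepts. It is an illustrative sketch only: the node IDs and the 10x10 shapes are arbitrary, and the full format is defined by the ExampleJSON examples earlier in this document.</p>
 <div class="highlight-none notranslate"><div class="highlight"><pre><span></span>subgraph_0
   input 0 10 10
   input 1 10 10
   input 2 10 10
   add 3 inputs: 0 1 shape: 10 10
   sub 4 inputs: 3 2 shape: 10 10
 </pre></div>
 </div>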
@@ -969,47 +969,47 @@
 <div class="section" id="implement-getfunction">
 <h4>Implement GetFunction<a class="headerlink" href="#implement-getfunction" title="Permalink to this headline">¶</a></h4>
 <p>After construction, we should have the above class variables ready. We then implement <code class="docutils literal notranslate"><span class="pre">GetFunction</span></code> to provide executable subgraph functions to the TVM runtime:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">PackedFunc</span> <span class="nf">GetFunction</span><span class="p">(</span><span class="k">const</span> <span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span> <span class="n">name</span><span class="p">,</span>
-                       <span class="k">const</span> <span class="n">ObjectPtr</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">&gt;&amp;</span> <span class="n">sptr_to_self</span><span class="p">)</span> <span class="k">final</span> <span class="p">{</span>
-  <span class="k">if</span> <span class="p">(</span><span class="k">this</span><span class="o">-&gt;</span><span class="n">graph_</span><span class="p">.</span><span class="n">find</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="o">!=</span> <span class="k">this</span><span class="o">-&gt;</span><span class="n">graph_</span><span class="p">.</span><span class="n">end</span><span class="p">())</span> <span class="p">{</span>
-    <span class="k">this</span><span class="o">-&gt;</span><span class="n">curr_subgraph_</span> <span class="o">=</span> <span class="n">name</span><span class="p">;</span>
-    <span class="k">return</span> <span class="n">PackedFunc</span><span class="p">([</span><span class="n">sptr_to_self</span><span class="p">,</span> <span class="k">this</span><span class="p">](</span><span class="n">TVMArgs</span> <span class="n">args</span><span class="p">,</span> <span class="n">TVMRetValue</span><span class="o">*</span> <span class="n">rv</span><span class="p">)</span> <span class="p">{</span>
-
-      <span class="c1">// Copy input tensors to corresponding data entries.</span>
-      <span class="k">for</span> <span class="p">(</span><span class="k">auto</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">();</span> <span class="o">++</span><span class="n">i</span><span class="p">)</span> <span class="p">{</span>
-        <span class="n">ICHECK</span><span class="p">(</span><span class="n">args</span><span class="p">[</span><span class="n">i</span><span class="p">].</span><span class="n">type_code</span><span class="p">()</span> <span class="o">==</span> <span class="n">kNDArrayContainer</span> <span class="o">||</span> <span class="n">args</span><span class="p">[</span><span class="n">i</span><span class="p">].</span><span class="n">type_code</span><span class="p">()</span> <span class="o">==</sp [...]
-            <span class="o">&lt;&lt;</span> <span class="s">&quot;Expect NDArray or DLTensor as inputs</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span>
-        <span class="k">if</span> <span class="p">(</span><span class="n">args</span><span class="p">[</span><span class="n">i</span><span class="p">].</span><span class="n">type_code</span><span class="p">()</span> <span class="o">==</span> <span class="n">kArrayHandle</span><span class="p">)</span> <span class="p">{</span>
-          <span class="n">DLTensor</span><span class="o">*</span> <span class="n">arg</span> <span class="o">=</span> <span class="n">args</span><span class="p">[</span><span class="n">i</span><span class="p">];</span>
-          <span class="k">this</span><span class="o">-&gt;</span><span class="n">data_entry_</span><span class="p">[</span><span class="n">i</span><span class="p">].</span><span class="n">CopyFrom</span><span class="p">(</span><span class="n">arg</span><span class="p">);</span>
-        <span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
-          <span class="n">NDArray</span> <span class="n">arg</span> <span class="o">=</span> <span class="n">args</span><span class="p">[</span><span class="n">i</span><span class="p">];</span>
-          <span class="k">this</span><span class="o">-&gt;</span><span class="n">data_entry_</span><span class="p">[</span><span class="n">i</span><span class="p">].</span><span class="n">CopyFrom</span><span class="p">(</span><span class="n">arg</span><span class="p">);</span>
-        <span class="p">}</span>
-      <span class="p">}</span>
-
-      <span class="c1">// Execute the subgraph.</span>
-      <span class="k">for</span> <span class="p">(</span><span class="k">const</span> <span class="k">auto</span><span class="o">&amp;</span> <span class="nl">it</span> <span class="p">:</span> <span class="k">this</span><span class="o">-&gt;</span><span class="n">graph_</span><span class="p">[</span><span class="k">this</span><span class="o">-&gt;</span><span class="n">curr_subgraph_</span><span class="p">])</span> <span class="p">{</span>
-        <span class="k">this</span><span class="o">-&gt;</span><span class="n">Run</span><span class="p">(</span><span class="n">it</span><span class="p">.</span><span class="n">id</span><span class="p">,</span> <span class="n">it</span><span class="p">.</span><span class="n">inputs</span><span class="p">,</span> <span class="n">it</span><span class="p">.</span><span class="n">output</span><span class="p">);</span>
-      <span class="p">}</span>
-      <span class="n">ICHECK_GT</span><span class="p">(</span><span class="n">graph_</span><span class="p">.</span><span class="n">count</span><span class="p">(</span><span class="k">this</span><span class="o">-&gt;</span><span class="n">curr_subgraph_</span><span class="p">),</span> <span class="mi">0U</span><span class="p">);</span>
-
-      <span class="c1">// Copy the output from a data entry back to TVM runtime argument.</span>
-      <span class="k">auto</span> <span class="n">out_idx</span> <span class="o">=</span> <span class="n">graph_</span><span class="p">[</span><span class="k">this</span><span class="o">-&gt;</span><span class="n">curr_subgraph_</span><span class="p">].</span><span class="n">back</span><span class="p">().</span><span class="n">output</span><span class="p">;</span>
-      <span class="k">if</span> <span class="p">(</span><span class="n">args</span><span class="p">[</span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">()</span> <span class="o">-</span> <span class="mi">1</span><span class="p">].</span><span class="n">type_code</span><span class="p">()</span> <span class="o">==</span> <span class="n">kArrayHandle</span><span class="p">)</span> <span class="p">{</span>
-        <span class="n">DLTensor</span><span class="o">*</span> <span class="n">arg</span> <span class="o">=</span> <span class="n">args</span><span class="p">[</span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">()</span> <span class="o">-</span> <span class="mi">1</span><span class="p">];</span>
-        <span class="k">this</span><span class="o">-&gt;</span><span class="n">data_entry_</span><span class="p">[</span><span class="n">out_idx</span><span class="p">].</span><span class="n">CopyTo</span><span class="p">(</span><span class="n">arg</span><span class="p">);</span>
-      <span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
-        <span class="n">NDArray</span> <span class="n">arg</span> <span class="o">=</span> <span class="n">args</span><span class="p">[</span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">()</span> <span class="o">-</span> <span class="mi">1</span><span class="p">];</span>
-        <span class="k">this</span><span class="o">-&gt;</span><span class="n">data_entry_</span><span class="p">[</span><span class="n">out_idx</span><span class="p">].</span><span class="n">CopyTo</span><span class="p">(</span><span class="n">arg</span><span class="p">);</span>
-      <span class="p">}</span>
-      <span class="o">*</span><span class="n">rv</span> <span class="o">=</span> <span class="n">data_entry_</span><span class="p">.</span><span class="n">back</span><span class="p">();</span>
-    <span class="p">});</span>
-  <span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
-    <span class="n">LOG</span><span class="p">(</span><span class="n">FATAL</span><span class="p">)</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;Unknown subgraph: &quot;</span> <span class="o">&lt;&lt;</span> <span class="n">name</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span>
-    <span class="k">return</span> <span class="n">PackedFunc</span><span class="p">();</span>
-  <span class="p">}</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">PackedFunc</span><span class="w"> </span><span class="nf">GetFunction</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span><span class="w"> </span><span class="n">name</span><span class="p">,</span><span class="w"></span>
+<span class="w">                       </span><span class="k">const</span><span class="w"> </span><span class="n">ObjectPtr</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">&gt;&amp;</span><span class="w"> </span><span class="n">sptr_to_self</span><span class="p">)</span><span class="w"> </span><span class="k">final</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="k">this</span><span class="o">-&gt;</span><span class="n">graph_</span><span class="p">.</span><span class="n">find</span><span class="p">(</span><span class="n">name</span><span class="p">)</span><span class="w"> </span><span class="o">!=</span><span class="w"> </span><span class="k">this</span><span class="o">-&gt;</span><span class="n">graph_</span><span class="p">.</span><sp [...]
+<span class="w">    </span><span class="k">this</span><span class="o">-&gt;</span><span class="n">curr_subgraph_</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">name</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="n">PackedFunc</span><span class="p">([</span><span class="n">sptr_to_self</span><span class="p">,</span><span class="w"> </span><span class="k">this</span><span class="p">](</span><span class="n">TVMArgs</span><span class="w"> </span><span class="n">args</span><span class="p">,</span><span class="w"> </span><span class="n">TVMRetValue</span><span class="o">*</span><span class="w"> </span><span cl [...]
+
+<span class="w">      </span><span class="c1">// Copy input tensors to corresponding data entries.</span>
+<span class="w">      </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">&lt;</span><span class="w"> </span><span class="n">args</span><span class="p">.</span><span class="n">siz [...]
+<span class="w">        </span><span class="n">ICHECK</span><span class="p">(</span><span class="n">args</span><span class="p">[</span><span class="n">i</span><span class="p">].</span><span class="n">type_code</span><span class="p">()</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="n">kNDArrayContainer</span><span class="w"> </span><span class="o">||</span><span class="w"> </span><span class="n">args</span><span class="p">[</span><span class="n [...]
+<span class="w">            </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;Expect NDArray or DLTensor as inputs</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">        </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">args</span><span class="p">[</span><span class="n">i</span><span class="p">].</span><span class="n">type_code</span><span class="p">()</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="n">kArrayHandle</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">          </span><span class="n">DLTensor</span><span class="o">*</span><span class="w"> </span><span class="n">arg</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">args</span><span class="p">[</span><span class="n">i</span><span class="p">];</span><span class="w"></span>
+<span class="w">          </span><span class="k">this</span><span class="o">-&gt;</span><span class="n">data_entry_</span><span class="p">[</span><span class="n">i</span><span class="p">].</span><span class="n">CopyFrom</span><span class="p">(</span><span class="n">arg</span><span class="p">);</span><span class="w"></span>
+<span class="w">        </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">          </span><span class="n">NDArray</span><span class="w"> </span><span class="n">arg</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">args</span><span class="p">[</span><span class="n">i</span><span class="p">];</span><span class="w"></span>
+<span class="w">          </span><span class="k">this</span><span class="o">-&gt;</span><span class="n">data_entry_</span><span class="p">[</span><span class="n">i</span><span class="p">].</span><span class="n">CopyFrom</span><span class="p">(</span><span class="n">arg</span><span class="p">);</span><span class="w"></span>
+<span class="w">        </span><span class="p">}</span><span class="w"></span>
+<span class="w">      </span><span class="p">}</span><span class="w"></span>
+
+<span class="w">      </span><span class="c1">// Execute the subgraph.</span>
+<span class="w">      </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="k">auto</span><span class="o">&amp;</span><span class="w"> </span><span class="n">it</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="k">this</span><span class="o">-&gt;</span><span class="n">graph_</span><span class="p">[</span><span class="k">this</span><span class="o">-&gt;</spa [...]
+<span class="w">        </span><span class="k">this</span><span class="o">-&gt;</span><span class="n">Run</span><span class="p">(</span><span class="n">it</span><span class="p">.</span><span class="n">id</span><span class="p">,</span><span class="w"> </span><span class="n">it</span><span class="p">.</span><span class="n">inputs</span><span class="p">,</span><span class="w"> </span><span class="n">it</span><span class="p">.</span><span class="n">output</span><span class="p">);</span><span [...]
+<span class="w">      </span><span class="p">}</span><span class="w"></span>
+<span class="w">      </span><span class="n">ICHECK_GT</span><span class="p">(</span><span class="n">graph_</span><span class="p">.</span><span class="n">count</span><span class="p">(</span><span class="k">this</span><span class="o">-&gt;</span><span class="n">curr_subgraph_</span><span class="p">),</span><span class="w"> </span><span class="mi">0U</span><span class="p">);</span><span class="w"></span>
+
+<span class="w">      </span><span class="c1">// Copy the output from a data entry back to TVM runtime argument.</span>
+<span class="w">      </span><span class="k">auto</span><span class="w"> </span><span class="n">out_idx</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">graph_</span><span class="p">[</span><span class="k">this</span><span class="o">-&gt;</span><span class="n">curr_subgraph_</span><span class="p">].</span><span class="n">back</span><span class="p">().</span><span class="n">output</span><span class="p">;</span><span class="w"></span>
+<span class="w">      </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">args</span><span class="p">[</span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">()</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">1</span><span class="p">].</span><span class="n">type_code</span><span class="p">()</span><span class="w"> </span><span class="o">==</span><span cl [...]
+<span class="w">        </span><span class="n">DLTensor</span><span class="o">*</span><span class="w"> </span><span class="n">arg</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">args</span><span class="p">[</span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">()</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">1</span><span class="p">];</span><span cl [...]
+<span class="w">        </span><span class="k">this</span><span class="o">-&gt;</span><span class="n">data_entry_</span><span class="p">[</span><span class="n">out_idx</span><span class="p">].</span><span class="n">CopyTo</span><span class="p">(</span><span class="n">arg</span><span class="p">);</span><span class="w"></span>
+<span class="w">      </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">        </span><span class="n">NDArray</span><span class="w"> </span><span class="n">arg</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">args</span><span class="p">[</span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">()</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">1</span><span class="p">];</span><span class="w"></span>
+<span class="w">        </span><span class="k">this</span><span class="o">-&gt;</span><span class="n">data_entry_</span><span class="p">[</span><span class="n">out_idx</span><span class="p">].</span><span class="n">CopyTo</span><span class="p">(</span><span class="n">arg</span><span class="p">);</span><span class="w"></span>
+<span class="w">      </span><span class="p">}</span><span class="w"></span>
+<span class="w">      </span><span class="o">*</span><span class="n">rv</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">data_entry_</span><span class="p">.</span><span class="n">back</span><span class="p">();</span><span class="w"></span>
+<span class="w">    </span><span class="p">});</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="n">LOG</span><span class="p">(</span><span class="n">FATAL</span><span class="p">)</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;Unknown subgraph: &quot;</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">name</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;</span><span class="se"> [...]
+<span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="n">PackedFunc</span><span class="p">();</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>As can be seen, <code class="docutils literal notranslate"><span class="pre">GetFunction</span></code> is composed of three major parts. The first part copies data from the TVM runtime arguments to the corresponding data entries we assigned in the constructor. The second part executes the subgraph with the <code class="docutils literal notranslate"><span class="pre">Run</span></code> function (which we will implement later) and saves the results to another data entry. The third part copies the results from the output data entry back to the corresponding TVM runtime argument.</p>
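 <p>To make the calling convention concrete, the following is a hypothetical caller-side sketch; it is not part of this tutorial’s code, and the file name, the subgraph name, and the pre-allocated <code class="docutils literal notranslate"><span class="pre">NDArray</span></code> handles are placeholders:</p>
 <div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span>// Hypothetical usage sketch: load an ExampleJSON module and run one subgraph.
 tvm::runtime::Module mod = ExampleJsonModule::Create(&quot;subgraph.examplejson&quot;);
 tvm::runtime::PackedFunc f = mod.GetFunction(&quot;subgraph_0&quot;);
 // Inputs a, b, c are copied into data_entry_; the last argument receives the output.
 f(a, b, c, out);
 </pre></div>
 </div>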
@@ -1017,45 +1017,45 @@
 <div class="section" id="implement-run">
 <h4>Implement Run<a class="headerlink" href="#implement-run" title="Permalink to this headline">¶</a></h4>
 <p>Now let’s implement the <code class="docutils literal notranslate"><span class="pre">Run</span></code> function. This function accepts 1) a subgraph ID, 2) a list of input data entry indices, and 3) an output data entry index.</p>
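 <p>Before diving into the implementation, consider the illustrative ExampleJSON sample above: its node <code class="docutils literal notranslate"><span class="pre">add 3 inputs: 0 1 shape: 10 10</span></code> would make <code class="docutils literal notranslate"><span class="pre">GetFunction</span></code> issue roughly the following call (a sketch; the IDs come from that hypothetical sample):</p>
 <div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span>// Illustrative only: node 3 is an &quot;add&quot; that consumes data entries 0 and 1
 // and writes its result into data entry 3 (entry.output == entry.id).
 this-&gt;Run(/*id=*/3, /*inputs=*/{0, 1}, /*output=*/3);
 </pre></div>
 </div>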
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span> <span class="nf">Run</span><span class="p">(</span><span class="kt">int</span> <span class="n">id</span><span class="p">,</span> <span class="k">const</span> <span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="kt">int</span><span class="o">&gt;&amp;</span> <span class="n">inputs</span><span class="p">,</span> <spa [...]
-  <span class="c1">// Make a list data entry indexs.</span>
-  <span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="kt">int</span><span class="o">&gt;</span> <span class="n">args</span><span class="p">(</span><span class="n">inputs</span><span class="p">.</span><span class="n">begin</span><span class="p">(),</span> <span class="n">inputs</span><span class="p">.</span><span class="n">end</span><span class="p">());</span>
-  <span class="n">args</span><span class="p">.</span><span class="n">push_back</span><span class="p">(</span><span class="n">output</span><span class="p">);</span>
-
-  <span class="c1">// Initialize data holders.</span>
-  <span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">TVMValue</span><span class="o">&gt;</span> <span class="n">values</span><span class="p">(</span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">());</span>
-  <span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="kt">int</span><span class="o">&gt;</span> <span class="n">type_codes</span><span class="p">(</span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">());</span>
-
-  <span class="c1">// Initialize a TVM arg setter with TVMValue and its type code.</span>
-  <span class="n">TVMArgsSetter</span> <span class="n">setter</span><span class="p">(</span><span class="n">values</span><span class="p">.</span><span class="n">data</span><span class="p">(),</span> <span class="n">type_codes</span><span class="p">.</span><span class="n">data</span><span class="p">());</span>
-
-  <span class="c1">// Set each argument to its corresponding data entry.</span>
-  <span class="k">if</span> <span class="p">(</span><span class="n">op_id_</span><span class="p">[</span><span class="n">id</span><span class="p">]</span> <span class="o">==</span> <span class="s">&quot;add&quot;</span> <span class="o">||</span> <span class="n">op_id_</span><span class="p">[</span><span class="n">id</span><span class="p">]</span> <span class="o">==</span> <span class="s">&quot;sub&quot;</span> <span class="o">||</span> <span class="n">op_id_</span><span class="p">[</span [...]
-    <span class="k">for</span> <span class="p">(</span><span class="kt">size_t</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">();</span> <span class="n">i</span><span class="o">++</span><span class="p">)</span> <span class="p">{</span>
-      <span class="n">setter</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="n">data_entry_</span><span class="p">[</span><span class="n">args</span><span class="p">[</span><span class="n">i</span><span class="p">]]);</span>
-    <span class="p">}</span>
-  <span class="p">}</span>
-
-  <span class="c1">// Invoke the corresponding operator function.</span>
-  <span class="k">if</span> <span class="p">(</span><span class="n">op_id_</span><span class="p">[</span><span class="n">id</span><span class="p">]</span> <span class="o">==</span> <span class="s">&quot;add&quot;</span><span class="p">)</span> <span class="p">{</span>
-    <span class="n">Add</span><span class="p">(</span><span class="n">values</span><span class="p">.</span><span class="n">data</span><span class="p">(),</span> <span class="n">type_codes</span><span class="p">.</span><span class="n">data</span><span class="p">(),</span> <span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">());</span>
-  <span class="p">}</span> <span class="k">else</span> <span class="k">if</span> <span class="p">(</span><span class="n">op_id_</span><span class="p">[</span><span class="n">id</span><span class="p">]</span> <span class="o">==</span> <span class="s">&quot;sub&quot;</span><span class="p">)</span> <span class="p">{</span>
-    <span class="n">Sub</span><span class="p">(</span><span class="n">values</span><span class="p">.</span><span class="n">data</span><span class="p">(),</span> <span class="n">type_codes</span><span class="p">.</span><span class="n">data</span><span class="p">(),</span> <span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">());</span>
-  <span class="p">}</span> <span class="k">else</span> <span class="k">if</span> <span class="p">(</span><span class="n">op_id_</span><span class="p">[</span><span class="n">id</span><span class="p">]</span> <span class="o">==</span> <span class="s">&quot;mul&quot;</span><span class="p">)</span> <span class="p">{</span>
-    <span class="n">Mul</span><span class="p">(</span><span class="n">values</span><span class="p">.</span><span class="n">data</span><span class="p">(),</span> <span class="n">type_codes</span><span class="p">.</span><span class="n">data</span><span class="p">(),</span> <span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">());</span>
-  <span class="p">}</span> <span class="k">else</span> <span class="p">{</span>
-    <span class="n">LOG</span><span class="p">(</span><span class="n">FATAL</span><span class="p">)</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;Unknown op: &quot;</span> <span class="o">&lt;&lt;</span> <span class="n">op_id_</span><span class="p">[</span><span class="n">id</span><span class="p">]</span> <span class="o">&lt;&lt;</span> <span class="s">&quot;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span>
-  <span class="p">}</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span><span class="w"> </span><span class="nf">Run</span><span class="p">(</span><span class="kt">int</span><span class="w"> </span><span class="n">id</span><span class="p">,</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="kt">int</span [...]
+<span class="w">  </span><span class="c1">// Make a list data entry indexs.</span>
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="kt">int</span><span class="o">&gt;</span><span class="w"> </span><span class="n">args</span><span class="p">(</span><span class="n">inputs</span><span class="p">.</span><span class="n">begin</span><span class="p">(),</span><span class="w"> </span><span class="n">inputs</span><span class="p">.</span><span class="n">end</span><span class="p">());< [...]
+<span class="w">  </span><span class="n">args</span><span class="p">.</span><span class="n">push_back</span><span class="p">(</span><span class="n">output</span><span class="p">);</span><span class="w"></span>
+
+<span class="w">  </span><span class="c1">// Initialize data holders.</span>
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">TVMValue</span><span class="o">&gt;</span><span class="w"> </span><span class="n">values</span><span class="p">(</span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">());</span><span class="w"></span>
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="kt">int</span><span class="o">&gt;</span><span class="w"> </span><span class="n">type_codes</span><span class="p">(</span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">());</span><span class="w"></span>
+
+<span class="w">  </span><span class="c1">// Initialize a TVM arg setter with TVMValue and its type code.</span>
+<span class="w">  </span><span class="n">TVMArgsSetter</span><span class="w"> </span><span class="n">setter</span><span class="p">(</span><span class="n">values</span><span class="p">.</span><span class="n">data</span><span class="p">(),</span><span class="w"> </span><span class="n">type_codes</span><span class="p">.</span><span class="n">data</span><span class="p">());</span><span class="w"></span>
+
+<span class="w">  </span><span class="c1">// Set each argument to its corresponding data entry.</span>
+<span class="w">  </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">op_id_</span><span class="p">[</span><span class="n">id</span><span class="p">]</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="s">&quot;add&quot;</span><span class="w"> </span><span class="o">||</span><span class="w"> </span><span class="n">op_id_</span><span class="p">[</span><span class="n">id</span><span class="p">]</span><span  [...]
+<span class="w">    </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="kt">size_t</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">&lt;</span><span class="w"> </span><span class="n">args</span><span class="p">.</span><span class="n">si [...]
+<span class="w">      </span><span class="n">setter</span><span class="p">(</span><span class="n">i</span><span class="p">,</span><span class="w"> </span><span class="n">data_entry_</span><span class="p">[</span><span class="n">args</span><span class="p">[</span><span class="n">i</span><span class="p">]]);</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+
+<span class="w">  </span><span class="c1">// Invoke the corresponding operator function.</span>
+<span class="w">  </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">op_id_</span><span class="p">[</span><span class="n">id</span><span class="p">]</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="s">&quot;add&quot;</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="n">Add</span><span class="p">(</span><span class="n">values</span><span class="p">.</span><span class="n">data</span><span class="p">(),</span><span class="w"> </span><span class="n">type_codes</span><span class="p">.</span><span class="n">data</span><span class="p">(),</span><span class="w"> </span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">());</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">op_id_</span><span class="p">[</span><span class="n">id</span><span class="p">]</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="s">&quot;sub&quot;</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span clas [...]
+<span class="w">    </span><span class="n">Sub</span><span class="p">(</span><span class="n">values</span><span class="p">.</span><span class="n">data</span><span class="p">(),</span><span class="w"> </span><span class="n">type_codes</span><span class="p">.</span><span class="n">data</span><span class="p">(),</span><span class="w"> </span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">());</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">op_id_</span><span class="p">[</span><span class="n">id</span><span class="p">]</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="s">&quot;mul&quot;</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span clas [...]
+<span class="w">    </span><span class="n">Mul</span><span class="p">(</span><span class="n">values</span><span class="p">.</span><span class="n">data</span><span class="p">(),</span><span class="w"> </span><span class="n">type_codes</span><span class="p">.</span><span class="n">data</span><span class="p">(),</span><span class="w"> </span><span class="n">args</span><span class="p">.</span><span class="n">size</span><span class="p">());</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="n">LOG</span><span class="p">(</span><span class="n">FATAL</span><span class="p">)</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;Unknown op: &quot;</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">op_id_</span><span class="p">[</span><span class="n">id</span><span class="p">]</span><span class="w"> </span><span class="o">&lt;&lt;</span>< [...]
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p><code class="docutils literal notranslate"><span class="pre">Run</span></code> function mainly has two parts. The first part allocates a list of <code class="docutils literal notranslate"><span class="pre">TVMValue</span></code>, and maps corresponding data entry blocks. This will become the arguments of our operator functions. The second part than invokes our operator functions. Although we use the same C functions as the previous example, you can replace <code class="docutils litera [...]
 <p>With the above functions implemented, our customized codegen and runtime can now execute subgraphs. The last step is to register an API (<code class="docutils literal notranslate"><span class="pre">examplejson_module_create</span></code>) to create this module:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;module.examplejson_module_create&quot;</span><span class="p">)</span>
-<span class="p">.</span><span class="n">set_body_typed</span><span class="p">([](</span><span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">code</span><span class="p">){</span>
-    <span class="k">auto</span> <span class="n">n</span> <span class="o">=</span> <span class="n">make_object</span><span class="o">&lt;</span><span class="n">ExampleJsonModule</span><span class="o">&gt;</span><span class="p">(</span><span class="n">code</span><span class="p">);</span>
-    <span class="k">return</span> <span class="n">runtime</span><span class="o">::</span><span class="n">Module</span><span class="p">(</span><span class="n">n</span><span class="p">);</span>
-<span class="p">});</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;module.examplejson_module_create&quot;</span><span class="p">)</span><span class="w"></span>
+<span class="p">.</span><span class="n">set_body_typed</span><span class="p">([](</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">code</span><span class="p">){</span><span class="w"></span>
+<span class="w">    </span><span class="k">auto</span><span class="w"> </span><span class="n">n</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">make_object</span><span class="o">&lt;</span><span class="n">ExampleJsonModule</span><span class="o">&gt;</span><span class="p">(</span><span class="n">code</span><span class="p">);</span><span class="w"></span>
+<span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="n">runtime</span><span class="o">::</span><span class="n">Module</span><span class="p">(</span><span class="n">n</span><span class="p">);</span><span class="w"></span>
+<span class="p">});</span><span class="w"></span>
 </pre></div>
 </div>
 </div>
@@ -1063,47 +1063,47 @@
 <h4>Implement SaveToBinary and LoadFromBinary<a class="headerlink" href="#implement-savetobinary-and-loadfrombinary" title="Permalink to this headline">¶</a></h4>
 <p>So far we have implemented the main features of a customized runtime so that it can be used like other TVM runtimes. However, when users want to save the built runtime to disk for deployment, TVM has no idea how to save it. This is why we implement <code class="docutils literal notranslate"><span class="pre">SaveToBinary</span></code> and <code class="docutils literal notranslate"><span class="pre">LoadFromBinary</span></code>, which tell TVM how should this custo [...]
 <p>We first implement the <code class="docutils literal notranslate"><span class="pre">SaveToBinary</span></code> function to allow users to save this module to disk.</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span> <span class="nf">SaveToBinary</span><span class="p">(</span><span class="n">dmlc</span><span class="o">::</span><span class="n">Stream</span><span class="o">*</span> <span class="n">stream</span><span class="p">)</span> <span class="k">final</span> <span class="p">{</span>
-    <span class="n">stream</span><span class="o">-&gt;</span><span class="n">Write</span><span class="p">(</span><span class="k">this</span><span class="o">-&gt;</span><span class="n">graph_json_</span><span class="p">);</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="kt">void</span><span class="w"> </span><span class="nf">SaveToBinary</span><span class="p">(</span><span class="n">dmlc</span><span class="o">::</span><span class="n">Stream</span><span class="o">*</span><span class="w"> </span><span class="n">stream</span><span class="p">)</span><span class="w"> </span><span class="k">final</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="n">stream</span><span class="o">-&gt;</span><span class="n">Write</span><span class="p">(</span><span class="k">this</span><span class="o">-&gt;</span><span class="n">graph_json_</span><span class="p">);</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>This function is pretty simple. Recall that the only argument we took in the constructor is a subgraph representation, meaning that we need only a subgraph representation to construct/recover this customized runtime module. As a result, <code class="docutils literal notranslate"><span class="pre">SaveToBinary</span></code> simply writes the subgraph to an output DMLC stream. That is, when users use <code class="docutils literal notranslate"><span class="pre">export_librar [...]
 <p>Similarly, <code class="docutils literal notranslate"><span class="pre">LoadFromBinary</span></code> reads the subgraph stream and reconstructs the customized runtime module:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">static</span> <span class="n">Module</span> <span class="nf">LoadFromBinary</span><span class="p">(</span><span class="kt">void</span><span class="o">*</span> <span class="n">strm</span><span class="p">)</span> <span class="p">{</span>
-  <span class="n">dmlc</span><span class="o">::</span><span class="n">Stream</span><span class="o">*</span> <span class="n">stream</span> <span class="o">=</span> <span class="k">static_cast</span><span class="o">&lt;</span><span class="n">dmlc</span><span class="o">::</span><span class="n">Stream</span><span class="o">*&gt;</span><span class="p">(</span><span class="n">strm</span><span class="p">);</span>
-  <span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">graph_json</span><span class="p">;</span>
-  <span class="n">stream</span><span class="o">-&gt;</span><span class="n">Read</span><span class="p">(</span><span class="o">&amp;</span><span class="n">graph_json</span><span class="p">);</span>
-  <span class="k">auto</span> <span class="n">n</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">::</span><span class="n">runtime</span><span class="o">::</span><span class="n">make_object</span><span class="o">&lt;</span><span class="n">ExampleJsonModule</span><span class="o">&gt;</span><span class="p">(</span><span class="n">graph_json</span><span class="p">);</span>
-  <span class="k">return</span> <span class="n">Module</span><span class="p">(</span><span class="n">n</span><span class="p">);</span>
-<span class="p">}</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">static</span><span class="w"> </span><span class="n">Module</span><span class="w"> </span><span class="nf">LoadFromBinary</span><span class="p">(</span><span class="kt">void</span><span class="o">*</span><span class="w"> </span><span class="n">strm</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="n">dmlc</span><span class="o">::</span><span class="n">Stream</span><span class="o">*</span><span class="w"> </span><span class="n">stream</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">static_cast</span><span class="o">&lt;</span><span class="n">dmlc</span><span class="o">::</span><span class="n">Stream</span><span class="o">*&gt;</span><span class="p">(</span><span class="n">strm</span><span class="p"> [...]
+<span class="w">  </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">graph_json</span><span class="p">;</span><span class="w"></span>
+<span class="w">  </span><span class="n">stream</span><span class="o">-&gt;</span><span class="n">Read</span><span class="p">(</span><span class="o">&amp;</span><span class="n">graph_json</span><span class="p">);</span><span class="w"></span>
+<span class="w">  </span><span class="k">auto</span><span class="w"> </span><span class="n">n</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">tvm</span><span class="o">::</span><span class="n">runtime</span><span class="o">::</span><span class="n">make_object</span><span class="o">&lt;</span><span class="n">ExampleJsonModule</span><span class="o">&gt;</span><span class="p">(</span><span class="n">graph_json</span><span class="p">);</span><spa [...]
+<span class="w">  </span><span class="k">return</span><span class="w"> </span><span class="n">Module</span><span class="p">(</span><span class="n">n</span><span class="p">);</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 <p>We also need to register this function to enable the corresponding Python API:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;module.loadbinary_examplejson&quot;</span><span class="p">)</span>
-<span class="p">.</span><span class="n">set_body_typed</span><span class="p">(</span><span class="n">ExampleJsonModule</span><span class="o">::</span><span class="n">LoadFromBinary</span><span class="p">);</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;module.loadbinary_examplejson&quot;</span><span class="p">)</span><span class="w"></span>
+<span class="p">.</span><span class="n">set_body_typed</span><span class="p">(</span><span class="n">ExampleJsonModule</span><span class="o">::</span><span class="n">LoadFromBinary</span><span class="p">);</span><span class="w"></span>
 </pre></div>
 </div>
 <p>The above registration means that when users call the <code class="docutils literal notranslate"><span class="pre">tvm.runtime.load_module(lib_path)</span></code> API and the exported library contains an ExampleJSON stream, our <code class="docutils literal notranslate"><span class="pre">LoadFromBinary</span></code> will be invoked to reconstruct the same customized runtime module.</p>
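 <p>As a concrete illustration, the following is a minimal Python sketch of that round trip, assuming <code class="docutils literal notranslate"><span class="pre">lib</span></code> is a compiled runtime module that embeds our <code class="docutils literal notranslate"><span class="pre">ExampleJsonModule</span></code> (the output file name is hypothetical):</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span># A minimal sketch, assuming `lib` embeds our ExampleJsonModule.
 # export_library invokes SaveToBinary to serialize the subgraph, and
 # load_module finds the ExampleJSON stream and invokes LoadFromBinary
 # through the registered &quot;module.loadbinary_examplejson&quot; API.
 import tvm
 
 lib.export_library(&quot;compiled_lib.so&quot;)
 loaded_lib = tvm.runtime.load_module(&quot;compiled_lib.so&quot;)
 </pre></div>
 </div>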
 <p>In addition, if you want to support module creation directly from an ExampleJSON file, you can also implement a simple function and register a Python API as follows:</p>
-<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">static</span> <span class="n">Module</span> <span class="nf">Create</span><span class="p">(</span><span class="k">const</span> <span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span> <span class="n">path</span><span class="p">)</span> <span class="p">{</span>
-    <span class="n">std</span><span class="o">::</span><span class="n">ifstream</span> <span class="n">filep</span><span class="p">;</span>
-    <span class="n">filep</span><span class="p">.</span><span class="n">open</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="n">std</span><span class="o">::</span><span class="n">ios</span><span class="o">::</span><span class="n">in</span><span class="p">);</span>
-    <span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">graph_json</span><span class="p">;</span>
-    <span class="n">std</span><span class="o">::</span><span class="n">string</span> <span class="n">line</span><span class="p">;</span>
-    <span class="k">while</span> <span class="p">(</span><span class="n">std</span><span class="o">::</span><span class="n">getline</span><span class="p">(</span><span class="n">filep</span><span class="p">,</span> <span class="n">line</span><span class="p">))</span> <span class="p">{</span>
-        <span class="n">graph_json</span> <span class="o">+=</span> <span class="n">line</span><span class="p">;</span>
-        <span class="n">graph_json</span> <span class="o">+=</span> <span class="s">&quot;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span>
-    <span class="p">}</span>
-    <span class="n">filep</span><span class="p">.</span><span class="n">close</span><span class="p">();</span>
-    <span class="k">auto</span> <span class="n">n</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">::</span><span class="n">runtime</span><span class="o">::</span><span class="n">make_object</span><span class="o">&lt;</span><span class="n">ExampleJsonModule</span><span class="o">&gt;</span><span class="p">(</span><span class="n">graph_json</span><span class="p">);</span>
-    <span class="k">return</span> <span class="n">Module</span><span class="p">(</span><span class="n">n</span><span class="p">);</span>
-<span class="p">}</span>
-
-<span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;module.loadfile_examplejson&quot;</span><span class="p">)</span>
-<span class="p">.</span><span class="n">set_body</span><span class="p">([](</span><span class="n">TVMArgs</span> <span class="n">args</span><span class="p">,</span> <span class="n">TVMRetValue</span><span class="o">*</span> <span class="n">rv</span><span class="p">)</span> <span class="p">{</span>
-    <span class="o">*</span><span class="n">rv</span> <span class="o">=</span> <span class="n">ExampleJsonModule</span><span class="o">::</span><span class="n">Create</span><span class="p">(</span><span class="n">args</span><span class="p">[</span><span class="mi">0</span><span class="p">]);</span>
-<span class="p">});</span>
+<div class="highlight-c++ notranslate"><div class="highlight"><pre><span></span><span class="k">static</span><span class="w"> </span><span class="n">Module</span><span class="w"> </span><span class="nf">Create</span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span><span class="w"> </span><span class="n">path</span><span class="p">)</span><span class="w">  [...]
+<span class="w">    </span><span class="n">std</span><span class="o">::</span><span class="n">ifstream</span><span class="w"> </span><span class="n">filep</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="n">filep</span><span class="p">.</span><span class="n">open</span><span class="p">(</span><span class="n">path</span><span class="p">,</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">ios</span><span class="o">::</span><span class="n">in</span><span class="p">);</span><span class="w"></span>
+<span class="w">    </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">graph_json</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="w"> </span><span class="n">line</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="k">while</span><span class="w"> </span><span class="p">(</span><span class="n">std</span><span class="o">::</span><span class="n">getline</span><span class="p">(</span><span class="n">filep</span><span class="p">,</span><span class="w"> </span><span class="n">line</span><span class="p">))</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">        </span><span class="n">graph_json</span><span class="w"> </span><span class="o">+=</span><span class="w"> </span><span class="n">line</span><span class="p">;</span><span class="w"></span>
+<span class="w">        </span><span class="n">graph_json</span><span class="w"> </span><span class="o">+=</span><span class="w"> </span><span class="s">&quot;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">;</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+<span class="w">    </span><span class="n">filep</span><span class="p">.</span><span class="n">close</span><span class="p">();</span><span class="w"></span>
+<span class="w">    </span><span class="k">auto</span><span class="w"> </span><span class="n">n</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">tvm</span><span class="o">::</span><span class="n">runtime</span><span class="o">::</span><span class="n">make_object</span><span class="o">&lt;</span><span class="n">ExampleJsonModule</span><span class="o">&gt;</span><span class="p">(</span><span class="n">graph_json</span><span class="p">);</span><s [...]
+<span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="n">Module</span><span class="p">(</span><span class="n">n</span><span class="p">);</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
+
+<span class="n">TVM_REGISTER_GLOBAL</span><span class="p">(</span><span class="s">&quot;module.loadfile_examplejson&quot;</span><span class="p">)</span><span class="w"></span>
+<span class="p">.</span><span class="n">set_body</span><span class="p">([](</span><span class="n">TVMArgs</span><span class="w"> </span><span class="n">args</span><span class="p">,</span><span class="w"> </span><span class="n">TVMRetValue</span><span class="o">*</span><span class="w"> </span><span class="n">rv</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">    </span><span class="o">*</span><span class="n">rv</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ExampleJsonModule</span><span class="o">::</span><span class="n">Create</span><span class="p">(</span><span class="n">args</span><span class="p">[</span><span class="mi">0</span><span class="p">]);</span><span class="w"></span>
+<span class="p">});</span><span class="w"></span>
 </pre></div>
 </div>
 <p>This means users can manually write or modify an ExampleJSON file, and use the Python API <code class="docutils literal notranslate"><span class="pre">tvm.runtime.load_module(&quot;mysubgraph.examplejson&quot;,</span> <span class="pre">&quot;examplejson&quot;)</span></code> to construct a customized module.</p>
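 <p>A minimal sketch of this flow, assuming a hand-written <code class="docutils literal notranslate"><span class="pre">mysubgraph.examplejson</span></code> file already exists on disk:</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span># A minimal sketch; the file name is the one used in this example.
 # The second argument selects the &quot;examplejson&quot; format so that the
 # registered &quot;module.loadfile_examplejson&quot; API is invoked.
 import tvm
 
 mod = tvm.runtime.load_module(&quot;mysubgraph.examplejson&quot;, &quot;examplejson&quot;)
 </pre></div>
 </div>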
diff --git a/docs/dev/tutorial/codebase_walkthrough.html b/docs/dev/tutorial/codebase_walkthrough.html
index 4d35f93e7..e3249bb45 100644
--- a/docs/dev/tutorial/codebase_walkthrough.html
+++ b/docs/dev/tutorial/codebase_walkthrough.html
@@ -344,7 +344,7 @@
 <span class="k">class</span> <span class="nc">Tensor</span><span class="p">(</span><span class="n">Object</span><span class="p">,</span> <span class="n">_expr</span><span class="o">.</span><span class="n">ExprOp</span><span class="p">):</span>
     <span class="sd">&quot;&quot;&quot;Tensor object, to construct, see function.Tensor&quot;&quot;&quot;</span>
 
-    <span class="k">def</span> <span class="nf">__call__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">indices</span><span class="p">):</span>
+    <span class="k">def</span> <span class="fm">__call__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">indices</span><span class="p">):</span>
        <span class="o">...</span>
 </pre></div>
 </div>
diff --git a/docs/how_to/compile_models/from_coreml.html b/docs/how_to/compile_models/from_coreml.html
index 4a1b1d67f..c9d93d332 100644
--- a/docs/how_to/compile_models/from_coreml.html
+++ b/docs/how_to/compile_models/from_coreml.html
@@ -352,12 +352,12 @@
 <p>or please refer to official site
 <a class="reference external" href="https://github.com/apple/coremltools">https://github.com/apple/coremltools</a></p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
 <span class="kn">import</span> <span class="nn">tvm.relay</span> <span class="k">as</span> <span class="nn">relay</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
 <span class="kn">import</span> <span class="nn">coremltools</span> <span class="k">as</span> <span class="nn">cm</span>
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
-<span class="kn">from</span> <span class="nn">PIL</span> <span class="k">import</span> <span class="n">Image</span>
+<span class="kn">from</span> <span class="nn">PIL</span> <span class="kn">import</span> <span class="n">Image</span>
 </pre></div>
 </div>
 <div class="section" id="load-pretrained-coreml-model">
@@ -401,7 +401,7 @@ provided by apple in this example</p>
 <div class="section" id="execute-on-tvm">
 <h2>Execute on TVM<a class="headerlink" href="#execute-on-tvm" title="Permalink to this headline">¶</a></h2>
 <p>The process is no different from other example</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">graph_executor</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">graph_executor</span>
 
 <span class="n">dev</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">cpu</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
 <span class="n">dtype</span> <span class="o">=</span> <span class="s2">&quot;float32&quot;</span>
diff --git a/docs/how_to/compile_models/from_darknet.html b/docs/how_to/compile_models/from_darknet.html
index 8b2ffd9c9..af92d3ae1 100644
--- a/docs/how_to/compile_models/from_darknet.html
+++ b/docs/how_to/compile_models/from_darknet.html
@@ -359,11 +359,11 @@ pip install opencv-python
 
 <span class="c1"># tvm, relay</span>
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span>
-<span class="kn">from</span> <span class="nn">ctypes</span> <span class="k">import</span> <span class="o">*</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
-<span class="kn">from</span> <span class="nn">tvm.relay.testing.darknet</span> <span class="k">import</span> <span class="n">__darknetffi__</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span>
+<span class="kn">from</span> <span class="nn">ctypes</span> <span class="kn">import</span> <span class="o">*</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">tvm.relay.testing.darknet</span> <span class="kn">import</span> <span class="n">__darknetffi__</span>
 <span class="kn">import</span> <span class="nn">tvm.relay.testing.yolo_detection</span>
 <span class="kn">import</span> <span class="nn">tvm.relay.testing.darknet</span>
 </pre></div>
@@ -454,7 +454,7 @@ pip install opencv-python
 <div class="section" id="execute-on-tvm-runtime">
 <h2>Execute on TVM Runtime<a class="headerlink" href="#execute-on-tvm-runtime" title="Permalink to this headline">¶</a></h2>
 <p>The process is no different from other examples.</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">graph_executor</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">graph_executor</span>
 
 <span class="n">m</span> <span class="o">=</span> <a href="../../reference/api/python/graph_executor.html#tvm.contrib.graph_executor.GraphModule" title="View documentation for tvm.contrib.graph_executor.GraphModule"><span class="n">graph_executor</span><span class="o">.</span><span class="n">GraphModule</span></a><span class="p">(</span><span class="n">lib</span><span class="p">[</span><span class="s2">&quot;default&quot;</span><span class="p">](</span><span class="n">dev</span><span cla [...]
 
diff --git a/docs/how_to/compile_models/from_keras.html b/docs/how_to/compile_models/from_keras.html
index a02727557..29dab2989 100644
--- a/docs/how_to/compile_models/from_keras.html
+++ b/docs/how_to/compile_models/from_keras.html
@@ -354,9 +354,9 @@ pip install -U tensorflow --user
 <p>or please refer to official site
 <a class="reference external" href="https://keras.io/#installation">https://keras.io/#installation</a></p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
 <span class="kn">import</span> <span class="nn">tvm.relay</span> <span class="k">as</span> <span class="nn">relay</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
 <span class="kn">import</span> <span class="nn">keras</span>
 <span class="kn">import</span> <span class="nn">tensorflow</span> <span class="k">as</span> <span class="nn">tf</span>
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
@@ -394,9 +394,9 @@ pip install -U tensorflow --user
 <div class="section" id="load-a-test-image">
 <h2>Load a test image<a class="headerlink" href="#load-a-test-image" title="Permalink to this headline">¶</a></h2>
 <p>A single cat dominates the examples!</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">PIL</span> <span class="k">import</span> <span class="n">Image</span>
-<span class="kn">from</span> <span class="nn">matplotlib</span> <span class="k">import</span> <span class="n">pyplot</span> <span class="k">as</span> <span class="n">plt</span>
-<span class="kn">from</span> <span class="nn">tensorflow.keras.applications.resnet50</span> <span class="k">import</span> <span class="n">preprocess_input</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">PIL</span> <span class="kn">import</span> <span class="n">Image</span>
+<span class="kn">from</span> <span class="nn">matplotlib</span> <span class="kn">import</span> <span class="n">pyplot</span> <span class="k">as</span> <span class="n">plt</span>
+<span class="kn">from</span> <span class="nn">tensorflow.keras.applications.resnet50</span> <span class="kn">import</span> <span class="n">preprocess_input</span>
 
 <span class="n">img_url</span> <span class="o">=</span> <span class="s2">&quot;https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true&quot;</span>
 <span class="n">img_path</span> <span class="o">=</span> <span class="n">download_testdata</span><span class="p">(</span><span class="n">img_url</span><span class="p">,</span> <span class="s2">&quot;cat.png&quot;</span><span class="p">,</span> <span class="n">module</span><span class="o">=</span><span class="s2">&quot;data&quot;</span><span class="p">)</span>
diff --git a/docs/how_to/compile_models/from_mxnet.html b/docs/how_to/compile_models/from_mxnet.html
index 516c2a08b..fde1bb759 100644
--- a/docs/how_to/compile_models/from_mxnet.html
+++ b/docs/how_to/compile_models/from_mxnet.html
@@ -360,10 +360,10 @@
 <div class="section" id="download-resnet18-model-from-gluon-model-zoo">
 <h2>Download Resnet18 model from Gluon Model Zoo<a class="headerlink" href="#download-resnet18-model-from-gluon-model-zoo" title="Permalink to this headline">¶</a></h2>
 <p>In this section, we download a pretrained imagenet model and classify an image.</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
-<span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="k">import</span> <span class="n">get_model</span>
-<span class="kn">from</span> <span class="nn">PIL</span> <span class="k">import</span> <span class="n">Image</span>
-<span class="kn">from</span> <span class="nn">matplotlib</span> <span class="k">import</span> <span class="n">pyplot</span> <span class="k">as</span> <span class="n">plt</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="kn">import</span> <span class="n">get_model</span>
+<span class="kn">from</span> <span class="nn">PIL</span> <span class="kn">import</span> <span class="n">Image</span>
+<span class="kn">from</span> <span class="nn">matplotlib</span> <span class="kn">import</span> <span class="n">pyplot</span> <span class="k">as</span> <span class="n">plt</span>
 
 <span class="n">block</span> <span class="o">=</span> <span class="n">get_model</span><span class="p">(</span><span class="s2">&quot;resnet18_v1&quot;</span><span class="p">,</span> <span class="n">pretrained</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
 <span class="n">img_url</span> <span class="o">=</span> <span class="s2">&quot;https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true&quot;</span>
@@ -400,7 +400,7 @@
 </div>
 <img alt="../../_images/sphx_glr_from_mxnet_001.png" class="sphx-glr-single-img" src="../../_images/sphx_glr_from_mxnet_001.png" />
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip90c37b1c-4f10-4a8e-8112-80f1ef514b75 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip375983c7-f444-43eb-aeb6-1fc225818214 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
 x (1, 3, 224, 224)
 </pre></div>
 </div>
@@ -427,7 +427,7 @@ We support MXNet static graph(symbol) and HybridBlock in mxnet.gluon</p>
 <div class="section" id="execute-the-portable-graph-on-tvm">
 <h2>Execute the portable graph on TVM<a class="headerlink" href="#execute-the-portable-graph-on-tvm" title="Permalink to this headline">¶</a></h2>
 <p>Now, we would like to reproduce the same forward computation using TVM.</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">graph_executor</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">graph_executor</span>
 
 <span class="n">dev</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">cuda</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
 <span class="n">dtype</span> <span class="o">=</span> <span class="s2">&quot;float32&quot;</span>
diff --git a/docs/how_to/compile_models/from_onnx.html b/docs/how_to/compile_models/from_onnx.html
index 7be3bda09..17c40329e 100644
--- a/docs/how_to/compile_models/from_onnx.html
+++ b/docs/how_to/compile_models/from_onnx.html
@@ -355,9 +355,9 @@
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">onnx</span>
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
 <span class="kn">import</span> <span class="nn">tvm.relay</span> <span class="k">as</span> <span class="nn">relay</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
 </pre></div>
 </div>
 <div class="section" id="load-pretrained-onnx-model">
@@ -386,7 +386,7 @@ we skip the pytorch model construction part, and download the saved onnx model</
 axis, a 672x672 image. Re-scale the cat image to fit this input shape then
 convert to <cite>YCbCr</cite>. The super resolution model will then be applied to the
 luminance (<cite>Y</cite>) channel.</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">PIL</span> <span class="k">import</span> <span class="n">Image</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">PIL</span> <span class="kn">import</span> <span class="n">Image</span>
 
 <span class="n">img_url</span> <span class="o">=</span> <span class="s2">&quot;https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true&quot;</span>
 <span class="n">img_path</span> <span class="o">=</span> <span class="n">download_testdata</span><span class="p">(</span><span class="n">img_url</span><span class="p">,</span> <span class="s2">&quot;cat.png&quot;</span><span class="p">,</span> <span class="n">module</span><span class="o">=</span><span class="s2">&quot;data&quot;</span><span class="p">)</span>
@@ -438,7 +438,7 @@ provides a static definition of the input size.</p>
 <p>We put input and output image neck to neck. The luminance channel, <cite>Y</cite> is the output
 from the model. The chroma channels <cite>Cb</cite> and <cite>Cr</cite> are resized to match with a simple
 bicubic algorithm. The image is then recombined and converted back to <cite>RGB</cite>.</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">matplotlib</span> <span class="k">import</span> <span class="n">pyplot</span> <span class="k">as</span> <span class="n">plt</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">matplotlib</span> <span class="kn">import</span> <span class="n">pyplot</span> <span class="k">as</span> <span class="n">plt</span>
 
 <span class="n">out_y</span> <span class="o">=</span> <span class="n">Image</span><span class="o">.</span><span class="n">fromarray</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">uint8</span><span class="p">((</span><span class="n">tvm_output</span><span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">])</span><span class="o">.</span><span class="n">clip</span><span class="p">(</s [...]
 <span class="n">out_cb</span> <span class="o">=</span> <span class="n">img_cb</span><span class="o">.</span><span class="n">resize</span><span class="p">(</span><span class="n">out_y</span><span class="o">.</span><span class="n">size</span><span class="p">,</span> <span class="n">Image</span><span class="o">.</span><span class="n">BICUBIC</span><span class="p">)</span>
diff --git a/docs/how_to/compile_models/from_paddle.html b/docs/how_to/compile_models/from_paddle.html
index 92a8e524e..402f4d47c 100644
--- a/docs/how_to/compile_models/from_paddle.html
+++ b/docs/how_to/compile_models/from_paddle.html
@@ -355,8 +355,8 @@ A quick solution is</p>
 <span class="kn">import</span> <span class="nn">paddle</span>
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
@@ -397,7 +397,7 @@ A quick solution is</p>
 <div class="section" id="load-a-test-image">
 <h2>Load a test image<a class="headerlink" href="#load-a-test-image" title="Permalink to this headline">¶</a></h2>
 <p>A single cat dominates the examples!</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">PIL</span> <span class="k">import</span> <span class="n">Image</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">PIL</span> <span class="kn">import</span> <span class="n">Image</span>
 <span class="kn">import</span> <span class="nn">paddle.vision.transforms</span> <span class="k">as</span> <span class="nn">T</span>
 
 
@@ -456,14 +456,14 @@ A quick solution is</p>
     <span class="n">synset</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">readlines</span><span class="p">()</span>
 
 <span class="n">top1</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="n">tvm_output</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
-<span class="nb">print</span><span class="p">(</span><span class="n">f</span><span class="s2">&quot;TVM prediction top-1 id: </span><span class="si">{top1}</span><span class="s2">, class name: </span><span class="si">{synset[top1]}</span><span class="s2">&quot;</span><span class="p">)</span>
+<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;TVM prediction top-1 id: </span><span class="si">{</span><span class="n">top1</span><span class="si">}</span><span class="s2">, class name: </span><span class="si">{</span><span class="n">synset</span><span class="p">[</span><span class="n">top1</span><span class="p">]</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>TVM prediction top-1 id: 282, class name:  282: &#39;tiger cat&#39;,
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  6.070 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  4.802 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-paddle-py">
 <div class="sphx-glr-download docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/16269b77359771348d507395692524cf/from_paddle.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_paddle.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/from_pytorch.html b/docs/how_to/compile_models/from_pytorch.html
index e326dd635..3def1c8c3 100644
--- a/docs/how_to/compile_models/from_pytorch.html
+++ b/docs/how_to/compile_models/from_pytorch.html
@@ -359,11 +359,11 @@ with the proper TorchVision version.</p>
 <p>Currently, TVM supports PyTorch 1.7 and 1.4. Other versions may
 be unstable.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span>
 
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 
-<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
 
 <span class="c1"># PyTorch imports</span>
 <span class="kn">import</span> <span class="nn">torch</span>
@@ -386,24 +386,25 @@ be unstable.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: &quot;https://download.pytorch.org/models/resnet18-f37072fd.pth&quot; to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
 
   0%|          | 0.00/44.7M [00:00&lt;?, ?B/s]
-  8%|8         | 3.58M/44.7M [00:00&lt;00:01, 37.5MB/s]
- 19%|#8        | 8.34M/44.7M [00:00&lt;00:00, 44.6MB/s]
- 77%|#######7  | 34.5M/44.7M [00:00&lt;00:00, 149MB/s]
-100%|##########| 44.7M/44.7M [00:00&lt;00:00, 139MB/s]
+  4%|3         | 1.70M/44.7M [00:00&lt;00:02, 17.7MB/s]
+  8%|7         | 3.40M/44.7M [00:00&lt;00:02, 15.6MB/s]
+ 37%|###6      | 16.4M/44.7M [00:00&lt;00:00, 66.9MB/s]
+ 72%|#######2  | 32.2M/44.7M [00:00&lt;00:00, 104MB/s]
+100%|##########| 44.7M/44.7M [00:00&lt;00:00, 89.7MB/s]
 </pre></div>
 </div>
 </div>
 <div class="section" id="load-a-test-image">
 <h2>Load a test image<a class="headerlink" href="#load-a-test-image" title="Permalink to this headline">¶</a></h2>
 <p>Classic cat example!</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">PIL</span> <span class="k">import</span> <span class="n">Image</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">PIL</span> <span class="kn">import</span> <span class="n">Image</span>
 
 <span class="n">img_url</span> <span class="o">=</span> <span class="s2">&quot;https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true&quot;</span>
 <span class="n">img_path</span> <span class="o">=</span> <span class="n">download_testdata</span><span class="p">(</span><span class="n">img_url</span><span class="p">,</span> <span class="s2">&quot;cat.png&quot;</span><span class="p">,</span> <span class="n">module</span><span class="o">=</span><span class="s2">&quot;data&quot;</span><span class="p">)</span>
 <span class="n">img</span> <span class="o">=</span> <span class="n">Image</span><span class="o">.</span><span class="n">open</span><span class="p">(</span><span class="n">img_path</span><span class="p">)</span><span class="o">.</span><span class="n">resize</span><span class="p">((</span><span class="mi">224</span><span class="p">,</span> <span class="mi">224</span><span class="p">))</span>
 
 <span class="c1"># Preprocess the image and convert to tensor</span>
-<span class="kn">from</span> <span class="nn">torchvision</span> <span class="k">import</span> <span class="n">transforms</span>
+<span class="kn">from</span> <span class="nn">torchvision</span> <span class="kn">import</span> <span class="n">transforms</span>
 
 <span class="n">my_preprocess</span> <span class="o">=</span> <span class="n">transforms</span><span class="o">.</span><span class="n">Compose</span><span class="p">(</span>
     <span class="p">[</span>
@@ -440,7 +441,7 @@ be unstable.</p>
 <div class="section" id="execute-the-portable-graph-on-tvm">
 <h2>Execute the portable graph on TVM<a class="headerlink" href="#execute-the-portable-graph-on-tvm" title="Permalink to this headline">¶</a></h2>
 <p>Now we can try deploying the compiled model on target.</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">graph_executor</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">graph_executor</span>
 
 <span class="n">dtype</span> <span class="o">=</span> <span class="s2">&quot;float32&quot;</span>
 <span class="n">m</span> <span class="o">=</span> <a href="../../reference/api/python/graph_executor.html#tvm.contrib.graph_executor.GraphModule" title="View documentation for tvm.contrib.graph_executor.GraphModule"><span class="n">graph_executor</span><span class="o">.</span><span class="n">GraphModule</span></a><span class="p">(</span><span class="n">lib</span><span class="p">[</span><span class="s2">&quot;default&quot;</span><span class="p">](</span><span class="n">dev</span><span cla [...]
diff --git a/docs/how_to/compile_models/from_tensorflow.html b/docs/how_to/compile_models/from_tensorflow.html
index 023fcea18..6ae2f0b7e 100644
--- a/docs/how_to/compile_models/from_tensorflow.html
+++ b/docs/how_to/compile_models/from_tensorflow.html
@@ -351,8 +351,8 @@
 <p>Please refer to <a class="reference external" href="https://www.tensorflow.org/install">https://www.tensorflow.org/install</a></p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># tvm, relay</span>
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span>
 
 <span class="c1"># os and numpy</span>
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
@@ -421,7 +421,7 @@ from tensorflow.</p>
 <div class="section" id="download-required-files">
 <h2>Download required files<a class="headerlink" href="#download-required-files" title="Permalink to this headline">¶</a></h2>
 <p>Download files listed above.</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
 
 <span class="n">img_path</span> <span class="o">=</span> <span class="n">download_testdata</span><span class="p">(</span><span class="n">image_url</span><span class="p">,</span> <span class="n">img_name</span><span class="p">,</span> <span class="n">module</span><span class="o">=</span><span class="s2">&quot;data&quot;</span><span class="p">)</span>
 <span class="n">model_path</span> <span class="o">=</span> <span class="n">download_testdata</span><span class="p">(</span><span class="n">model_url</span><span class="p">,</span> <span class="n">model_name</span><span class="p">,</span> <span class="n">module</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;tf&quot;</span><span class="p">,</span> <span class="s2">&quot;InceptionV1&quot;</span><span class="p">])</span>
@@ -453,7 +453,7 @@ from tensorflow.</p>
 JpegDecode is bypassed (it just returns the source node).
 Hence we supply the decoded frame to TVM instead.</p>
 </div>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">PIL</span> <span class="k">import</span> <span class="n">Image</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">PIL</span> <span class="kn">import</span> <span class="n">Image</span>
 
 <span class="n">image</span> <span class="o">=</span> <span class="n">Image</span><span class="o">.</span><span class="n">open</span><span class="p">(</span><span class="n">img_path</span><span class="p">)</span><span class="o">.</span><span class="n">resize</span><span class="p">((</span><span class="mi">299</span><span class="p">,</span> <span class="mi">299</span><span class="p">))</span>
 
@@ -502,7 +502,7 @@ lib: target library which can be deployed on target with TVM runtime.</p>
 <div class="section" id="execute-the-portable-graph-on-tvm">
 <h2>Execute the portable graph on TVM<a class="headerlink" href="#execute-the-portable-graph-on-tvm" title="Permalink to this headline">¶</a></h2>
 <p>Now we can try deploying the compiled model on target.</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">graph_executor</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">graph_executor</span>
 
 <span class="n">dtype</span> <span class="o">=</span> <span class="s2">&quot;uint8&quot;</span>
 <span class="n">m</span> <span class="o">=</span> <a href="../../reference/api/python/graph_executor.html#tvm.contrib.graph_executor.GraphModule" title="View documentation for tvm.contrib.graph_executor.GraphModule"><span class="n">graph_executor</span><span class="o">.</span><span class="n">GraphModule</span></a><span class="p">(</span><span class="n">lib</span><span class="p">[</span><span class="s2">&quot;default&quot;</span><span class="p">](</span><span class="n">dev</span><span cla [...]
@@ -606,7 +606,7 @@ banana (score = 0.00022)
 desk (score = 0.00019)
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  0.457 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  2.588 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-tensorflow-py">
 <div class="sphx-glr-download docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7f1d3d1b878694c201c614c807cdebc8/from_tensorflow.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_tensorflow.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/from_tflite.html b/docs/how_to/compile_models/from_tflite.html
index 0e54d4e3f..5b5bbe504 100644
--- a/docs/how_to/compile_models/from_tflite.html
+++ b/docs/how_to/compile_models/from_tflite.html
@@ -389,7 +389,7 @@ flatc --python schema.fbs
 <div class="section" id="load-pretrained-tflite-model">
 <h2>Load pretrained TFLite model<a class="headerlink" href="#load-pretrained-tflite-model" title="Permalink to this headline">¶</a></h2>
 <p>Load mobilenet V1 TFLite model provided by Google</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
 
 <span class="n">model_url</span> <span class="o">=</span> <span class="s2">&quot;http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz&quot;</span>
 
@@ -417,8 +417,8 @@ flatc --python schema.fbs
 <div class="section" id="load-a-test-image">
 <h2>Load a test image<a class="headerlink" href="#load-a-test-image" title="Permalink to this headline">¶</a></h2>
 <p>A single cat dominates the examples!</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">PIL</span> <span class="k">import</span> <span class="n">Image</span>
-<span class="kn">from</span> <span class="nn">matplotlib</span> <span class="k">import</span> <span class="n">pyplot</span> <span class="k">as</span> <span class="n">plt</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">PIL</span> <span class="kn">import</span> <span class="n">Image</span>
+<span class="kn">from</span> <span class="nn">matplotlib</span> <span class="kn">import</span> <span class="n">pyplot</span> <span class="k">as</span> <span class="n">plt</span>
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 
 <span class="n">image_url</span> <span class="o">=</span> <span class="s2">&quot;https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true&quot;</span>
@@ -453,7 +453,7 @@ flatc --python schema.fbs
 <span class="n">input_dtype</span> <span class="o">=</span> <span class="s2">&quot;float32&quot;</span>
 
 <span class="c1"># Parse TFLite model and convert it to a Relay module</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">transform</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">transform</span>
 
 <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <a href="../../reference/api/python/relay/frontend.html#tvm.relay.frontend.from_tflite" title="View documentation for tvm.relay.frontend.from_tflite"><span class="n">relay</span><span class="o">.</span><span class="n">frontend</span><span class="o">.</span><span class="n">from_tflite</span></a><span class="p">(</span>
     <span class="n">tflite_model</span><span class="p">,</span> <span class="n">shape_dict</span><span class="o">=</span><span class="p">{</span><span class="n">input_tensor</span><span class="p">:</span> <span class="n">input_shape</span><span class="p">},</span> <span class="n">dtype_dict</span><span class="o">=</span><span class="p">{</span><span class="n">input_tensor</span><span class="p">:</span> <span class="n">input_dtype</span><span class="p">}</span>
@@ -469,8 +469,8 @@ flatc --python schema.fbs
 <div class="section" id="execute-on-tvm">
 <h2>Execute on TVM<a class="headerlink" href="#execute-on-tvm" title="Permalink to this headline">¶</a></h2>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">graph_executor</span> <span class="k">as</span> <span class="n">runtime</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">graph_executor</span> <span class="k">as</span> <span class="n">runtime</span>
 
 <span class="c1"># Create a runtime executor module</span>
 <span class="n">module</span> <span class="o">=</span> <a href="../../reference/api/python/graph_executor.html#tvm.contrib.graph_executor.GraphModule" title="View documentation for tvm.contrib.graph_executor.GraphModule"><span class="n">runtime</span><span class="o">.</span><span class="n">GraphModule</span></a><span class="p">(</span><span class="n">lib</span><span class="p">[</span><span class="s2">&quot;default&quot;</span><span class="p">](</span><span class="n">tvm</span><span class [...]
diff --git a/docs/how_to/compile_models/sg_execution_times.html b/docs/how_to/compile_models/sg_execution_times.html
index 09a9b5cc6..64836ef1a 100644
--- a/docs/how_to/compile_models/sg_execution_times.html
+++ b/docs/how_to/compile_models/sg_execution_times.html
@@ -300,17 +300,17 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-compile-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>04:44.140</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
+<p><strong>04:50.106</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
 <ul class="simple">
-<li><p><strong>01:06.070</strong>: <a class="reference internal" href="from_paddle.html#sphx-glr-how-to-compile-models-from-paddle-py"><span class="std std-ref">Compile PaddlePaddle Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_paddle.py</span></code>)</p></li>
-<li><p><strong>01:00.457</strong>: <a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></li>
-<li><p><strong>00:55.771</strong>: <a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></li>
-<li><p><strong>00:24.997</strong>: <a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></li>
-<li><p><strong>00:21.753</strong>: <a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></li>
-<li><p><strong>00:20.903</strong>: <a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></li>
-<li><p><strong>00:18.634</strong>: <a class="reference internal" href="from_pytorch.html#sphx-glr-how-to-compile-models-from-pytorch-py"><span class="std std-ref">Compile PyTorch Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_pytorch.py</span></code>)</p></li>
-<li><p><strong>00:13.172</strong>: <a class="reference internal" href="from_keras.html#sphx-glr-how-to-compile-models-from-keras-py"><span class="std std-ref">Compile Keras Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_keras.py</span></code>)</p></li>
-<li><p><strong>00:02.383</strong>: <a class="reference internal" href="from_onnx.html#sphx-glr-how-to-compile-models-from-onnx-py"><span class="std std-ref">Compile ONNX Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_onnx.py</span></code>)</p></li>
+<li><p><strong>01:04.802</strong>: <a class="reference internal" href="from_paddle.html#sphx-glr-how-to-compile-models-from-paddle-py"><span class="std std-ref">Compile PaddlePaddle Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_paddle.py</span></code>)</p></li>
+<li><p><strong>01:02.588</strong>: <a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></li>
+<li><p><strong>00:57.900</strong>: <a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></li>
+<li><p><strong>00:25.115</strong>: <a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></li>
+<li><p><strong>00:22.278</strong>: <a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></li>
+<li><p><strong>00:21.336</strong>: <a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></li>
+<li><p><strong>00:19.651</strong>: <a class="reference internal" href="from_pytorch.html#sphx-glr-how-to-compile-models-from-pytorch-py"><span class="std std-ref">Compile PyTorch Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_pytorch.py</span></code>)</p></li>
+<li><p><strong>00:13.779</strong>: <a class="reference internal" href="from_keras.html#sphx-glr-how-to-compile-models-from-keras-py"><span class="std std-ref">Compile Keras Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_keras.py</span></code>)</p></li>
+<li><p><strong>00:02.656</strong>: <a class="reference internal" href="from_onnx.html#sphx-glr-how-to-compile-models-from-onnx-py"><span class="std std-ref">Compile ONNX Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_onnx.py</span></code>)</p></li>
 </ul>
 </div>
 
diff --git a/docs/how_to/deploy/arm_compute_lib.html b/docs/how_to/deploy/arm_compute_lib.html
index 5d917c4fa..7c307ffd6 100644
--- a/docs/how_to/deploy/arm_compute_lib.html
+++ b/docs/how_to/deploy/arm_compute_lib.html
@@ -390,8 +390,8 @@ binaries are searched for by CMake in the default locations
 /path-to-tvm-project/acl/ will also be searched. It is likely that you will need to set your own path to
 locate ACL. This can be done by specifying a path in place of ON.</p>
 <p>These flags should be set in your config.cmake file. For example:</p>
-<div class="highlight-cmake notranslate"><div class="highlight"><pre><span></span><span class="nb">set</span><span class="p">(</span><span class="s">USE_ARM_COMPUTE_LIB</span> <span class="s">ON</span><span class="p">)</span>
-<span class="nb">set</span><span class="p">(</span><span class="s">USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR</span> <span class="s">/path/to/acl</span><span class="p">)</span>
+<div class="highlight-cmake notranslate"><div class="highlight"><pre><span></span><span class="nb">set</span><span class="p">(</span><span class="s">USE_ARM_COMPUTE_LIB</span><span class="w"> </span><span class="s">ON</span><span class="p">)</span>
+<span class="nb">set</span><span class="p">(</span><span class="s">USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR</span><span class="w"> </span><span class="s">/path/to/acl</span><span class="p">)</span>
 </pre></div>
 </div>
 </div>
@@ -467,14 +467,14 @@ as a result, how runtime tests will be run.</p>
 <li><p>device_key - The device key when connecting via a tracker.</p></li>
 <li><p>cross_compile - Path to cross compiler when connecting from a non-arm platform e.g. aarch64-linux-gnu-g++.</p></li>
 </ul>
-<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
-  <span class="nt">&quot;connection_type&quot;</span><span class="p">:</span> <span class="s2">&quot;local&quot;</span><span class="p">,</span>
-  <span class="nt">&quot;host&quot;</span><span class="p">:</span> <span class="s2">&quot;127.0.0.1&quot;</span><span class="p">,</span>
-  <span class="nt">&quot;port&quot;</span><span class="p">:</span> <span class="mi">9090</span><span class="p">,</span>
-  <span class="nt">&quot;target&quot;</span><span class="p">:</span> <span class="s2">&quot;llvm -mtriple=aarch64-linux-gnu -mattr=+neon&quot;</span><span class="p">,</span>
-  <span class="nt">&quot;device_key&quot;</span><span class="p">:</span> <span class="s2">&quot;&quot;</span><span class="p">,</span>
-  <span class="nt">&quot;cross_compile&quot;</span><span class="p">:</span> <span class="s2">&quot;&quot;</span>
-<span class="p">}</span>
+<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span><span class="w"></span>
+<span class="w">  </span><span class="nt">&quot;connection_type&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;local&quot;</span><span class="p">,</span><span class="w"></span>
+<span class="w">  </span><span class="nt">&quot;host&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;127.0.0.1&quot;</span><span class="p">,</span><span class="w"></span>
+<span class="w">  </span><span class="nt">&quot;port&quot;</span><span class="p">:</span><span class="w"> </span><span class="mi">9090</span><span class="p">,</span><span class="w"></span>
+<span class="w">  </span><span class="nt">&quot;target&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;llvm -mtriple=aarch64-linux-gnu -mattr=+neon&quot;</span><span class="p">,</span><span class="w"></span>
+<span class="w">  </span><span class="nt">&quot;device_key&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;&quot;</span><span class="p">,</span><span class="w"></span>
+<span class="w">  </span><span class="nt">&quot;cross_compile&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;&quot;</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 </div>
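
The JSON block above is the test-infrastructure config the ACL page documents. For completeness, a sketch that generates it programmatically (the output file name is an assumption; the fields mirror the documented ones exactly):

    import json

    config = {
        "connection_type": "local",
        "host": "127.0.0.1",
        "port": 9090,
        "target": "llvm -mtriple=aarch64-linux-gnu -mattr=+neon",
        "device_key": "",
        "cross_compile": "",
    }
    with open("test_config.json", "w") as f:  # file name assumed
        json.dump(config, f, indent=2)
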
diff --git a/docs/how_to/deploy/bnns.html b/docs/how_to/deploy/bnns.html
index 092f6e73f..67f493d7b 100644
--- a/docs/how_to/deploy/bnns.html
+++ b/docs/how_to/deploy/bnns.html
@@ -361,7 +361,7 @@ and will link tvm library to the BNNS runtime module.</p></li>
 <p>Enabling this flag triggers a search for the default Accelerate Frameworks on the current target SDK.
 The minimal required SDK versions are macOS 11.0, iOS 14.0, tvOS 14.0 and watchOS 7.0.</p>
 <p>Example setting in config.cmake file:</p>
-<div class="highlight-cmake notranslate"><div class="highlight"><pre><span></span><span class="nb">set</span><span class="p">(</span><span class="s">USE_BNNS</span> <span class="s">ON</span><span class="p">)</span>
+<div class="highlight-cmake notranslate"><div class="highlight"><pre><span></span><span class="nb">set</span><span class="p">(</span><span class="s">USE_BNNS</span><span class="w"> </span><span class="s">ON</span><span class="p">)</span>
 </pre></div>
 </div>
 </div>
@@ -410,7 +410,7 @@ intermediate tensors to NCHW data layout.</p>
 
 <span class="n">dtype</span> <span class="o">=</span> <span class="s2">&quot;float32&quot;</span>
 <span class="n">input_shape</span> <span class="o">=</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">224</span><span class="p">,</span> <span class="mi">224</span><span class="p">)</span>
-<span class="n">block</span> <span class="o">=</span> <span class="n">get_model</span><span class="p">(</span><span class="s1">&#39;mobilenetv2_1.0&#39;</span><span class="p">,</span> <span class="n">pretrained</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
+<span class="n">block</span> <span class="o">=</span> <span class="n">get_model</span><span class="p">(</span><span class="s1">&#39;mobilenetv2_1.0&#39;</span><span class="p">,</span> <span class="n">pretrained</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
 <span class="n">module</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">frontend</span><span class="o">.</span><span class="n">from_mxnet</span><span class="p">(</span><span class="n">block</span><span class="p">,</span> <span class="n">shape</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;data&#39;</span><span class="p">:</span> <span class="n">input_shape [...]
 </pre></div>
 </div>
@@ -434,7 +434,7 @@ regular TVM llvm compilation and code generation.</p>
 </div>
 <p>Load module and run inference on the target machine with TVM  built with <code class="docutils literal notranslate"><span class="pre">USE_BNNS</span></code> enabled</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">import</span> <span class="nn">numpy</span> <span class="kn">as</span> <span class="nn">np</span>
+<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 <span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">graph_executor</span>
 
 <span class="n">dev</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">cpu</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
diff --git a/docs/how_to/deploy/hls.html b/docs/how_to/deploy/hls.html
index cd9097407..c86832dc6 100644
--- a/docs/how_to/deploy/hls.html
+++ b/docs/how_to/deploy/hls.html
@@ -371,7 +371,7 @@
 </li>
 <li><p>run.py - a script to use FPGA as an accelerator.</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">import</span> <span class="nn">numpy</span> <span class="kn">as</span> <span class="nn">np</span>
+<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 <span class="kn">import</span> <span class="nn">os</span>
 
 <span class="n">tgt</span><span class="o">=</span><span class="s2">&quot;sdaccel&quot;</span>
diff --git a/docs/how_to/deploy/tensorrt.html b/docs/how_to/deploy/tensorrt.html
index 7e8414afd..15a465067 100644
--- a/docs/how_to/deploy/tensorrt.html
+++ b/docs/how_to/deploy/tensorrt.html
@@ -378,8 +378,8 @@ TensorRT library.</p></li>
 This will build TVM against the installed TensorRT library.</p></li>
 </ul>
 <p>Example setting in config.cmake file:</p>
-<div class="highlight-cmake notranslate"><div class="highlight"><pre><span></span><span class="nb">set</span><span class="p">(</span><span class="s">USE_TENSORRT_CODEGEN</span> <span class="s">ON</span><span class="p">)</span>
-<span class="nb">set</span><span class="p">(</span><span class="s">USE_TENSORRT_RUNTIME</span> <span class="s">/home/ubuntu/TensorRT-7.0.0.11</span><span class="p">)</span>
+<div class="highlight-cmake notranslate"><div class="highlight"><pre><span></span><span class="nb">set</span><span class="p">(</span><span class="s">USE_TENSORRT_CODEGEN</span><span class="w"> </span><span class="s">ON</span><span class="p">)</span>
+<span class="nb">set</span><span class="p">(</span><span class="s">USE_TENSORRT_RUNTIME</span><span class="w"> </span><span class="s">/home/ubuntu/TensorRT-7.0.0.11</span><span class="p">)</span>
 </pre></div>
 </div>
 </div>
@@ -393,7 +393,7 @@ This will build TVM against the installed TensorRT library.</p></li>
 
 <span class="n">dtype</span> <span class="o">=</span> <span class="s2">&quot;float32&quot;</span>
 <span class="n">input_shape</span> <span class="o">=</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">224</span><span class="p">,</span> <span class="mi">224</span><span class="p">)</span>
-<span class="n">block</span> <span class="o">=</span> <span class="n">get_model</span><span class="p">(</span><span class="s1">&#39;resnet18_v1&#39;</span><span class="p">,</span> <span class="n">pretrained</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
+<span class="n">block</span> <span class="o">=</span> <span class="n">get_model</span><span class="p">(</span><span class="s1">&#39;resnet18_v1&#39;</span><span class="p">,</span> <span class="n">pretrained</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
 <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">frontend</span><span class="o">.</span><span class="n">from_mxnet</span><span class="p">(</span><span class="n">block</span><span class="p">,</span> <span class="n">shape</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;data&#39;</span><span class="p">:</span> <span class="n">input_shape</s [...]
 </pre></div>
 </div>
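
Beyond the config.cmake flags shown above, the TensorRT page's compile step boils down to partitioning plus a pass-context option. A sketch, assuming the 0.8-era API in which partition_for_tensorrt returns the partitioned module together with a config dict, and `mod`/`params` come from relay.frontend.from_mxnet as shown above:

    import tvm
    from tvm import relay
    from tvm.relay.op.contrib.tensorrt import partition_for_tensorrt

    mod, config = partition_for_tensorrt(mod, params)
    with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
        lib = relay.build(mod, target="cuda", params=params)
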
diff --git a/docs/how_to/deploy/vitis_ai.html b/docs/how_to/deploy/vitis_ai.html
index 6e7e1a472..4be4c90d7 100644
--- a/docs/how_to/deploy/vitis_ai.html
+++ b/docs/how_to/deploy/vitis_ai.html
@@ -644,7 +644,7 @@ inside TVM.</p>
 <span class="kn">import</span> <span class="nn">pyxir.contrib.target.DPUCADF8H</span>
 
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">import</span> <span class="nn">tvm.relay</span> <span class="kn">as</span> <span class="nn">relay</span>
+<span class="kn">import</span> <span class="nn">tvm.relay</span> <span class="k">as</span> <span class="nn">relay</span>
 <span class="kn">from</span> <span class="nn">tvm.contrib.target</span> <span class="kn">import</span> <span class="n">vitis_ai</span>
 <span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">utils</span><span class="p">,</span> <span class="n">graph_executor</span>
 <span class="kn">from</span> <span class="nn">tvm.relay.op.contrib.vitis_ai</span> <span class="kn">import</span> <span class="n">partition_for_vitis_ai</span>
diff --git a/docs/how_to/deploy_models/deploy_model_on_android.html b/docs/how_to/deploy_models/deploy_model_on_android.html
index 85b3e6022..223d02979 100644
--- a/docs/how_to/deploy_models/deploy_model_on_android.html
+++ b/docs/how_to/deploy_models/deploy_model_on_android.html
@@ -340,15 +340,15 @@
 <p>This is an example of using Relay to compile a keras model and deploy it on Android device.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">os</span>
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
-<span class="kn">from</span> <span class="nn">PIL</span> <span class="k">import</span> <span class="n">Image</span>
+<span class="kn">from</span> <span class="nn">PIL</span> <span class="kn">import</span> <span class="n">Image</span>
 <span class="kn">import</span> <span class="nn">keras</span>
-<span class="kn">from</span> <span class="nn">keras.applications.mobilenet_v2</span> <span class="k">import</span> <span class="n">MobileNetV2</span>
+<span class="kn">from</span> <span class="nn">keras.applications.mobilenet_v2</span> <span class="kn">import</span> <span class="n">MobileNetV2</span>
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
 <span class="kn">import</span> <span class="nn">tvm.relay</span> <span class="k">as</span> <span class="nn">relay</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">rpc</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">utils</span><span class="p">,</span> <span class="n">ndk</span><span class="p">,</span> <span class="n">graph_executor</span> <span class="k">as</span> <span class="n">runtime</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">rpc</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">utils</span><span class="p">,</span> <span class="n">ndk</span><span class="p">,</span> <span class="n">graph_executor</span> <span class="k">as</span> <span class="n">runtime</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
 </pre></div>
 </div>
 <div class="section" id="setup-environment">
@@ -622,7 +622,7 @@ to the remote android device.</p>
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  16.4564      16.4444      16.9575      16.1097       0.2470
+  15.9748      15.8680      16.8013      15.7512       0.2894
 </pre></div>
 </div>
 </div>
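
The execution-time summary above is produced by TVM's time_evaluator. A sketch of how such numbers are gathered, assuming `module` is the remote GraphModule and `dev` the remote device set up earlier on the page:

    import numpy as np

    ftimer = module.module.time_evaluator("run", dev, number=1, repeat=10)
    prof_res = np.array(ftimer().results) * 1000  # seconds -> milliseconds
    print("mean: %.4f ms, std: %.4f ms" % (np.mean(prof_res), np.std(prof_res)))
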
diff --git a/docs/how_to/deploy_models/deploy_model_on_rasp.html b/docs/how_to/deploy_models/deploy_model_on_rasp.html
index 0658134b5..47f555b3e 100644
--- a/docs/how_to/deploy_models/deploy_model_on_rasp.html
+++ b/docs/how_to/deploy_models/deploy_model_on_rasp.html
@@ -340,11 +340,11 @@
 <p>This is an example of using Relay to compile a ResNet model and deploy
 it on Raspberry Pi.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
 <span class="kn">import</span> <span class="nn">tvm.relay</span> <span class="k">as</span> <span class="nn">relay</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">rpc</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">utils</span><span class="p">,</span> <span class="n">graph_executor</span> <span class="k">as</span> <span class="n">runtime</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">rpc</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">utils</span><span class="p">,</span> <span class="n">graph_executor</span> <span class="k">as</span> <span class="n">runtime</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
 </pre></div>
 </div>
 <div class="section" id="build-tvm-runtime-on-device">
@@ -400,8 +400,8 @@ successfully on your device.</p>
 <p>We will use a pre-trained model from
 <a class="reference external" href="https://mxnet.apache.org/api/python/gluon/model_zoo.html">MXNet Gluon model zoo</a>.
 You can find more details about this part in the tutorial <a class="reference internal" href="../compile_models/from_mxnet.html#tutorial-from-mxnet"><span class="std std-ref">Compile MXNet Models</span></a>.</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="k">import</span> <span class="n">get_model</span>
-<span class="kn">from</span> <span class="nn">PIL</span> <span class="k">import</span> <span class="n">Image</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="kn">import</span> <span class="n">get_model</span>
+<span class="kn">from</span> <span class="nn">PIL</span> <span class="kn">import</span> <span class="n">Image</span>
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 
 <span class="c1"># one line to get the model</span>
diff --git a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
index 01a450fca..72682cb60 100644
--- a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
+++ b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
@@ -352,10 +352,10 @@ with the proper TorchVision version.</p>
 <p>Currently, TVM supports PyTorch 1.7 and 1.4. Other versions may
 be unstable.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span>
-<span class="kn">from</span> <span class="nn">tvm.runtime.vm</span> <span class="k">import</span> <span class="n">VirtualMachine</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span>
+<span class="kn">from</span> <span class="nn">tvm.runtime.vm</span> <span class="kn">import</span> <span class="n">VirtualMachine</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
 
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 <span class="kn">import</span> <span class="nn">cv2</span>
@@ -385,7 +385,7 @@ be unstable.</p>
 
 
 <span class="k">class</span> <span class="nc">TraceWrapper</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span>
-    <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">model</span><span class="p">):</span>
+    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">model</span><span class="p">):</span>
         <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
         <span class="bp">self</span><span class="o">.</span><span class="n">model</span> <span class="o">=</span> <span class="n">model</span>
 
@@ -409,13 +409,16 @@ be unstable.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: &quot;https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth&quot; to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
 
   0%|          | 0.00/170M [00:00&lt;?, ?B/s]
-  9%|8         | 14.8M/170M [00:00&lt;00:01, 155MB/s]
- 23%|##3       | 39.8M/170M [00:00&lt;00:00, 218MB/s]
- 40%|###9      | 67.1M/170M [00:00&lt;00:00, 249MB/s]
- 55%|#####5    | 94.2M/170M [00:00&lt;00:00, 263MB/s]
- 71%|#######1  | 121M/170M [00:00&lt;00:00, 268MB/s]
- 86%|########6 | 146M/170M [00:00&lt;00:00, 261MB/s]
-100%|##########| 170M/170M [00:00&lt;00:00, 257MB/s]
+  2%|1         | 3.30M/170M [00:00&lt;00:05, 34.5MB/s]
+  4%|3         | 6.60M/170M [00:00&lt;00:05, 33.3MB/s]
+ 16%|#6        | 27.2M/170M [00:00&lt;00:01, 115MB/s]
+ 30%|##9       | 50.1M/170M [00:00&lt;00:00, 164MB/s]
+ 43%|####2     | 72.6M/170M [00:00&lt;00:00, 189MB/s]
+ 57%|#####7    | 97.3M/170M [00:00&lt;00:00, 213MB/s]
+ 71%|#######1  | 121M/170M [00:00&lt;00:00, 224MB/s]
+ 84%|########4 | 143M/170M [00:00&lt;00:00, 228MB/s]
+ 98%|#########8| 167M/170M [00:00&lt;00:00, 234MB/s]
+100%|##########| 170M/170M [00:00&lt;00:00, 194MB/s]
 /usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:3878: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
   for i in range(dim)
 /usr/local/lib/python3.7/dist-packages/torchvision/models/detection/anchor_utils.py:127: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the &#39;trunc&#39; function NOT &#39;floor&#39;). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode=&#39;trunc&#39;), or for actual floor division, use torch.div(a, b, rounding_mode=&#39;floor&#39;).
@@ -508,7 +511,7 @@ torchvision rcnn models.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Get 9 valid boxes
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  3.616 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  3.882 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-object-detection-pytorch-py">
 <div class="sphx-glr-download docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7795da4b258c8feff986668b95ef57ad/deploy_object_detection_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_object_detection_pytorch.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized.html b/docs/how_to/deploy_models/deploy_prequantized.html
index a4c91148a..13eaf1a4c 100644
--- a/docs/how_to/deploy_models/deploy_prequantized.html
+++ b/docs/how_to/deploy_models/deploy_prequantized.html
@@ -344,16 +344,16 @@ the quantization story in TVM can be found
 <p>Here, we demonstrate how to load and run models quantized by PyTorch, MXNet, and TFLite.
 Once loaded, we can run compiled, quantized models on any hardware TVM supports.</p>
 <p>First, necessary imports</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">PIL</span> <span class="k">import</span> <span class="n">Image</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">PIL</span> <span class="kn">import</span> <span class="n">Image</span>
 
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 
 <span class="kn">import</span> <span class="nn">torch</span>
-<span class="kn">from</span> <span class="nn">torchvision.models.quantization</span> <span class="k">import</span> <span class="n">mobilenet</span> <span class="k">as</span> <span class="n">qmobilenet</span>
+<span class="kn">from</span> <span class="nn">torchvision.models.quantization</span> <span class="kn">import</span> <span class="n">mobilenet</span> <span class="k">as</span> <span class="n">qmobilenet</span>
 
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
 </pre></div>
 </div>
 <p>Helper functions to run the demo</p>
@@ -450,7 +450,9 @@ training. Other models require a full post training calibration.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: &quot;https://download.pytorch.org/models/mobilenet_v2-b0353104.pth&quot; to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
 
   0%|          | 0.00/13.6M [00:00&lt;?, ?B/s]
-100%|##########| 13.6M/13.6M [00:00&lt;00:00, 182MB/s]
+ 26%|##6       | 3.59M/13.6M [00:00&lt;00:00, 36.2MB/s]
+ 52%|#####1    | 7.04M/13.6M [00:00&lt;00:00, 36.1MB/s]
+100%|##########| 13.6M/13.6M [00:00&lt;00:00, 57.5MB/s]
 </pre></div>
 </div>
 </div>
@@ -539,7 +541,7 @@ output values are identical out of 1000 outputs from mobilenet v2.</p>
 <p class="sphx-glr-script-out">Out:</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  90.5416      90.2102      102.7956     90.0396       1.3333
+  90.3311      90.2821      91.2938      90.0786       0.2095
 </pre></div>
 </div>
 <div class="admonition note">
@@ -578,7 +580,7 @@ This includes support for the VNNI 8 bit dot product instruction (CascadeLake or
 <div class="section" id="deploy-a-quantized-tflite-model">
 <h2>Deploy a quantized TFLite Model<a class="headerlink" href="#deploy-a-quantized-tflite-model" title="Permalink to this headline">¶</a></h2>
 <p>TODO</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  5.240 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  4.808 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-py">
 <div class="sphx-glr-download docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/fb8217c13f4351224c6cf3aacf1a87fc/deploy_prequantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized_tflite.html b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
index 58cadc8b4..6c75d048d 100644
--- a/docs/how_to/deploy_models/deploy_prequantized_tflite.html
+++ b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
@@ -357,14 +357,14 @@ pip install <span class="nv">tflite</span><span class="o">==</span><span class="
 <span class="kn">import</span> <span class="nn">tflite</span>
 
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span>
 </pre></div>
 </div>
 </div>
 <div class="section" id="download-pretrained-quantized-tflite-model">
 <h2>Download pretrained Quantized TFLite model<a class="headerlink" href="#download-pretrained-quantized-tflite-model" title="Permalink to this headline">¶</a></h2>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># Download mobilenet V2 TFLite model provided by Google</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
 
 <span class="n">model_url</span> <span class="o">=</span> <span class="p">(</span>
     <span class="s2">&quot;https://storage.googleapis.com/download.tensorflow.org/models/&quot;</span>
@@ -403,7 +403,7 @@ pip install <span class="nv">tflite</span><span class="o">==</span><span class="
 <div class="section" id="get-a-real-image-for-e2e-testing">
 <h2>Get a real image for e2e testing<a class="headerlink" href="#get-a-real-image-for-e2e-testing" title="Permalink to this headline">¶</a></h2>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">get_real_image</span><span class="p">(</span><span class="n">im_height</span><span class="p">,</span> <span class="n">im_width</span><span class="p">):</span>
-    <span class="kn">from</span> <span class="nn">PIL</span> <span class="k">import</span> <span class="n">Image</span>
+    <span class="kn">from</span> <span class="nn">PIL</span> <span class="kn">import</span> <span class="n">Image</span>
 
     <span class="n">repo_base</span> <span class="o">=</span> <span class="s2">&quot;https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/&quot;</span>
     <span class="n">img_name</span> <span class="o">=</span> <span class="s2">&quot;elephant-299.jpg&quot;</span>
@@ -440,9 +440,9 @@ pip install <span class="nv">tflite</span><span class="o">==</span><span class="
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">run_tflite_model</span><span class="p">(</span><span class="n">tflite_model_buf</span><span class="p">,</span> <span class="n">input_data</span><span class="p">):</span>
     <span class="sd">&quot;&quot;&quot;Generic function to execute TFLite&quot;&quot;&quot;</span>
     <span class="k">try</span><span class="p">:</span>
-        <span class="kn">from</span> <span class="nn">tensorflow</span> <span class="k">import</span> <span class="n">lite</span> <span class="k">as</span> <span class="n">interpreter_wrapper</span>
+        <span class="kn">from</span> <span class="nn">tensorflow</span> <span class="kn">import</span> <span class="n">lite</span> <span class="k">as</span> <span class="n">interpreter_wrapper</span>
     <span class="k">except</span> <span class="ne">ImportError</span><span class="p">:</span>
-        <span class="kn">from</span> <span class="nn">tensorflow.contrib</span> <span class="k">import</span> <span class="n">lite</span> <span class="k">as</span> <span class="n">interpreter_wrapper</span>
+        <span class="kn">from</span> <span class="nn">tensorflow.contrib</span> <span class="kn">import</span> <span class="n">lite</span> <span class="k">as</span> <span class="n">interpreter_wrapper</span>
 
     <span class="n">input_data</span> <span class="o">=</span> <span class="n">input_data</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">input_data</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> <span class="k">else</span> <span class="p">[</span><span class="n">input_data</span><span class="p">]</span>
 
@@ -470,7 +470,7 @@ pip install <span class="nv">tflite</span><span class="o">==</span><span class="
 </div>
 <p>Let us run inference with the TVM-compiled pre-quantized model and get the TVM prediction.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">run_tvm</span><span class="p">(</span><span class="n">lib</span><span class="p">):</span>
-    <span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">graph_executor</span>
+    <span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">graph_executor</span>
 
     <span class="n">rt_mod</span> <span class="o">=</span> <a href="../../reference/api/python/graph_executor.html#tvm.contrib.graph_executor.GraphModule" title="View documentation for tvm.contrib.graph_executor.GraphModule"><span class="n">graph_executor</span><span class="o">.</span><span class="n">GraphModule</span></a><span class="p">(</span><span class="n">lib</span><span class="p">[</span><span class="s2">&quot;default&quot;</span><span class="p">](</span><span class="n">tvm</span> [...]
     <span class="n">rt_mod</span><span class="o">.</span><span class="n">set_input</span><span class="p">(</span><span class="s2">&quot;input&quot;</span><span class="p">,</span> <span class="n">data</span><span class="p">)</span>
@@ -540,7 +540,7 @@ TFLite Top-5 labels: [387 102 386 341 349]
 <p class="sphx-glr-script-out">Out:</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  121.8275     121.7358     126.9073     120.5809      0.6996
+  120.7286     120.6932     123.0922     119.5067      0.6691
 </pre></div>
 </div>
 <div class="admonition note">
@@ -568,7 +568,7 @@ network for ARM CPU</span></a>.</p></li>
 </ul>
 </div></blockquote>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  53.611 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  52.034 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-tflite-py">
 <div class="sphx-glr-download docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/56691c7a27d45da61d112276334640d3/deploy_prequantized_tflite.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized_tflite.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_quantized.html b/docs/how_to/deploy_models/deploy_quantized.html
index 83682e3df..d8d520f7a 100644
--- a/docs/how_to/deploy_models/deploy_quantized.html
+++ b/docs/how_to/deploy_models/deploy_quantized.html
@@ -344,11 +344,11 @@ the quantization story in TVM can be found
 In this tutorial, we will import a GluonCV pre-trained model on ImageNet to
 Relay, quantize the Relay model, and then perform inference.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span>
 <span class="kn">import</span> <span class="nn">mxnet</span> <span class="k">as</span> <span class="nn">mx</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
-<span class="kn">from</span> <span class="nn">mxnet</span> <span class="k">import</span> <span class="n">gluon</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">mxnet</span> <span class="kn">import</span> <span class="n">gluon</span>
 <span class="kn">import</span> <span class="nn">logging</span>
 <span class="kn">import</span> <span class="nn">os</span>
 
@@ -480,7 +480,7 @@ for calibration. But the accuracy might be impacted.</p>
   DeprecationWarning,
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  28.479 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  14.969 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-quantized-py">
 <div class="sphx-glr-download docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7810ecf51bfc05f7d5e8a400ac3e815d/deploy_quantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_quantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_sparse.html b/docs/how_to/deploy_models/deploy_sparse.html
index a818a7afc..02342a0c1 100644
--- a/docs/how_to/deploy_models/deploy_sparse.html
+++ b/docs/how_to/deploy_models/deploy_sparse.html
@@ -390,10 +390,10 @@ tensorflow 2.2+ are required.</p>
 <span class="kn">import</span> <span class="nn">itertools</span>
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 <span class="kn">import</span> <span class="nn">tensorflow</span> <span class="k">as</span> <span class="nn">tf</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">runtime</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">graph_executor</span>
-<span class="kn">from</span> <span class="nn">tvm.relay</span> <span class="k">import</span> <span class="n">data_dep_optimization</span> <span class="k">as</span> <span class="n">ddo</span>
-<span class="kn">from</span> <span class="nn">tensorflow.python.framework.convert_to_constants</span> <span class="k">import</span> <span class="p">(</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">runtime</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">graph_executor</span>
+<span class="kn">from</span> <span class="nn">tvm.relay</span> <span class="kn">import</span> <span class="n">data_dep_optimization</span> <span class="k">as</span> <span class="n">ddo</span>
+<span class="kn">from</span> <span class="nn">tensorflow.python.framework.convert_to_constants</span> <span class="kn">import</span> <span class="p">(</span>
     <span class="n">convert_variables_to_constants_v2</span><span class="p">,</span>
 <span class="p">)</span>
 <span class="kn">import</span> <span class="nn">scipy.sparse</span> <span class="k">as</span> <span class="nn">sp</span>
diff --git a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
index 7d34edcc2..e5cdc7075 100644
--- a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
+++ b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
@@ -341,13 +341,13 @@
 <p>This article is an introductory tutorial on deploying SSD models with TVM.
 We will use a GluonCV pre-trained SSD model and convert it to Relay IR.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
 
-<span class="kn">from</span> <span class="nn">matplotlib</span> <span class="k">import</span> <span class="n">pyplot</span> <span class="k">as</span> <span class="n">plt</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">graph_executor</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
-<span class="kn">from</span> <span class="nn">gluoncv</span> <span class="k">import</span> <span class="n">model_zoo</span><span class="p">,</span> <span class="n">data</span><span class="p">,</span> <span class="n">utils</span>
+<span class="kn">from</span> <span class="nn">matplotlib</span> <span class="kn">import</span> <span class="n">pyplot</span> <span class="k">as</span> <span class="n">plt</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">graph_executor</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">gluoncv</span> <span class="kn">import</span> <span class="n">model_zoo</span><span class="p">,</span> <span class="n">data</span><span class="p">,</span> <span class="n">utils</span>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
@@ -415,22 +415,23 @@ to your device.</p>
 Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
 
   0%|          | 0/132723 [00:00&lt;?, ?KB/s]
-  3%|2         | 3819/132723 [00:00&lt;00:03, 38183.66KB/s]
-  9%|9         | 12493/132723 [00:00&lt;00:01, 66738.87KB/s]
- 16%|#5        | 21212/132723 [00:00&lt;00:01, 76073.58KB/s]
- 23%|##2       | 29893/132723 [00:00&lt;00:01, 80309.58KB/s]
- 29%|##9       | 38683/132723 [00:00&lt;00:01, 83040.67KB/s]
- 36%|###5      | 47463/132723 [00:00&lt;00:01, 84655.64KB/s]
- 42%|####2     | 56259/132723 [00:00&lt;00:00, 85732.37KB/s]
- 49%|####9     | 65037/132723 [00:00&lt;00:00, 86381.00KB/s]
- 56%|#####5    | 73852/132723 [00:00&lt;00:00, 86932.25KB/s]
- 62%|######2   | 82704/132723 [00:01&lt;00:00, 87419.45KB/s]
- 69%|######8   | 91540/132723 [00:01&lt;00:00, 87705.52KB/s]
- 76%|#######5  | 100365/132723 [00:01&lt;00:00, 87862.25KB/s]
- 82%|########2 | 109215/132723 [00:01&lt;00:00, 88053.35KB/s]
- 89%|########8 | 118021/132723 [00:01&lt;00:00, 87969.87KB/s]
- 96%|#########5| 126819/132723 [00:01&lt;00:00, 87564.90KB/s]
-100%|##########| 132723/132723 [00:01&lt;00:00, 84457.93KB/s]
+  5%|4         | 6006/132723 [00:00&lt;00:02, 60043.74KB/s]
+ 11%|#1        | 14931/132723 [00:00&lt;00:01, 77216.17KB/s]
+ 17%|#7        | 22653/132723 [00:00&lt;00:01, 57818.07KB/s]
+ 24%|##3       | 31487/132723 [00:00&lt;00:01, 67827.45KB/s]
+ 29%|##9       | 38744/132723 [00:00&lt;00:01, 48676.18KB/s]
+ 36%|###5      | 47684/132723 [00:00&lt;00:01, 58547.99KB/s]
+ 41%|####1     | 54527/132723 [00:00&lt;00:01, 54097.16KB/s]
+ 48%|####7     | 63515/132723 [00:01&lt;00:01, 62882.16KB/s]
+ 53%|#####3    | 70531/132723 [00:01&lt;00:01, 58778.23KB/s]
+ 59%|#####9    | 78874/132723 [00:01&lt;00:00, 64984.76KB/s]
+ 65%|######4   | 85872/132723 [00:01&lt;00:00, 60609.60KB/s]
+ 71%|#######1  | 94856/132723 [00:01&lt;00:00, 68110.70KB/s]
+ 78%|#######8  | 103809/132723 [00:01&lt;00:00, 73856.23KB/s]
+ 85%|########5 | 112829/132723 [00:01&lt;00:00, 78384.20KB/s]
+ 92%|#########1| 121823/132723 [00:01&lt;00:00, 81660.63KB/s]
+ 99%|#########8| 130793/132723 [00:01&lt;00:00, 83976.31KB/s]
+100%|##########| 132723/132723 [00:01&lt;00:00, 67888.43KB/s]
 </pre></div>
 </div>
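
For context, the download above is triggered by GluonCV's model zoo. A sketch
of the model-loading and Relay-conversion step, assuming the tutorial's
ssd_512_resnet50_v1_voc model and a 512x512 input shape:

    from gluoncv import model_zoo
    from tvm import relay

    # Fetch the pre-trained SSD model (this is what downloads the .zip above)
    block = model_zoo.get_model("ssd_512_resnet50_v1_voc", pretrained=True)
    # Convert the Gluon block to a Relay module plus parameter dict
    mod, params = relay.frontend.from_mxnet(block, {"data": (1, 3, 512, 512)})
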
 <p>Create TVM runtime and do inference
@@ -470,7 +471,7 @@ Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from h
 </pre></div>
 </div>
 <img alt="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" class="sphx-glr-single-img" src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" />
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  22.292 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  22.844 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-ssd-gluoncv-py">
 <div class="sphx-glr-download docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/cccb17d28e5e8b2e94ea8cd5ec59f6ed/deploy_ssd_gluoncv.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_ssd_gluoncv.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/sg_execution_times.html b/docs/how_to/deploy_models/sg_execution_times.html
index dc0dc3692..563e2895b 100644
--- a/docs/how_to/deploy_models/sg_execution_times.html
+++ b/docs/how_to/deploy_models/sg_execution_times.html
@@ -300,16 +300,16 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-deploy-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>10:42.710</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
+<p><strong>10:27.896</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
 <ul class="simple">
-<li><p><strong>03:03.616</strong>: <a class="reference internal" href="deploy_object_detection_pytorch.html#sphx-glr-how-to-deploy-models-deploy-object-detection-pytorch-py"><span class="std std-ref">Compile PyTorch Object Detection Models</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_object_detection_pytorch.py</span></code>)</p></li>
-<li><p><strong>02:22.292</strong>: <a class="reference internal" href="deploy_ssd_gluoncv.html#sphx-glr-how-to-deploy-models-deploy-ssd-gluoncv-py"><span class="std std-ref">Deploy Single Shot Multibox Detector(SSD) model</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_ssd_gluoncv.py</span></code>)</p></li>
-<li><p><strong>01:53.611</strong>: <a class="reference internal" href="deploy_prequantized_tflite.html#sphx-glr-how-to-deploy-models-deploy-prequantized-tflite-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized_tflite.py</span></code>)</p></li>
-<li><p><strong>01:28.479</strong>: <a class="reference internal" href="deploy_quantized.html#sphx-glr-how-to-deploy-models-deploy-quantized-py"><span class="std std-ref">Deploy a Quantized Model on Cuda</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_quantized.py</span></code>)</p></li>
-<li><p><strong>01:05.240</strong>: <a class="reference internal" href="deploy_prequantized.html#sphx-glr-how-to-deploy-models-deploy-prequantized-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized.py</span></code>)</p></li>
-<li><p><strong>00:27.764</strong>: <a class="reference internal" href="deploy_model_on_android.html#sphx-glr-how-to-deploy-models-deploy-model-on-android-py"><span class="std std-ref">Deploy the Pretrained Model on Android</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_android.py</span></code>)</p></li>
-<li><p><strong>00:21.519</strong>: <a class="reference internal" href="deploy_model_on_rasp.html#sphx-glr-how-to-deploy-models-deploy-model-on-rasp-py"><span class="std std-ref">Deploy the Pretrained Model on Raspberry Pi</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_rasp.py</span></code>)</p></li>
-<li><p><strong>00:00.188</strong>: <a class="reference internal" href="deploy_sparse.html#sphx-glr-how-to-deploy-models-deploy-sparse-py"><span class="std std-ref">Deploy a Hugging Face Pruned Model on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_sparse.py</span></code>)</p></li>
+<li><p><strong>03:03.882</strong>: <a class="reference internal" href="deploy_object_detection_pytorch.html#sphx-glr-how-to-deploy-models-deploy-object-detection-pytorch-py"><span class="std std-ref">Compile PyTorch Object Detection Models</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_object_detection_pytorch.py</span></code>)</p></li>
+<li><p><strong>02:22.844</strong>: <a class="reference internal" href="deploy_ssd_gluoncv.html#sphx-glr-how-to-deploy-models-deploy-ssd-gluoncv-py"><span class="std std-ref">Deploy Single Shot Multibox Detector(SSD) model</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_ssd_gluoncv.py</span></code>)</p></li>
+<li><p><strong>01:52.034</strong>: <a class="reference internal" href="deploy_prequantized_tflite.html#sphx-glr-how-to-deploy-models-deploy-prequantized-tflite-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized_tflite.py</span></code>)</p></li>
+<li><p><strong>01:14.969</strong>: <a class="reference internal" href="deploy_quantized.html#sphx-glr-how-to-deploy-models-deploy-quantized-py"><span class="std std-ref">Deploy a Quantized Model on Cuda</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_quantized.py</span></code>)</p></li>
+<li><p><strong>01:04.808</strong>: <a class="reference internal" href="deploy_prequantized.html#sphx-glr-how-to-deploy-models-deploy-prequantized-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized.py</span></code>)</p></li>
+<li><p><strong>00:27.820</strong>: <a class="reference internal" href="deploy_model_on_android.html#sphx-glr-how-to-deploy-models-deploy-model-on-android-py"><span class="std std-ref">Deploy the Pretrained Model on Android</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_android.py</span></code>)</p></li>
+<li><p><strong>00:21.350</strong>: <a class="reference internal" href="deploy_model_on_rasp.html#sphx-glr-how-to-deploy-models-deploy-model-on-rasp-py"><span class="std std-ref">Deploy the Pretrained Model on Raspberry Pi</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_rasp.py</span></code>)</p></li>
+<li><p><strong>00:00.190</strong>: <a class="reference internal" href="deploy_sparse.html#sphx-glr-how-to-deploy-models-deploy-sparse-py"><span class="std std-ref">Deploy a Hugging Face Pruned Model on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_sparse.py</span></code>)</p></li>
 </ul>
 </div>
 
diff --git a/docs/how_to/extend_tvm/bring_your_own_datatypes.html b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
index 7a7ba9360..eb4557c5f 100644
--- a/docs/how_to/extend_tvm/bring_your_own_datatypes.html
+++ b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
@@ -368,7 +368,7 @@ to show that any datatype can be used in the BYODT framework.</p>
 <h2>A Simple TVM Program<a class="headerlink" href="#a-simple-tvm-program" title="Permalink to this headline">¶</a></h2>
 <p>We’ll begin by writing a simple program in TVM; afterwards, we will rewrite it to use custom datatypes.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span>
 
 <span class="c1"># Our basic program: Z = X + Y</span>
 <span class="n">x</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">var</span><span class="p">(</span><span class="s2">&quot;x&quot;</span><span class="p">,</span> <span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">3</span><span class="p">,),</span> <span class="n">dtype</span><span class="o">=</span><span class="s2">&quot;float32&quot;</span><span class="p">)</span>
@@ -563,7 +563,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
 <p>First let us define two helper functions to get the mobilenet model and a cat image.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">get_mobilenet</span><span class="p">():</span>
     <span class="n">dshape</span> <span class="o">=</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">224</span><span class="p">,</span> <span class="mi">224</span><span class="p">)</span>
-    <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="k">import</span> <span class="n">get_model</span>
+    <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="kn">import</span> <span class="n">get_model</span>
 
     <span class="n">block</span> <span class="o">=</span> <span class="n">get_model</span><span class="p">(</span><span class="s2">&quot;mobilenet0.25&quot;</span><span class="p">,</span> <span class="n">pretrained</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
     <span class="n">shape_dict</span> <span class="o">=</span> <span class="p">{</span><span class="s2">&quot;data&quot;</span><span class="p">:</span> <span class="n">dshape</span><span class="p">}</span>
@@ -571,8 +571,8 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
 
 
 <span class="k">def</span> <span class="nf">get_cat_image</span><span class="p">():</span>
-    <span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
-    <span class="kn">from</span> <span class="nn">PIL</span> <span class="k">import</span> <span class="n">Image</span>
+    <span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
+    <span class="kn">from</span> <span class="nn">PIL</span> <span class="kn">import</span> <span class="n">Image</span>
 
     <span class="n">url</span> <span class="o">=</span> <span class="s2">&quot;https://gist.githubusercontent.com/zhreshold/bcda4716699ac97ea44f791c24310193/raw/fa7ef0e9c9a5daea686d6473a62aacd1a5885849/cat.png&quot;</span>
     <span class="n">dst</span> <span class="o">=</span> <span class="s2">&quot;cat.png&quot;</span>
@@ -588,7 +588,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zipb171556b-b337-4a31-9148-a8a50c7ffe93 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip19f69514-aa16-4970-ad6d-7e8a958c7858 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
 </pre></div>
 </div>
 <p>It’s easy to execute MobileNet with native TVM:</p>
@@ -614,7 +614,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
 </pre></div>
 </div>
 <p>Now, to actually convert the entire network, we have written <a class="reference external" href="https://github.com/gussmith23/tvm/blob/ea174c01c54a2529e19ca71e125f5884e728da6e/python/tvm/relay/frontend/change_datatype.py#L21">a pass in Relay</a> which simply converts all nodes within the model to use the new datatype.</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.relay.frontend.change_datatype</span> <span class="k">import</span> <a href="../../reference/api/python/relay/frontend.html#tvm.relay.frontend.ChangeDatatype" title="View documentation for tvm.relay.frontend.change_datatype.ChangeDatatype"><span class="n">ChangeDatatype</span></a>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.relay.frontend.change_datatype</span> <span class="kn">import</span> <a href="../../reference/api/python/relay/frontend.html#tvm.relay.frontend.ChangeDatatype" title="View documentation for tvm.relay.frontend.change_datatype.ChangeDatatype"><span class="n">ChangeDatatype</span></a>
 
 <span class="n">src_dtype</span> <span class="o">=</span> <span class="s2">&quot;float32&quot;</span>
 <span class="n">dst_dtype</span> <span class="o">=</span> <span class="s2">&quot;custom[myfloat]32&quot;</span>
@@ -650,7 +650,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Check failed: (lower) is false: Intrinsic lowering function for target llvm, intrinsic name tir.sqrt, type 150 not found
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Check failed: (lower) is false: FloatImm lowering function for target llvm type 150 not found
 </pre></div>
 </div>
 <p>When we attempt to run the model, we get a familiar error telling us that more functions need to be registered for myfloat.</p>
diff --git a/docs/how_to/extend_tvm/low_level_custom_pass.html b/docs/how_to/extend_tvm/low_level_custom_pass.html
index b260631df..b09500b60 100644
--- a/docs/how_to/extend_tvm/low_level_custom_pass.html
+++ b/docs/how_to/extend_tvm/low_level_custom_pass.html
@@ -356,7 +356,7 @@ visitor is implemented.</p></li>
 take a look at <code class="docutils literal notranslate"><span class="pre">python/tvm/build_module.py</span></code> to get some basics.</p></li>
 </ul>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 </pre></div>
 </div>
diff --git a/docs/how_to/extend_tvm/sg_execution_times.html b/docs/how_to/extend_tvm/sg_execution_times.html
index 19609e1da..dafc45577 100644
--- a/docs/how_to/extend_tvm/sg_execution_times.html
+++ b/docs/how_to/extend_tvm/sg_execution_times.html
@@ -300,12 +300,12 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-extend-tvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:38.522</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
+<p><strong>00:38.402</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
 <ul class="simple">
-<li><p><strong>00:35.019</strong>: <a class="reference internal" href="bring_your_own_datatypes.html#sphx-glr-how-to-extend-tvm-bring-your-own-datatypes-py"><span class="std std-ref">Bring Your Own Datatypes to TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">bring_your_own_datatypes.py</span></code>)</p></li>
-<li><p><strong>00:02.248</strong>: <a class="reference internal" href="use_pass_instrument.html#sphx-glr-how-to-extend-tvm-use-pass-instrument-py"><span class="std std-ref">How to Use TVM Pass Instrument</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_instrument.py</span></code>)</p></li>
-<li><p><strong>00:01.062</strong>: <a class="reference internal" href="use_pass_infra.html#sphx-glr-how-to-extend-tvm-use-pass-infra-py"><span class="std std-ref">How to Use TVM Pass Infra</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_infra.py</span></code>)</p></li>
-<li><p><strong>00:00.192</strong>: <a class="reference internal" href="low_level_custom_pass.html#sphx-glr-how-to-extend-tvm-low-level-custom-pass-py"><span class="std std-ref">Writing a Customized Pass</span></a> (<code class="docutils literal notranslate"><span class="pre">low_level_custom_pass.py</span></code>)</p></li>
+<li><p><strong>00:34.875</strong>: <a class="reference internal" href="bring_your_own_datatypes.html#sphx-glr-how-to-extend-tvm-bring-your-own-datatypes-py"><span class="std std-ref">Bring Your Own Datatypes to TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">bring_your_own_datatypes.py</span></code>)</p></li>
+<li><p><strong>00:02.254</strong>: <a class="reference internal" href="use_pass_instrument.html#sphx-glr-how-to-extend-tvm-use-pass-instrument-py"><span class="std std-ref">How to Use TVM Pass Instrument</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_instrument.py</span></code>)</p></li>
+<li><p><strong>00:01.077</strong>: <a class="reference internal" href="use_pass_infra.html#sphx-glr-how-to-extend-tvm-use-pass-infra-py"><span class="std std-ref">How to Use TVM Pass Infra</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_infra.py</span></code>)</p></li>
+<li><p><strong>00:00.196</strong>: <a class="reference internal" href="low_level_custom_pass.html#sphx-glr-how-to-extend-tvm-low-level-custom-pass-py"><span class="std std-ref">Writing a Customized Pass</span></a> (<code class="docutils literal notranslate"><span class="pre">low_level_custom_pass.py</span></code>)</p></li>
 </ul>
 </div>
 
diff --git a/docs/how_to/extend_tvm/use_pass_infra.html b/docs/how_to/extend_tvm/use_pass_infra.html
index bfba3ba7e..546937219 100644
--- a/docs/how_to/extend_tvm/use_pass_infra.html
+++ b/docs/how_to/extend_tvm/use_pass_infra.html
@@ -354,7 +354,7 @@ a certain optimization and create an optimization pipeline for a Relay program.
 The same approach can be used for tir as well.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
 <span class="kn">import</span> <span class="nn">tvm.relay</span> <span class="k">as</span> <span class="nn">relay</span>
 </pre></div>
 </div>
@@ -572,7 +572,7 @@ customized pass.</p>
 <span class="k">class</span> <span class="nc">CustomPipeline</span><span class="p">:</span>
     <span class="sd">&quot;&quot;&quot;Simple test function to replace one argument to another.&quot;&quot;&quot;</span>
 
-    <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">multiplier</span><span class="p">):</span>
+    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">multiplier</span><span class="p">):</span>
         <span class="bp">self</span><span class="o">.</span><span class="n">multiplier</span> <span class="o">=</span> <span class="n">multiplier</span>
 
     <span class="c1"># This function can define a pass.</span>
diff --git a/docs/how_to/extend_tvm/use_pass_instrument.html b/docs/how_to/extend_tvm/use_pass_instrument.html
index b70abe750..cf97b2a87 100644
--- a/docs/how_to/extend_tvm/use_pass_instrument.html
+++ b/docs/how_to/extend_tvm/use_pass_instrument.html
@@ -349,10 +349,10 @@ but an extension mechanism is available via the <a class="reference internal" hr
 passes. Please also refer to the <a class="reference internal" href="../../arch/pass_infra.html#pass-infra"><span class="std std-ref">Pass Infrastructure</span></a>.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
 <span class="kn">import</span> <span class="nn">tvm.relay</span> <span class="k">as</span> <span class="nn">relay</span>
-<span class="kn">from</span> <span class="nn">tvm.relay.testing</span> <span class="k">import</span> <span class="n">resnet</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
-<span class="kn">from</span> <span class="nn">tvm.relay.build_module</span> <span class="k">import</span> <span class="n">bind_params_by_name</span>
-<span class="kn">from</span> <span class="nn">tvm.ir.instrument</span> <span class="k">import</span> <span class="p">(</span>
+<span class="kn">from</span> <span class="nn">tvm.relay.testing</span> <span class="kn">import</span> <span class="n">resnet</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">tvm.relay.build_module</span> <span class="kn">import</span> <span class="n">bind_params_by_name</span>
+<span class="kn">from</span> <span class="nn">tvm.ir.instrument</span> <span class="kn">import</span> <span class="p">(</span>
     <span class="n">PassTimingInstrument</span><span class="p">,</span>
     <span class="n">pass_instrument</span><span class="p">,</span>
 <span class="p">)</span>
@@ -486,10 +486,10 @@ profile the execution time of each pass.</p>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 6040us [6040us] (45.42%; 45.42%)
-FoldScaleAxis: 7257us [3us] (54.58%; 54.58%)
-        FoldConstant: 7255us [1471us] (54.56%; 99.96%)
-                InferType: 5784us [5784us] (43.50%; 79.73%)
+InferType: 6481us [6481us] (45.91%; 45.91%)
+FoldScaleAxis: 7636us [2us] (54.09%; 54.09%)
+        FoldConstant: 7634us [1570us] (54.07%; 99.97%)
+                InferType: 6064us [6064us] (42.95%; 79.43%)
 </pre></div>
 </div>
 </div>
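
The profile above is produced by PassTimingInstrument. A minimal sketch,
assuming a Relay module `relay_mod` (e.g. built from tvm.relay.testing.resnet):

    import tvm
    import tvm.relay as relay
    from tvm.ir.instrument import PassTimingInstrument

    timing_inst = PassTimingInstrument()
    with tvm.transform.PassContext(instruments=[timing_inst]):
        relay_mod = relay.transform.InferType()(relay_mod)
        relay_mod = relay.transform.FoldScaleAxis()(relay_mod)
        # render() must be called while the PassContext is still active
        profiles = timing_inst.render()
    print(profiles)
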
@@ -512,10 +512,10 @@ Refer to following sections and <a class="reference internal" href="../../refere
 </div>
 <p class="sphx-glr-script-out">Out:</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 5855us [5855us] (44.88%; 44.88%)
-FoldScaleAxis: 7191us [2us] (55.12%; 55.12%)
-        FoldConstant: 7189us [1515us] (55.11%; 99.98%)
-                InferType: 5675us [5675us] (43.50%; 78.93%)
+InferType: 6060us [6060us] (44.75%; 44.75%)
+FoldScaleAxis: 7481us [2us] (55.25%; 55.25%)
+        FoldConstant: 7479us [1534us] (55.23%; 99.97%)
+                InferType: 5945us [5945us] (43.91%; 79.49%)
 </pre></div>
 </div>
 <p>Register empty list to clear existing instruments.</p>
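
A sketch of that call, assuming the current PassContext is the one carrying
the instruments:

    import tvm

    cur_pass_ctx = tvm.transform.PassContext.current()
    # Passing an empty list removes every registered instrument
    cur_pass_ctx.override_instruments([])
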
@@ -537,7 +537,7 @@ occurrences of each operator caused by each pass. We can look at <code class="do
 find the name of each operator. We do this before and after each pass to calculate the difference.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nd">@pass_instrument</span>
 <span class="k">class</span> <span class="nc">RelayCallNodeDiffer</span><span class="p">:</span>
-    <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
         <span class="bp">self</span><span class="o">.</span><span class="n">_op_diff</span> <span class="o">=</span> <span class="p">[]</span>
         <span class="c1"># Passes can be nested.</span>
         <span class="c1"># Use stack to make sure we get correct before/after pairs.</span>
@@ -640,7 +640,7 @@ profile result.</p>
 </pre></div>
 </div>
 <p>We can see how the number of CallNodes for each op type increases or decreases with each pass.</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pprint</span> <span class="k">import</span> <span class="n">pprint</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pprint</span> <span class="kn">import</span> <span class="n">pprint</span>
 
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Printing the change in number of occurrences of each operator caused by each pass...&quot;</span><span class="p">)</span>
 <span class="n">pprint</span><span class="p">(</span><span class="n">call_node_inst</span><span class="o">.</span><span class="n">get_pass_to_op_diff</span><span class="p">())</span>
@@ -660,7 +660,7 @@ profile result.</p>
 <p>Let’s see what happens if an exception occurs in a method of a <code class="docutils literal notranslate"><span class="pre">PassInstrument</span></code>.</p>
 <p>Define <code class="docutils literal notranslate"><span class="pre">PassInstrument</span></code> classes which raise exceptions in enter/exit <code class="docutils literal notranslate"><span class="pre">PassContext</span></code>:</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">class</span> <span class="nc">PassExampleBase</span><span class="p">:</span>
-    <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">):</span>
+    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">):</span>
         <span class="bp">self</span><span class="o">.</span><span class="n">_name</span> <span class="o">=</span> <span class="n">name</span>
 
     <span class="k">def</span> <span class="nf">enter_pass_ctx</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
diff --git a/docs/how_to/optimize_operators/opt_conv_cuda.html b/docs/how_to/optimize_operators/opt_conv_cuda.html
index 828b0ab87..404a08ac4 100644
--- a/docs/how_to/optimize_operators/opt_conv_cuda.html
+++ b/docs/how_to/optimize_operators/opt_conv_cuda.html
@@ -352,7 +352,7 @@ of size 3 x 3.  We use stride size 1 and padding size 1 for the
 convolution. The following code defines the convolution algorithm in TVM.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
 
 <span class="c1"># The sizes of inputs and filters</span>
 <span class="n">batch</span> <span class="o">=</span> <span class="mi">256</span>
@@ -534,7 +534,7 @@ latency of convolution.</p>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 37.126799 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 37.440570 ms
 </pre></div>
 </div>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-optimize-operators-opt-conv-cuda-py">
diff --git a/docs/how_to/optimize_operators/opt_conv_tensorcore.html b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
index fb8c064dc..671eb5799 100644
--- a/docs/how_to/optimize_operators/opt_conv_tensorcore.html
+++ b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
@@ -364,9 +364,9 @@ The batch size is 256. Convolution filters contain 512 filters of size 3 x 3.
 We use stride size 1 and padding size 1 for the convolution. In the example, we use
 NHWCnc memory layout. The following code defines the convolution algorithm in TVM.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">nvcc</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">nvcc</span>
 
 <span class="c1"># The sizes of inputs and filters</span>
 <span class="n">batch_size</span> <span class="o">=</span> <span class="mi">256</span>
@@ -878,7 +878,7 @@ be able to run on our build server</p>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 9.389606 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 7.454823 ms
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/optimize_operators/opt_gemm.html b/docs/how_to/optimize_operators/opt_gemm.html
index f72960d56..8f7c10dff 100644
--- a/docs/how_to/optimize_operators/opt_gemm.html
+++ b/docs/how_to/optimize_operators/opt_gemm.html
@@ -371,7 +371,7 @@ Before actually demonstrating, we first define these variables.
 Then we write a baseline implementation, the simplest way to write a matrix multiplication in TVM.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
 <span class="kn">import</span> <span class="nn">tvm.testing</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
 <span class="kn">import</span> <span class="nn">numpy</span>
 <span class="kn">import</span> <span class="nn">timeit</span>
 
@@ -431,8 +431,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018728
-Baseline: 3.458604
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.019117
+Baseline: 3.342248
 </pre></div>
 </div>
 <p>In TVM, we can always inspect lower level IR to debug or optimize our schedule.
@@ -494,7 +494,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.297369
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.301089
 </pre></div>
 </div>
 <p>Here is the generated IR after blocking.</p>
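
The blocking that produces this IR tiles C into 32x32 blocks, so each block
fills 4KB of the 32KB cache, and splits the reduction axis. A sketch under the
tutorial's assumed 1024x1024 matrix sizes:

    import tvm
    from tvm import te

    M = K = N = 1024
    k = te.reduce_axis((0, K), "k")
    A = te.placeholder((M, K), name="A")
    B = te.placeholder((K, N), name="B")
    C = te.compute((M, N), lambda x, y: te.sum(A[x, k] * B[k, y], axis=k), name="C")

    s = te.create_schedule(C.op)
    bn = 32  # block size: a 32x32 float block is 4KB, well within 32KB L1
    xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
    (kaxis,) = s[C].op.reduce_axis
    ko, ki = s[C].split(kaxis, factor=4)
    s[C].reorder(xo, yo, ko, ki, xi, yi)
    func = tvm.build(s, [A, B, C], target="llvm", name="mmult")
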
@@ -563,7 +563,7 @@ vastly.</p>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.337631
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.333339
 </pre></div>
 </div>
 <p>Here is the generated IR after vectorization.</p>
@@ -626,7 +626,7 @@ the access pattern for A matrix is more cache friendly.</p>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.117098
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.117179
 </pre></div>
 </div>
 <p>Here is the generated IR after loop permutation.</p>
@@ -711,7 +711,7 @@ flattening.</p>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.111210
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.110986
 </pre></div>
 </div>
 <p>Here is the generated IR after array packing.</p>
@@ -799,7 +799,7 @@ write to C when all the block results are ready.</p>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111095
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111551
 </pre></div>
 </div>
 <p>Here is the generated IR after blocking.</p>
@@ -891,7 +891,7 @@ write to C when all the block results are ready.</p>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.144609
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.144950
 </pre></div>
 </div>
 <p>Here is the generated IR after parallelization.</p>
diff --git a/docs/how_to/optimize_operators/sg_execution_times.html b/docs/how_to/optimize_operators/sg_execution_times.html
index 8f686f2e0..eed513a6d 100644
--- a/docs/how_to/optimize_operators/sg_execution_times.html
+++ b/docs/how_to/optimize_operators/sg_execution_times.html
@@ -300,11 +300,11 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-optimize-operators-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:35.150</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
+<p><strong>00:34.790</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
 <ul class="simple">
-<li><p><strong>00:32.478</strong>: <a class="reference internal" href="opt_gemm.html#sphx-glr-how-to-optimize-operators-opt-gemm-py"><span class="std std-ref">How to optimize GEMM on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_gemm.py</span></code>)</p></li>
-<li><p><strong>00:01.472</strong>: <a class="reference internal" href="opt_conv_tensorcore.html#sphx-glr-how-to-optimize-operators-opt-conv-tensorcore-py"><span class="std std-ref">How to optimize convolution using TensorCores</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_tensorcore.py</span></code>)</p></li>
-<li><p><strong>00:01.201</strong>: <a class="reference internal" href="opt_conv_cuda.html#sphx-glr-how-to-optimize-operators-opt-conv-cuda-py"><span class="std std-ref">How to optimize convolution on GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_cuda.py</span></code>)</p></li>
+<li><p><strong>00:32.204</strong>: <a class="reference internal" href="opt_gemm.html#sphx-glr-how-to-optimize-operators-opt-gemm-py"><span class="std std-ref">How to optimize GEMM on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_gemm.py</span></code>)</p></li>
+<li><p><strong>00:01.398</strong>: <a class="reference internal" href="opt_conv_tensorcore.html#sphx-glr-how-to-optimize-operators-opt-conv-tensorcore-py"><span class="std std-ref">How to optimize convolution using TensorCores</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_tensorcore.py</span></code>)</p></li>
+<li><p><strong>00:01.188</strong>: <a class="reference internal" href="opt_conv_cuda.html#sphx-glr-how-to-optimize-operators-opt-conv-cuda-py"><span class="std std-ref">How to optimize convolution on GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_cuda.py</span></code>)</p></li>
 </ul>
 </div>
 
diff --git a/docs/how_to/profile/papi.html b/docs/how_to/profile/papi.html
index 27a05d6df..8fb9dc94c 100644
--- a/docs/how_to/profile/papi.html
+++ b/docs/how_to/profile/papi.html
@@ -371,7 +371,7 @@ is an example:</p>
     <span class="n">func_name</span><span class="o">=</span><span class="s2">&quot;main&quot;</span><span class="p">,</span>
     <span class="n">collectors</span><span class="o">=</span><span class="p">[</span><span class="n">tvm</span><span class="o">.</span><span class="n">runtime</span><span class="o">.</span><span class="n">profiling</span><span class="o">.</span><span class="n">PAPIMetricCollector</span><span class="p">()],</span>
 <span class="p">)</span>
-<span class="k">print</span><span class="p">(</span><span class="n">report</span><span class="p">)</span>
+<span class="nb">print</span><span class="p">(</span><span class="n">report</span><span class="p">)</span>
 </pre></div>
 </div>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">Name</span>                                    <span class="n">perf</span><span class="p">::</span><span class="n">CACHE</span><span class="o">-</span><span class="n">MISSES</span>   <span class="n">perf</span><span class="p">::</span><span class="n">CYCLES</span>  <span class="n">perf</span><span class="p">::</span><span class="n">STALLED</span><span class="o">-</span><span class="n">CYCL [...]
diff --git a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
index 48051bf99..41c889a51 100644
--- a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
+++ b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
@@ -300,14 +300,14 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-tune-with-autoscheduler-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:06.155</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
+<p><strong>05:02.754</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
 <ul class="simple">
-<li><p><strong>02:21.677</strong>: <a class="reference internal" href="tune_conv2d_layer_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py"><span class="std std-ref">Auto-scheduling a Convolution Layer for GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_layer_cuda.py</span></code>)</p></li>
-<li><p><strong>01:20.086</strong>: <a class="reference internal" href="tune_network_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-x86-py"><span class="std std-ref">Auto-scheduling a Neural Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_x86.py</span></code>)</p></li>
-<li><p><strong>00:40.448</strong>: <a class="reference internal" href="tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py"><span class="std std-ref">Auto-scheduling a Neural Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_cuda.py</span></code>)</p></li>
-<li><p><strong>00:26.771</strong>: <a class="reference internal" href="tune_sparse_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-sparse-x86-py"><span class="std std-ref">Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_sparse_x86.py</span></code>)</p></li>
-<li><p><strong>00:08.608</strong>: <a class="reference internal" href="tune_network_mali.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-mali-py"><span class="std std-ref">Auto-scheduling a Neural Network for mali GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_mali.py</span></code>)</p></li>
-<li><p><strong>00:08.564</strong>: <a class="reference internal" href="tune_network_arm.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-arm-py"><span class="std std-ref">Auto-scheduling a Neural Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_arm.py</span></code>)</p></li>
+<li><p><strong>02:29.207</strong>: <a class="reference internal" href="tune_conv2d_layer_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py"><span class="std std-ref">Auto-scheduling a Convolution Layer for GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_layer_cuda.py</span></code>)</p></li>
+<li><p><strong>01:19.500</strong>: <a class="reference internal" href="tune_network_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-x86-py"><span class="std std-ref">Auto-scheduling a Neural Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_x86.py</span></code>)</p></li>
+<li><p><strong>00:40.516</strong>: <a class="reference internal" href="tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py"><span class="std std-ref">Auto-scheduling a Neural Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_cuda.py</span></code>)</p></li>
+<li><p><strong>00:16.464</strong>: <a class="reference internal" href="tune_sparse_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-sparse-x86-py"><span class="std std-ref">Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_sparse_x86.py</span></code>)</p></li>
+<li><p><strong>00:08.597</strong>: <a class="reference internal" href="tune_network_mali.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-mali-py"><span class="std std-ref">Auto-scheduling a Neural Network for mali GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_mali.py</span></code>)</p></li>
+<li><p><strong>00:08.472</strong>: <a class="reference internal" href="tune_network_arm.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-arm-py"><span class="std std-ref">Auto-scheduling a Neural Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_arm.py</span></code>)</p></li>
 </ul>
 </div>
 
diff --git a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
index 8302795c1..b6f00c0f5 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
@@ -354,8 +354,8 @@ get it to run, you will need to wrap the body of this tutorial in a <code class=
 
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span><span class="p">,</span> <span class="n">auto_scheduler</span><span class="p">,</span> <span class="n">topi</span>
-<span class="kn">from</span> <span class="nn">tvm.topi.testing</span> <span class="k">import</span> <span class="n">conv2d_nchw_python</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span><span class="p">,</span> <span class="n">auto_scheduler</span><span class="p">,</span> <span class="n">topi</span>
+<span class="kn">from</span> <span class="nn">tvm.topi.testing</span> <span class="kn">import</span> <span class="n">conv2d_nchw_python</span>
 </pre></div>
 </div>
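
These imports feed the workload registration that follows. A sketch of the
task definition, with the conv2d sizes assumed from the tutorial (N=1, 7x7
spatial, 512 input and output channels, 3x3 kernel):

    import tvm
    from tvm import te, auto_scheduler, topi

    @auto_scheduler.register_workload
    def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding):
        data = te.placeholder((N, CI, H, W), name="data")
        kernel = te.placeholder((CO, CI, KH, KW), name="kernel")
        conv = topi.nn.conv2d_nchw(
            data, kernel, stride, padding, dilation=1, out_dtype="float32"
        )
        return [data, kernel, conv]

    target = tvm.target.Target("cuda")
    task = auto_scheduler.SearchTask(
        func=conv2d_layer,
        args=(1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)),
        target=target,
    )
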
 <div class="section" id="define-the-computation">
@@ -450,7 +450,7 @@ file and apply it.</p>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>.T
 </pre></div>
 </div>
 <p>We can lower the schedule to see the IR after auto-scheduling.
@@ -470,45 +470,483 @@ cooperative fetching, unrolling and operator fusion.</p>
              compute: Buffer(compute_2: Pointer(float32), float32, [25088], [])}
   buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute}
   preflattened_buffer_map = {data_1: data_3: Buffer(data_2, float32, [1, 512, 7, 7], []), kernel_1: kernel_3: Buffer(kernel_2, float32, [512, 512, 3, 3], []), bias_1: bias_3: Buffer(bias_2, float32, [1, 512, 1, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [1, 512, 7, 7], [])} {
-  attr [IterVar(blockIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;blockIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-  allocate(conv2d_nchw: Pointer(local float32), float32, [8]), storage_scope = local;
-  allocate(pad_temp.shared: Pointer(shared float32), float32, [6272]), storage_scope = shared;
-  allocate(kernel.shared: Pointer(shared float32), float32, [1024]), storage_scope = shared;
-  attr [IterVar(threadIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49 {
-    for (ff.outer.inner.init: int32, 0, 4) {
-      for (ff.inner.init: int32, 0, 2) {
-        conv2d_nchw_1: Buffer(conv2d_nchw, float32, [8], [], scope=&quot;local&quot;, align=32)[((ff.outer.inner.init*2) + ff.inner.init)] = 0f32
-      }
-    }
-    for (rc.outer.outer: int32, 0, 4) {
+  attr [IterVar(blockIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;blockIdx.x&quot;)] &quot;thread_extent&quot; = 28;
+  allocate(conv2d_nchw: Pointer(local float32), float32, [14]), storage_scope = local;
+  allocate(pad_temp.shared: Pointer(shared float32), float32, [72]), storage_scope = shared;
+  allocate(kernel.shared: Pointer(shared float32), float32, [3072]), storage_scope = shared;
+  attr [IterVar(threadIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64 {
+    conv2d_nchw_1: Buffer(conv2d_nchw, float32, [14], [], scope=&quot;local&quot;, align=32)[0] = 0f32
+    conv2d_nchw_1[1] = 0f32
+    conv2d_nchw_1[2] = 0f32
+    conv2d_nchw_1[3] = 0f32
+    conv2d_nchw_1[4] = 0f32
+    conv2d_nchw_1[5] = 0f32
+    conv2d_nchw_1[6] = 0f32
+    conv2d_nchw_1[7] = 0f32
+    conv2d_nchw_1[8] = 0f32
+    conv2d_nchw_1[9] = 0f32
+    conv2d_nchw_1[10] = 0f32
+    conv2d_nchw_1[11] = 0f32
+    conv2d_nchw_1[12] = 0f32
+    conv2d_nchw_1[13] = 0f32
+    for (rc.outer.outer: int32, 0, 64) {
       for (ry.outer.outer: int32, 0, 3) {
-        for (rx.outer.outer: int32, 0, 3) {
-          for (ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer: int32, 0, 128) {
-            let cse_var_1: int32 = (ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer*49)
-            attr [IterVar(threadIdx.x_1: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
-            pad_temp.shared_1: Buffer(pad_temp.shared, float32, [6272], [], scope=&quot;shared&quot;)[(cse_var_1 + threadIdx.x_1)] = @tir.if_then_else(((((1 &lt;= (floordiv(threadIdx.x_1, 7) + ry.outer.outer)) &amp;&amp; ((floordiv(threadIdx.x_1, 7) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) &amp;&amp; ((rx.outer.outer + floormod(threadIdx.x_1, 7)) &lt; 8)), data[((((((rc.outer.outer*6272) + cse_var_1) + (ry.outer.outer*7)) + rx.outer.o [...]
-          }
-          for (ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer_1: int32, 0, 21) {
-            attr [IterVar(threadIdx.x_2: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 49;
-            if @tir.likely((((ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer_1*49) + threadIdx.x_2) &lt; 1024), dtype=bool) {
-              kernel.shared_1: Buffer(kernel.shared, float32, [1024], [], scope=&quot;shared&quot;)[((ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer_1*49) + threadIdx.x_2)] = kernel[((((((blockIdx.x*36864) + (floordiv(((ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer_1*49) + threadIdx.x_2), 128)*4608)) + (rc.outer.outer*1152)) + (floormod(((ax0.ax1.fused.ax2.fused.ax3.fused.outer.outer_1*49) + threadIdx.x_2), 128)*9)) + (ry.outer.outer*3)) + rx.outer.outer)]
+        let cse_var_2: int32 = (rc.outer.outer*72)
+        let cse_var_1: int32 = (ry.outer.outer*3)
+         {
+          attr [IterVar(threadIdx.x_1: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64 {
+            if @tir.likely((threadIdx.x_1 &lt; 18), dtype=bool) {
+              pad_temp.shared_1: Buffer(pad_temp.shared, float32, [72], [], scope=&quot;shared&quot;)[(threadIdx.x_1*4)] = @tir.if_then_else(((((1 &lt;= (ry.outer.outer + floormod(blockIdx.x, 7))) &amp;&amp; ((ry.outer.outer + floormod(blockIdx.x, 7)) &lt; 8)) &amp;&amp; (1 &lt;= floormod((threadIdx.x_1*4), 9))) &amp;&amp; (floormod((threadIdx.x_1*4), 9) &lt; 8)), data[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1*4), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) +  [...]
             }
-          }
-          for (rc.outer.inner: int32, 0, 2) {
-            for (ff.outer.inner: int32, 0, 4) {
-              for (rc.inner: int32, 0, 64) {
-                for (ff.inner: int32, 0, 2) {
-                  let cse_var_2: int32 = ((ff.outer.inner*2) + ff.inner)
-                  conv2d_nchw_1[cse_var_2] = (conv2d_nchw_1[cse_var_2] + (pad_temp.shared_1[(((rc.outer.inner*3136) + (rc.inner*49)) + threadIdx.x)]*kernel.shared_1[((((ff.outer.inner*256) + (ff.inner*128)) + (rc.outer.inner*64)) + rc.inner)]))
-                }
-              }
+            if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
+              pad_temp.shared_1[((threadIdx.x_1*4) + 1)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 1), 9))) && (floormod(((threadIdx.x_1*4) + 1), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 1), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 1), 9)) - 8)], 0 [...]
+            }
+            if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
+              pad_temp.shared_1[((threadIdx.x_1*4) + 2)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 2), 9))) && (floormod(((threadIdx.x_1*4) + 2), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 2), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 2), 9)) - 8)], 0 [...]
+            }
+            if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
+              pad_temp.shared_1[((threadIdx.x_1*4) + 3)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 3), 9))) && (floormod(((threadIdx.x_1*4) + 3), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 3), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 3), 9)) - 8)], 0 [...]
             }
           }
+          attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1: Buffer(kernel.shared, float32, [3072], [], scope="shared")[threadIdx.x_2] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 64)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 8), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 64), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 128)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 16), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 128), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 192)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 36864)]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 256)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 32), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 256), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 320)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 40), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 320), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 384)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 73728)]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 448)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 56), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 448), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 512)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 64), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 512), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 576)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 110592)]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 640)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 80), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 640), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 704)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 88), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 704), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 768)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 147456)]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 832)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 104), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 832), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 896)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 112), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 896), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 960)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 184320)]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 1024)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 128), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1024), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 1088)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 136), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1088), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 1152)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 221184)]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 1216)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 152), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1216), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 1280)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 160), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1280), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 258048)]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 1408)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 176), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1408), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 1472)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 184), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1472), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 1536)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 294912)]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 1600)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 200), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1600), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 1664)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 208), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1664), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 1728)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 331776)]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 224), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1792), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 1856)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 232), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1856), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 1920)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 368640)]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 1984)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 248), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 1984), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 2048)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 256), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2048), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 2112)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 405504)]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 2176)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 272), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2176), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 280), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2240), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 2304)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 442368)]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 2368)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 296), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2368), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 2432)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 304), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2432), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 2496)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 479232)]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 2560)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 320), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2560), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 2624)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 328), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2624), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 516096)]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 2752)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 344), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2752), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 2816)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 352), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2816), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 2880)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(floordiv(threadIdx.x_2, 8), 3)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 552960)]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 2944)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 368), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 2944), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
+          kernel.shared_1[(threadIdx.x_2 + 3008)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((floordiv(threadIdx.x_2, 8) + 376), 3)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 3008), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[0]*kernel.shared_1[(threadIdx.x*48)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[1]*kernel.shared_1[(threadIdx.x*48)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[2]*kernel.shared_1[(threadIdx.x*48)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[3]*kernel.shared_1[(threadIdx.x*48)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[4]*kernel.shared_1[(threadIdx.x*48)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[5]*kernel.shared_1[(threadIdx.x*48)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[6]*kernel.shared_1[(threadIdx.x*48)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 3)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[0]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 24)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 27)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 1)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 4)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 25)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 28)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 2)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 5)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 26)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 29)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 6)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 9)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 30)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 33)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 7)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 10)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 31)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 34)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 8)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 11)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 32)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 35)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 12)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 15)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 36)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 39)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 13)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 16)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 37)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 40)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 14)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 17)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 38)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 41)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 18)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 21)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 42)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 45)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 19)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 22)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 43)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 46)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 20)]))
+          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 23)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 47)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 44)]))
+          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 47)]))
         }
       }
     }
-    for (i1.inner: int32, 0, 8) {
-      compute[(((blockIdx.x*392) + (i1.inner*49)) + threadIdx.x)] = max((conv2d_nchw_1[i1.inner] + bias[((blockIdx.x*8) + i1.inner)]), 0f32)
+    for (i1.inner: int32, 0, 2) {
+      for (i3.inner: int32, 0, 7) {
+        compute[(((((floordiv(blockIdx.x, 7)*6272) + (threadIdx.x*98)) + (i1.inner*49)) + (floormod(blockIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[((i1.inner*7) + i3.inner)] + bias[(((floordiv(blockIdx.x, 7)*128) + (threadIdx.x*2)) + i1.inner)]), 0f32)
+      }
     }
   }
 }
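The store at the end of the kernel above fuses the bias add and the ReLU into a single epilogue: each of the 14 per-thread accumulators is written out as max(conv + bias, 0). A minimal NumPy sketch of that fused epilogue semantics (shapes match this tutorial's 1x512x7x7 output; array names are illustrative, not from the generated code):

    import numpy as np

    conv = np.random.randn(1, 512, 7, 7).astype("float32")  # conv2d accumulators
    bias = np.random.randn(512).astype("float32")            # per-channel bias

    # fused bias-add + ReLU, the same computation as
    # compute[...] = max(conv2d_nchw[...] + bias[...], 0f32) in the TIR above
    out = np.maximum(conv + bias.reshape(1, -1, 1, 1), 0.0)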
@@ -546,7 +984,7 @@ cooperative fetching, unrolling and operator fusion.</p>
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.374 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.361 ms
 </pre></div>
 </div>
 </div>
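The timing above is produced with TVM's device-side evaluator; a minimal sketch of how such a number is typically measured (func, dev, and the tvm.nd.array buffers are assumed to exist from the surrounding tutorial):

    import numpy as np

    # func: the built tvm.runtime.Module, dev: the CUDA device,
    # data_tvm / weight_tvm / out_tvm: tvm.nd.array buffers (assumed from context)
    evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
    print(
        "Execution time of this operator: %.3f ms"
        % (np.median(evaluator(data_tvm, weight_tvm, out_tvm).results) * 1000)
    )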
@@ -576,36 +1014,36 @@ conv2d_nchw_nn_o_i, conv2d_nchw_nn_i = s[conv2d_nchw].split(conv2d_nchw_nn, fact
 conv2d_nchw_nn_o_o_i, conv2d_nchw_nn_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_i, factor=1)
 conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
 conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
-conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=2)
-conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=4)
-conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=1)
+conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
+conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
+conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=64)
 conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
 conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
 conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
-conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
+conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
 conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
 conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
-conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
-conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
+conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
+conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
 conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
-conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=64)
-conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=2)
+conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
+conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=4)
 conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
 conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
 conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
-conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=1)
+conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=3)
 s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2d_nc [...]
 compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
 compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
 compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
-compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=8)
-compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=1)
+compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
+compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
 compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
 compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
-compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
+compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
 compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
-compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
+compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
+compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
 compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
 s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
 s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -625,14 +1063,14 @@ s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread
 kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
 kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
 s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=49)
+kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
 s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis(&quot;threadIdx.x&quot;))
 pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
 s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=49)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
 s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis(&quot;threadIdx.x&quot;))
-s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, &quot;auto_unroll_max_step&quot;, 0)
+s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, &quot;auto_unroll_max_step&quot;, 512)
 s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, &quot;unroll_explicit&quot;, True)
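Each split in the schedule above divides one loop axis into an outer and an inner loop with the given inner extent, and the subsequent reorder/bind calls map those loops onto the GPU grid; the factor=64 splits are what become the 64-thread blocks in the generated kernel below. A standalone sketch of the same split-and-bind primitives on a toy elementwise op (shapes and names are illustrative, not from this workload):

    import tvm
    from tvm import te

    n = 1024
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")

    s = te.create_schedule(B.op)
    # peel i into 16 blocks of 64 threads: i = i_outer * 64 + i_inner
    i_outer, i_inner = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(i_outer, te.thread_axis("blockIdx.x"))
    s[B].bind(i_inner, te.thread_axis("threadIdx.x"))
    print(tvm.lower(s, [A, B], simple_mode=True))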
 
 CUDA source code:
@@ -650,42 +1088,430 @@ CUDA source code:
   #define int64_t long long
   #define uint64_t unsigned long long
 #endif
-extern &quot;C&quot; __global__ void __launch_bounds__(49) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
-  float conv2d_nchw[8];
-  __shared__ float pad_temp_shared[6272];
-  __shared__ float kernel_shared[1024];
-  for (int ff_outer_inner_init = 0; ff_outer_inner_init &lt; 4; ++ff_outer_inner_init) {
-    for (int ff_inner_init = 0; ff_inner_init &lt; 2; ++ff_inner_init) {
-      conv2d_nchw[((ff_outer_inner_init * 2) + ff_inner_init)] = 0.000000e+00f;
-    }
-  }
-  for (int rc_outer_outer = 0; rc_outer_outer &lt; 4; ++rc_outer_outer) {
+extern &quot;C&quot; __global__ void __launch_bounds__(64) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+  float conv2d_nchw[14];
+  __shared__ float pad_temp_shared[72];
+  __shared__ float kernel_shared[3072];
+  conv2d_nchw[0] = 0.000000e+00f;
+  conv2d_nchw[1] = 0.000000e+00f;
+  conv2d_nchw[2] = 0.000000e+00f;
+  conv2d_nchw[3] = 0.000000e+00f;
+  conv2d_nchw[4] = 0.000000e+00f;
+  conv2d_nchw[5] = 0.000000e+00f;
+  conv2d_nchw[6] = 0.000000e+00f;
+  conv2d_nchw[7] = 0.000000e+00f;
+  conv2d_nchw[8] = 0.000000e+00f;
+  conv2d_nchw[9] = 0.000000e+00f;
+  conv2d_nchw[10] = 0.000000e+00f;
+  conv2d_nchw[11] = 0.000000e+00f;
+  conv2d_nchw[12] = 0.000000e+00f;
+  conv2d_nchw[13] = 0.000000e+00f;
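+  // 14 accumulators per thread: 2 output channels x 7 output-column positions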
+  for (int rc_outer_outer = 0; rc_outer_outer &lt; 64; ++rc_outer_outer) {
     for (int ry_outer_outer = 0; ry_outer_outer &lt; 3; ++ry_outer_outer) {
-      for (int rx_outer_outer = 0; rx_outer_outer &lt; 3; ++rx_outer_outer) {
-        __syncthreads();
-        for (int ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = 0; ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer &lt; 128; ++ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer) {
-          pad_temp_shared[((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49) + ((int)threadIdx.x))] = (((((1 &lt;= ((((int)threadIdx.x) / 7) + ry_outer_outer)) &amp;&amp; (((((int)threadIdx.x) / 7) + ry_outer_outer) &lt; 8)) &amp;&amp; (1 &lt;= (rx_outer_outer + (((int)threadIdx.x) % 7)))) &amp;&amp; ((rx_outer_outer + (((int)threadIdx.x) % 7)) &lt; 8)) ? data[((((((rc_outer_outer * 6272) + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 49)) + (ry_outer_outer * 7)) + rx_outer_outer) [...]
-        }
-        for (int ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer1 = 0; ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer1 &lt; 21; ++ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer1) {
-          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer1 * 49) + ((int)threadIdx.x)) &lt; 1024) {
-            kernel_shared[((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer1 * 49) + ((int)threadIdx.x))] = kernel[((((((((int)blockIdx.x) * 36864) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer1 * 49) + ((int)threadIdx.x)) &gt;&gt; 7) * 4608)) + (rc_outer_outer * 1152)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer1 * 49) + ((int)threadIdx.x)) &amp; 127) * 9)) + (ry_outer_outer * 3)) + rx_outer_outer)];
-          }
-        }
-        __syncthreads();
-        for (int rc_outer_inner = 0; rc_outer_inner &lt; 2; ++rc_outer_inner) {
-          for (int ff_outer_inner = 0; ff_outer_inner &lt; 4; ++ff_outer_inner) {
-            for (int rc_inner = 0; rc_inner &lt; 64; ++rc_inner) {
-              for (int ff_inner = 0; ff_inner &lt; 2; ++ff_inner) {
-                conv2d_nchw[((ff_outer_inner * 2) + ff_inner)] = (conv2d_nchw[((ff_outer_inner * 2) + ff_inner)] + (pad_temp_shared[(((rc_outer_inner * 3136) + (rc_inner * 49)) + ((int)threadIdx.x))] * kernel_shared[((((ff_outer_inner * 256) + (ff_inner * 128)) + (rc_outer_inner * 64)) + rc_inner)]));
-              }
-            }
-          }
-        }
+      __syncthreads();
+      if (((int)threadIdx.x) &lt; 18) {
+        pad_temp_shared[(((int)threadIdx.x) * 4)] = (((((1 &lt;= (ry_outer_outer + (((int)blockIdx.x) % 7))) &amp;&amp; ((ry_outer_outer + (((int)blockIdx.x) % 7)) &lt; 8)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) * 4) % 9))) &amp;&amp; (((((int)threadIdx.x) * 4) % 9) &lt; 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) * 4) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) * 4) % 9)) - 8)] : 0.000000e+00f);
       }
+      if (((int)threadIdx.x) &lt; 18) {
+        pad_temp_shared[((((int)threadIdx.x) * 4) + 1)] = (((((1 &lt;= (ry_outer_outer + (((int)blockIdx.x) % 7))) &amp;&amp; ((ry_outer_outer + (((int)blockIdx.x) % 7)) &lt; 8)) &amp;&amp; (1 &lt;= (((((int)threadIdx.x) * 4) + 1) % 9))) &amp;&amp; ((((((int)threadIdx.x) * 4) + 1) % 9) &lt; 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 1) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 1) % 9)) - 8)] : 0.000000e+00f);
+      }
+      if (((int)threadIdx.x) &lt; 18) {
+        pad_temp_shared[((((int)threadIdx.x) * 4) + 2)] = (((((1 &lt;= (ry_outer_outer + (((int)blockIdx.x) % 7))) &amp;&amp; ((ry_outer_outer + (((int)blockIdx.x) % 7)) &lt; 8)) &amp;&amp; (1 &lt;= (((((int)threadIdx.x) * 4) + 2) % 9))) &amp;&amp; ((((((int)threadIdx.x) * 4) + 2) % 9) &lt; 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 2) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 2) % 9)) - 8)] : 0.000000e+00f);
+      }
+      if (((int)threadIdx.x) &lt; 18) {
+        pad_temp_shared[((((int)threadIdx.x) * 4) + 3)] = (((((1 &lt;= (ry_outer_outer + (((int)blockIdx.x) % 7))) &amp;&amp; ((ry_outer_outer + (((int)blockIdx.x) % 7)) &lt; 8)) &amp;&amp; (1 &lt;= (((((int)threadIdx.x) * 4) + 3) % 9))) &amp;&amp; ((((((int)threadIdx.x) * 4) + 3) % 9) &lt; 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 3) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 3) % 9)) - 8)] : 0.000000e+00f);
+      }
+      kernel_shared[((int)threadIdx.x)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 64)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 64) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 128)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 128) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 192)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 36864)];
+      kernel_shared[(((int)threadIdx.x) + 256)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 256) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 320)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 320) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 384)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 73728)];
+      kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 512)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 512) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 576)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 110592)];
+      kernel_shared[(((int)threadIdx.x) + 640)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 640) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 704)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 704) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 768)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 147456)];
+      kernel_shared[(((int)threadIdx.x) + 832)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 832) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 960)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 184320)];
+      kernel_shared[(((int)threadIdx.x) + 1024)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1024) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 1088)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1088) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 1152)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 221184)];
+      kernel_shared[(((int)threadIdx.x) + 1216)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1216) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 1280)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1280) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 258048)];
+      kernel_shared[(((int)threadIdx.x) + 1408)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1408) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 1472)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1472) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 1536)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 294912)];
+      kernel_shared[(((int)threadIdx.x) + 1600)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1600) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 1664)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1664) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 1728)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 331776)];
+      kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 1856)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1856) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 1920)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 368640)];
+      kernel_shared[(((int)threadIdx.x) + 1984)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1984) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 2048)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2048) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 2112)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 405504)];
+      kernel_shared[(((int)threadIdx.x) + 2176)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2176) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 2304)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 442368)];
+      kernel_shared[(((int)threadIdx.x) + 2368)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2368) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 2432)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2432) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 2496)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 479232)];
+      kernel_shared[(((int)threadIdx.x) + 2560)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2560) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 2624)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2624) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 516096)];
+      kernel_shared[(((int)threadIdx.x) + 2752)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2752) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 2816)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2816) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 2880)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 552960)];
+      kernel_shared[(((int)threadIdx.x) + 2944)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2944) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 3008)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3008) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+      __syncthreads();
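+      // fully unrolled multiply-accumulate: 48 weights per thread
+      // (2 output channels x 8 input channels x 3 kernel columns) against the
+      // 8 x 9 padded input rows staged in shared memory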
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[0] * kernel_shared[(((int)threadIdx.x) * 48)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[1] * kernel_shared[(((int)threadIdx.x) * 48)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[2] * kernel_shared[(((int)threadIdx.x) * 48)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[3] * kernel_shared[(((int)threadIdx.x) * 48)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[4] * kernel_shared[(((int)threadIdx.x) * 48)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[5] * kernel_shared[(((int)threadIdx.x) * 48)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[6] * kernel_shared[(((int)threadIdx.x) * 48)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[0] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
+      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
+      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
     }
   }
-  for (int i1_inner = 0; i1_inner &lt; 8; ++i1_inner) {
-    compute[(((((int)blockIdx.x) * 392) + (i1_inner * 49)) + ((int)threadIdx.x))] = max((conv2d_nchw[i1_inner] + bias[((((int)blockIdx.x) * 8) + i1_inner)]), 0.000000e+00f);
+  for (int i1_inner = 0; i1_inner &lt; 2; ++i1_inner) {
+    for (int i3_inner = 0; i3_inner &lt; 7; ++i3_inner) {
+      compute[((((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 98)) + (i1_inner * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) * 2)) + i1_inner)]), 0.000000e+00f);
+    }
   }
 }
 </pre></div>
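For readers skimming the generated kernel above: the closing loop nest is just a bias-add plus ReLU epilogue over each thread's 2 x 7 block of accumulators. Below is a minimal NumPy sketch of that step, with illustrative shapes and without the blockIdx/threadIdx index arithmetic; the names conv2d_nchw, bias, and compute mirror the kernel.

    import numpy as np

    # Illustrative per-thread fragment: 2 output channels x 7 output columns.
    conv2d_nchw = np.random.randn(14).astype("float32")  # flat accumulators
    bias = np.random.randn(2).astype("float32")          # one bias per channel

    compute = np.empty((2, 7), dtype="float32")
    for i1_inner in range(2):        # output-channel index within the fragment
        for i3_inner in range(7):    # output-column index within the fragment
            compute[i1_inner, i3_inner] = max(
                conv2d_nchw[i1_inner * 7 + i3_inner] + bias[i1_inner], 0.0
            )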
@@ -723,7 +1549,7 @@ In the example below we resume the status and run 5 more trials.</p>
 Get devices for measurement successfully!
 </pre></div>
 </div>
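As a concrete illustration of resuming, the sketch below shows how 5 extra trials can typically be run against an existing record file with the auto_scheduler API; `task` is assumed to be the SearchTask defined earlier in the tutorial, and the log-file path is hypothetical.

    from tvm import auto_scheduler

    log_file = "conv2d.json"  # hypothetical path to the existing record file

    # Warm-start the cost model and the search policy from the saved records,
    # then run 5 more measurement trials, appending results to the same file.
    cost_model = auto_scheduler.XGBModel()
    cost_model.update_from_file(log_file)
    search_policy = auto_scheduler.SketchPolicy(
        task,
        cost_model,
        init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)],
    )
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=5,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    task.tune(tune_option, search_policy=search_policy)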
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  21.677 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  29.207 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py">
 <div class="sphx-glr-download docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/e3e540f3b477c0c52d8eb73e674e8ffd/tune_conv2d_layer_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_conv2d_layer_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_arm.html b/docs/how_to/tune_with_autoscheduler/tune_network_arm.html
index 87e9f0fca..aa5d04c66 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_arm.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_arm.html
@@ -365,11 +365,11 @@ get it to run, you will need to wrap the body of this tutorial in a <code class=
 <span class="kn">import</span> <span class="nn">os</span>
 
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">auto_scheduler</span>
-<span class="kn">from</span> <span class="nn">tvm.relay</span> <span class="k">import</span> <span class="n">data_dep_optimization</span> <span class="k">as</span> <span class="n">ddo</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">auto_scheduler</span>
+<span class="kn">from</span> <span class="nn">tvm.relay</span> <span class="kn">import</span> <span class="n">data_dep_optimization</span> <span class="k">as</span> <span class="n">ddo</span>
 <span class="kn">import</span> <span class="nn">tvm.relay.testing</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">graph_executor</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib.utils</span> <span class="k">import</span> <a href="../../reference/api/python/contrib.html#tvm.contrib.utils.tempdir" title="View documentation for tvm.contrib.utils.tempdir"><span class="n">tempdir</span></a>
+<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">graph_executor</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.utils</span> <span class="kn">import</span> <a href="../../reference/api/python/contrib.html#tvm.contrib.utils.tempdir" title="View documentation for tvm.contrib.utils.tempdir"><span class="n">tempdir</span></a>
 </pre></div>
 </div>
 <div class="section" id="define-a-network">
@@ -432,7 +432,7 @@ You can use <a class="reference internal" href="../../arch/convert_layout.html#c
         <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <a href="../../reference/api/python/relay/testing.html#tvm.relay.testing.inception_v3.get_workload" title="View documentation for tvm.relay.testing.inception_v3.get_workload"><span class="n">relay</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">inception_v3</span><span class="o">.</span><span class="n">get_workload</span></a [...]
     <span class="k">elif</span> <span class="n">name</span> <span class="o">==</span> <span class="s2">&quot;mxnet&quot;</span><span class="p">:</span>
         <span class="c1"># an example for mxnet model</span>
-        <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="k">import</span> <span class="n">get_model</span>
+        <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="kn">import</span> <span class="n">get_model</span>
 
         <span class="k">assert</span> <span class="n">layout</span> <span class="o">==</span> <span class="s2">&quot;NCHW&quot;</span>
 
@@ -451,7 +451,7 @@ You can use <a class="reference internal" href="../../arch/convert_layout.html#c
         <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Network not found.&quot;</span><span class="p">)</span>
 
     <span class="k">if</span> <span class="n">use_sparse</span><span class="p">:</span>
-        <span class="kn">from</span> <span class="nn">tvm.topi.sparse.utils</span> <span class="k">import</span> <span class="n">convert_model_dense_to_sparse</span>
+        <span class="kn">from</span> <span class="nn">tvm.topi.sparse.utils</span> <span class="kn">import</span> <span class="n">convert_model_dense_to_sparse</span>
 
         <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">convert_model_dense_to_sparse</span><span class="p">(</span><span class="n">mod</span><span class="p">,</span> <span class="n">params</span><span class="p">,</span> <span class="n">random_params</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
 
@@ -819,7 +819,7 @@ so we can read the log file and load the best schedules.</p>
     <span class="c1"># Export library</span>
     <span class="n">tmp</span> <span class="o">=</span> <a href="../../reference/api/python/contrib.html#tvm.contrib.utils.tempdir" title="View documentation for tvm.contrib.utils.tempdir"><span class="n">tempdir</span></a><span class="p">()</span>
     <span class="k">if</span> <span class="n">use_ndk</span><span class="p">:</span>
-        <span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">ndk</span>
+        <span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">ndk</span>
 
         <span class="n">filename</span> <span class="o">=</span> <span class="s2">&quot;net.so&quot;</span>
         <span class="n">lib</span><span class="o">.</span><span class="n">export_library</span><span class="p">(</span><span class="n">tmp</span><span class="o">.</span><span class="n">relpath</span><span class="p">(</span><span class="n">filename</span><span class="p">),</span> <a href="../../reference/api/python/contrib.html#tvm.contrib.ndk.create_shared" title="View documentation for tvm.contrib.ndk.create_shared"><span class="n">ndk</span><span class="o">.</span><span class="n">creat [...]
@@ -857,35 +857,35 @@ so we can read the log file and load the best schedules.</p>
 <p>During the tuning, a lot of information will be printed on the console.
 These messages are used for debugging. The most important part is the output
 of the task scheduler. The following table is a sample output.</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="o">----------------------------------------------------------------------</span>
-<span class="o">------------------------------</span>  <span class="p">[</span> <span class="n">Task</span> <span class="n">Scheduler</span> <span class="p">]</span>
-<span class="o">----------------------------------------------------------------------</span>
-<span class="o">|</span>  <span class="n">ID</span>  <span class="o">|</span> <span class="n">Latency</span> <span class="p">(</span><span class="n">ms</span><span class="p">)</span> <span class="o">|</span> <span class="n">Speed</span> <span class="p">(</span><span class="n">GFLOPS</span><span class="p">)</span> <span class="o">|</span> <span class="n">Trials</span> <span class="o">|</span>
-<span class="o">-------------------------------------------------</span>
-<span class="o">|</span>    <span class="mi">0</span> <span class="o">|</span>        <span class="mf">0.013</span> <span class="o">|</span>           <span class="mf">0.31</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">1</span> <span class="o">|</span>        <span class="mf">0.845</span> <span class="o">|</span>           <span class="mf">2.43</span> <span class="o">|</span>    <span class="mi">448</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">2</span> <span class="o">|</span>        <span class="mf">0.046</span> <span class="o">|</span>          <span class="o">-</span><span class="mf">0.00</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">3</span> <span class="o">|</span>        <span class="mf">4.194</span> <span class="o">|</span>          <span class="mf">24.53</span> <span class="o">|</span>   <span class="mi">2112</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">4</span> <span class="o">|</span>        <span class="mf">0.109</span> <span class="o">|</span>           <span class="mf">9.21</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">5</span> <span class="o">|</span>        <span class="mf">1.759</span> <span class="o">|</span>          <span class="mf">29.27</span> <span class="o">|</span>    <span class="mi">896</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">6</span> <span class="o">|</span>        <span class="mf">0.083</span> <span class="o">|</span>           <span class="mf">6.01</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">7</span> <span class="o">|</span>        <span class="mf">3.084</span> <span class="o">|</span>          <span class="mf">33.38</span> <span class="o">|</span>   <span class="mi">7680</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">8</span> <span class="o">|</span>        <span class="mf">0.136</span> <span class="o">|</span>          <span class="mf">14.78</span> <span class="o">|</span>    <span class="mi">384</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">9</span> <span class="o">|</span>        <span class="mf">1.349</span> <span class="o">|</span>          <span class="mf">38.23</span> <span class="o">|</span>    <span class="mi">768</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">10</span> <span class="o">|</span>        <span class="mf">0.133</span> <span class="o">|</span>           <span class="mf">7.55</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">11</span> <span class="o">|</span>        <span class="mf">2.747</span> <span class="o">|</span>          <span class="mf">37.56</span> <span class="o">|</span>   <span class="mi">1536</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">12</span> <span class="o">|</span>        <span class="mf">0.338</span> <span class="o">|</span>          <span class="mf">11.87</span> <span class="o">|</span>    <span class="mi">192</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">13</span> <span class="o">|</span>        <span class="mf">1.295</span> <span class="o">|</span>          <span class="mf">40.00</span> <span class="o">|</span>    <span class="mi">704</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">14</span> <span class="o">|</span>        <span class="mf">0.482</span> <span class="o">|</span>           <span class="mf">4.16</span> <span class="o">|</span>    <span class="mi">256</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">15</span> <span class="o">|</span>        <span class="mf">2.686</span> <span class="o">|</span>          <span class="mf">38.56</span> <span class="o">|</span>   <span class="mi">1344</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">16</span> <span class="o">|</span>        <span class="mf">0.884</span> <span class="o">|</span>           <span class="mf">9.08</span> <span class="o">|</span>    <span class="mi">448</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">17</span> <span class="o">|</span>        <span class="mf">1.332</span> <span class="o">|</span>          <span class="mf">39.18</span> <span class="o">|</span>    <span class="mi">704</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">18</span> <span class="o">|</span>        <span class="mf">1.045</span> <span class="o">|</span>           <span class="mf">3.84</span> <span class="o">|</span>    <span class="mi">576</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">19</span> <span class="o">|</span>        <span class="mf">1.391</span> <span class="o">|</span>          <span class="mf">38.09</span> <span class="o">|</span>    <span class="mi">704</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">20</span> <span class="o">|</span>        <span class="mf">0.777</span> <span class="o">|</span>          <span class="mf">10.34</span> <span class="o">|</span>    <span class="mi">448</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">21</span> <span class="o">|</span>        <span class="mf">0.739</span> <span class="o">|</span>          <span class="mf">30.97</span> <span class="o">|</span>    <span class="mi">448</span> <span class="o">|</span>
-<span class="o">-------------------------------------------------</span>
- <span class="n">Estimated</span> <span class="n">total</span> <span class="nl">latency</span><span class="p">:</span> <span class="mf">38.347</span> <span class="n">ms</span>      <span class="nl">Trials</span><span class="p">:</span> <span class="mi">19992</span>   <span class="n">Used</span> <span class="nl">time</span> <span class="p">:</span> <span class="mi">19260</span> <span class="n">s</span>     <span class="n">Next</span> <span class="nl">ID</span><span class="p">:</span> <spa [...]
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="o">----------------------------------------------------------------------</span><span class="w"></span>
+<span class="o">------------------------------</span><span class="w">  </span><span class="p">[</span><span class="w"> </span><span class="n">Task</span><span class="w"> </span><span class="n">Scheduler</span><span class="w"> </span><span class="p">]</span><span class="w"></span>
+<span class="o">----------------------------------------------------------------------</span><span class="w"></span>
+<span class="o">|</span><span class="w">  </span><span class="n">ID</span><span class="w">  </span><span class="o">|</span><span class="w"> </span><span class="n">Latency</span><span class="w"> </span><span class="p">(</span><span class="n">ms</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">Speed</span><span class="w"> </span><span class="p">(</span><span class="n">GFLOPS</span><span class="p">)</span><span class="w">  [...]
+<span class="o">-------------------------------------------------</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">0</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.013</span><span class="w"> </span><span class="o">|</span><span class="w">           </span><span class="mf">0.31</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">1</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.845</span><span class="w"> </span><span class="o">|</span><span class="w">           </span><span class="mf">2.43</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">448</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">2</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.046</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">-0.00</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">3</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">4.194</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">24.53</span><span class="w"> </span><span class="o">|</span><span class="w">   </span><span class="mi">2112</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">4</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.109</span><span class="w"> </span><span class="o">|</span><span class="w">           </span><span class="mf">9.21</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">5</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1.759</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">29.27</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">896</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">6</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.083</span><span class="w"> </span><span class="o">|</span><span class="w">           </span><span class="mf">6.01</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">7</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">3.084</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">33.38</span><span class="w"> </span><span class="o">|</span><span class="w">   </span><span class="mi">7680</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">8</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.136</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">14.78</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">384</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">9</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1.349</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">38.23</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">768</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">10</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.133</span><span class="w"> </span><span class="o">|</span><span class="w">           </span><span class="mf">7.55</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">11</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">2.747</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">37.56</span><span class="w"> </span><span class="o">|</span><span class="w">   </span><span class="mi">1536</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">12</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.338</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">11.87</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">192</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">13</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1.295</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">40.00</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">704</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">14</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.482</span><span class="w"> </span><span class="o">|</span><span class="w">           </span><span class="mf">4.16</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">256</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">15</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">2.686</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">38.56</span><span class="w"> </span><span class="o">|</span><span class="w">   </span><span class="mi">1344</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">16</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.884</span><span class="w"> </span><span class="o">|</span><span class="w">           </span><span class="mf">9.08</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">448</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">17</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1.332</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">39.18</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">704</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">18</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1.045</span><span class="w"> </span><span class="o">|</span><span class="w">           </span><span class="mf">3.84</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">576</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">19</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1.391</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">38.09</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">704</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">20</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.777</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">10.34</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">448</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">21</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.739</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">30.97</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">448</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">-------------------------------------------------</span><span class="w"></span>
+<span class="w"> </span><span class="n">Estimated</span><span class="w"> </span><span class="n">total</span><span class="w"> </span><span class="n">latency</span><span class="o">:</span><span class="w"> </span><span class="mf">38.347</span><span class="w"> </span><span class="n">ms</span><span class="w">      </span><span class="n">Trials</span><span class="o">:</span><span class="w"> </span><span class="mi">19992</span><span class="w">   </span><span class="n">Used</span><span class="w" [...]
 </pre></div>
 </div>
 <p>This table lists the latency and (estimated) speed of all tasks.
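To make the table's columns concrete: speed is the task's FLOP count divided by its latency, and the estimated total latency weights each task's latency by how often the task occurs in the network (the scheduler tracks these weights internally). A small sketch, with a FLOP count chosen to roughly reproduce row 3 of the first table above and hypothetical occurrence weights:

    # Speed (GFLOPS) = FLOPs / latency; ~1.029e8 FLOPs at 4.194 ms
    # gives roughly the 24.53 GFLOPS shown in row 3.
    flops = 1.029e8
    latency_ms = 4.194
    speed_gflops = flops / (latency_ms * 1e-3) / 1e9
    print(f"{speed_gflops:.2f} GFLOPS")

    # Estimated total latency: per-task latencies weighted by occurrence counts.
    latencies_ms = [0.013, 0.845, 4.194]  # a subset of the table's latencies
    weights = [1, 2, 3]                   # hypothetical occurrence counts
    total_ms = sum(l * w for l, w in zip(latencies_ms, weights))
    print(f"Estimated total latency (subset): {total_ms:.3f} ms")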
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
index 3948a3fbf..87107d4ee 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
@@ -362,9 +362,9 @@ get it to run, you will need to wrap the body of this tutorial in a <code class=
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">auto_scheduler</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">auto_scheduler</span>
 <span class="kn">import</span> <span class="nn">tvm.relay.testing</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">graph_executor</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">graph_executor</span>
 </pre></div>
 </div>
 <div class="section" id="define-a-network">
@@ -427,7 +427,7 @@ You can use <a class="reference internal" href="../../arch/convert_layout.html#c
         <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <a href="../../reference/api/python/relay/testing.html#tvm.relay.testing.inception_v3.get_workload" title="View documentation for tvm.relay.testing.inception_v3.get_workload"><span class="n">relay</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">inception_v3</span><span class="o">.</span><span class="n">get_workload</span></a [...]
     <span class="k">elif</span> <span class="n">name</span> <span class="o">==</span> <span class="s2">&quot;mxnet&quot;</span><span class="p">:</span>
         <span class="c1"># an example for mxnet model</span>
-        <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="k">import</span> <span class="n">get_model</span>
+        <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="kn">import</span> <span class="n">get_model</span>
 
         <span class="k">assert</span> <span class="n">layout</span> <span class="o">==</span> <span class="s2">&quot;NCHW&quot;</span>
 
@@ -796,37 +796,37 @@ and do more analyses later.</p></li>
 <p>During the tuning, a lot of information will be printed on the console.
 These messages are used for debugging. The most important part is the output
 of the task scheduler. The following table is a sample output.</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="o">----------------------------------------------------------------------</span>
-<span class="o">------------------------------</span>  <span class="p">[</span> <span class="n">Task</span> <span class="n">Scheduler</span> <span class="p">]</span>
-<span class="o">----------------------------------------------------------------------</span>
-<span class="o">|</span>  <span class="n">ID</span>  <span class="o">|</span> <span class="n">Latency</span> <span class="p">(</span><span class="n">ms</span><span class="p">)</span> <span class="o">|</span> <span class="n">Speed</span> <span class="p">(</span><span class="n">GFLOPS</span><span class="p">)</span> <span class="o">|</span> <span class="n">Trials</span> <span class="o">|</span>
-<span class="o">-------------------------------------------------</span>
-<span class="o">|</span>    <span class="mi">0</span> <span class="o">|</span>        <span class="mf">0.005</span> <span class="o">|</span>           <span class="mf">0.88</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">1</span> <span class="o">|</span>        <span class="mf">0.010</span> <span class="o">|</span>          <span class="mf">99.10</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">2</span> <span class="o">|</span>        <span class="mf">0.006</span> <span class="o">|</span>           <span class="mf">0.00</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">3</span> <span class="o">|</span>        <span class="mf">0.145</span> <span class="o">|</span>         <span class="mf">979.78</span> <span class="o">|</span>    <span class="mi">384</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">4</span> <span class="o">|</span>        <span class="mf">0.130</span> <span class="o">|</span>        <span class="mf">1097.02</span> <span class="o">|</span>    <span class="mi">384</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">5</span> <span class="o">|</span>        <span class="mf">0.143</span> <span class="o">|</span>         <span class="mf">992.69</span> <span class="o">|</span>    <span class="mi">384</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">6</span> <span class="o">|</span>        <span class="mf">0.076</span> <span class="o">|</span>        <span class="mf">1526.86</span> <span class="o">|</span>    <span class="mi">192</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">7</span> <span class="o">|</span>        <span class="mf">0.115</span> <span class="o">|</span>         <span class="mf">999.44</span> <span class="o">|</span>    <span class="mi">320</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">8</span> <span class="o">|</span>        <span class="mf">0.079</span> <span class="o">|</span>        <span class="mf">1449.39</span> <span class="o">|</span>    <span class="mi">320</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">9</span> <span class="o">|</span>        <span class="mf">0.122</span> <span class="o">|</span>         <span class="mf">938.73</span> <span class="o">|</span>    <span class="mi">384</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">10</span> <span class="o">|</span>        <span class="mf">0.063</span> <span class="o">|</span>        <span class="mf">1832.98</span> <span class="o">|</span>    <span class="mi">192</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">11</span> <span class="o">|</span>        <span class="mf">0.072</span> <span class="o">|</span>        <span class="mf">1763.62</span> <span class="o">|</span>    <span class="mi">256</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">12</span> <span class="o">|</span>        <span class="mf">0.062</span> <span class="o">|</span>        <span class="mf">2036.40</span> <span class="o">|</span>    <span class="mi">192</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">13</span> <span class="o">|</span>        <span class="mf">0.068</span> <span class="o">|</span>        <span class="mf">1874.44</span> <span class="o">|</span>    <span class="mi">192</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">14</span> <span class="o">|</span>        <span class="mf">0.049</span> <span class="o">|</span>        <span class="mf">2346.50</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">15</span> <span class="o">|</span>        <span class="mf">0.076</span> <span class="o">|</span>        <span class="mf">1694.31</span> <span class="o">|</span>    <span class="mi">256</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">16</span> <span class="o">|</span>        <span class="mf">0.067</span> <span class="o">|</span>        <span class="mf">1933.30</span> <span class="o">|</span>    <span class="mi">448</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">17</span> <span class="o">|</span>        <span class="mf">0.076</span> <span class="o">|</span>        <span class="mf">1680.90</span> <span class="o">|</span>    <span class="mi">256</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">18</span> <span class="o">|</span>        <span class="mf">0.022</span> <span class="o">|</span>          <span class="mf">98.43</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">19</span> <span class="o">|</span>        <span class="mf">0.076</span> <span class="o">|</span>        <span class="mf">3112.55</span> <span class="o">|</span>    <span class="mi">192</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">20</span> <span class="o">|</span>        <span class="mf">0.013</span> <span class="o">|</span>        <span class="mf">2026.44</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">21</span> <span class="o">|</span>        <span class="mf">0.011</span> <span class="o">|</span>        <span class="mf">1136.69</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">22</span> <span class="o">|</span>        <span class="mf">0.013</span> <span class="o">|</span>         <span class="mf">992.47</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">23</span> <span class="o">|</span>        <span class="mf">0.020</span> <span class="o">|</span>         <span class="mf">627.56</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">-------------------------------------------------</span>
-<span class="n">Estimated</span> <span class="n">total</span> <span class="nl">latency</span><span class="p">:</span> <span class="mf">1.587</span> <span class="n">ms</span>  <span class="nl">Trials</span><span class="p">:</span> <span class="mi">4992</span>  <span class="n">Used</span> <span class="nl">time</span> <span class="p">:</span> <span class="mi">13296</span> <span class="n">s</span>  <span class="n">Next</span> <span class="nl">ID</span><span class="p">:</span> <span class="mi [...]
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="o">----------------------------------------------------------------------</span><span class="w"></span>
+<span class="o">------------------------------</span><span class="w">  </span><span class="p">[</span><span class="w"> </span><span class="n">Task</span><span class="w"> </span><span class="n">Scheduler</span><span class="w"> </span><span class="p">]</span><span class="w"></span>
+<span class="o">----------------------------------------------------------------------</span><span class="w"></span>
+<span class="o">|</span><span class="w">  </span><span class="n">ID</span><span class="w">  </span><span class="o">|</span><span class="w"> </span><span class="n">Latency</span><span class="w"> </span><span class="p">(</span><span class="n">ms</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">Speed</span><span class="w"> </span><span class="p">(</span><span class="n">GFLOPS</span><span class="p">)</span><span class="w">  [...]
+<span class="o">-------------------------------------------------</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">0</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.005</span><span class="w"> </span><span class="o">|</span><span class="w">           </span><span class="mf">0.88</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">1</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.010</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">99.10</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">2</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.006</span><span class="w"> </span><span class="o">|</span><span class="w">           </span><span class="mf">0.00</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">3</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.145</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">979.78</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">384</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">4</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.130</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1097.02</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">384</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">5</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.143</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">992.69</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">384</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">6</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.076</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1526.86</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">192</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">7</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.115</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">999.44</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">320</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">8</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.079</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1449.39</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">320</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">9</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.122</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">938.73</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">384</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">10</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.063</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1832.98</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">192</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">11</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.072</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1763.62</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">256</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">12</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.062</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">2036.40</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">192</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">13</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.068</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1874.44</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">192</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">14</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.049</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">2346.50</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">15</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.076</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1694.31</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">256</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">16</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.067</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1933.30</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">448</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">17</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.076</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1680.90</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">256</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">18</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.022</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">98.43</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">19</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.076</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">3112.55</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">192</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">20</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.013</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">2026.44</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">21</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.011</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1136.69</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">22</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.013</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">992.47</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">23</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.020</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">627.56</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">-------------------------------------------------</span><span class="w"></span>
+<span class="n">Estimated</span><span class="w"> </span><span class="n">total</span><span class="w"> </span><span class="n">latency</span><span class="o">:</span><span class="w"> </span><span class="mf">1.587</span><span class="w"> </span><span class="n">ms</span><span class="w">  </span><span class="n">Trials</span><span class="o">:</span><span class="w"> </span><span class="mi">4992</span><span class="w">  </span><span class="n">Used</span><span class="w"> </span><span class="n">time</ [...]
 </pre></div>
 </div>
 <p>This table lists the latency and (estimated) speed of all tasks.
@@ -876,7 +876,7 @@ so we can read the log file and load the best schedules.</p>
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-   9.4762       9.4668       9.5029       9.4590       0.0191
+   9.9205       9.9307       9.9435       9.8873       0.0241
 </pre></div>
 </div>
 </div>
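For context on the numbers above: such an execution time summary is what TVM's graph executor benchmark helper prints. A minimal sketch, assuming `lib` (the compiled module), `dev` (the target device), and `data` (an input tensor) already exist from the compilation step shown earlier in the tutorial:

    from tvm.contrib import graph_executor

    # Instantiate the compiled module on the device and feed it an input.
    module = graph_executor.GraphModule(lib["default"](dev))
    module.set_input("data", data)

    # benchmark() runs the model repeatedly and reports
    # mean/median/max/min/std of the measured run times.
    print(module.benchmark(dev, repeat=3, min_repeat_ms=500))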
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_mali.html b/docs/how_to/tune_with_autoscheduler/tune_network_mali.html
index 2b8db11b2..6ae141040 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_mali.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_mali.html
@@ -362,9 +362,9 @@ get it to run, you will need to wrap the body of this tutorial in a <code class=
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">auto_scheduler</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">auto_scheduler</span>
 <span class="kn">import</span> <span class="nn">tvm.relay.testing</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">graph_executor</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">graph_executor</span>
 <span class="kn">import</span> <span class="nn">os</span>
 </pre></div>
 </div>
@@ -428,7 +428,7 @@ You can use <a class="reference internal" href="../../arch/convert_layout.html#c
         <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <a href="../../reference/api/python/relay/testing.html#tvm.relay.testing.inception_v3.get_workload" title="View documentation for tvm.relay.testing.inception_v3.get_workload"><span class="n">relay</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">inception_v3</span><span class="o">.</span><span class="n">get_workload</span></a [...]
     <span class="k">elif</span> <span class="n">name</span> <span class="o">==</span> <span class="s2">&quot;mxnet&quot;</span><span class="p">:</span>
         <span class="c1"># an example for mxnet model</span>
-        <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="k">import</span> <span class="n">get_model</span>
+        <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="kn">import</span> <span class="n">get_model</span>
 
         <span class="k">assert</span> <span class="n">layout</span> <span class="o">==</span> <span class="s2">&quot;NCHW&quot;</span>
 
@@ -751,11 +751,11 @@ and do more analyses later.</p></li>
 
     <span class="c1"># Create graph executor</span>
     <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;=============== Request Remote ===============&quot;</span><span class="p">)</span>
-    <span class="kn">from</span> <span class="nn">tvm.auto_scheduler.utils</span> <span class="k">import</span> <span class="n">request_remote</span>
+    <span class="kn">from</span> <span class="nn">tvm.auto_scheduler.utils</span> <span class="kn">import</span> <span class="n">request_remote</span>
 
     <span class="n">remote</span> <span class="o">=</span> <span class="n">request_remote</span><span class="p">(</span><span class="n">device_key</span><span class="p">,</span> <span class="s2">&quot;127.0.0.1&quot;</span><span class="p">,</span> <span class="mi">9190</span><span class="p">)</span>
     <span class="n">dev</span> <span class="o">=</span> <span class="n">remote</span><span class="o">.</span><span class="n">cl</span><span class="p">()</span>
-    <span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">utils</span><span class="p">,</span> <span class="n">ndk</span>
+    <span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">utils</span><span class="p">,</span> <span class="n">ndk</span>
 
     <span class="n">temp</span> <span class="o">=</span> <a href="../../reference/api/python/contrib.html#tvm.contrib.utils.tempdir" title="View documentation for tvm.contrib.utils.tempdir"><span class="n">utils</span><span class="o">.</span><span class="n">tempdir</span></a><span class="p">()</span>
     <span class="n">filename</span> <span class="o">=</span> <span class="s2">&quot;deploy_lib.so&quot;</span>
@@ -785,42 +785,42 @@ and do more analyses later.</p></li>
 <p>During tuning, a lot of information will be printed on the console.
 It is used for debugging purposes. The most important information is the
 output of the task scheduler. The following table is a sample output.</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="o">----------------------------------------------------------------------</span>
-<span class="o">------------------------------</span>  <span class="p">[</span> <span class="n">Task</span> <span class="n">Scheduler</span> <span class="p">]</span>
-<span class="o">----------------------------------------------------------------------</span>
-<span class="o">|</span>  <span class="n">ID</span>  <span class="o">|</span> <span class="n">Latency</span> <span class="p">(</span><span class="n">ms</span><span class="p">)</span> <span class="o">|</span> <span class="n">Speed</span> <span class="p">(</span><span class="n">GFLOPS</span><span class="p">)</span> <span class="o">|</span> <span class="n">Trials</span> <span class="o">|</span>
-<span class="o">-------------------------------------------------</span>
-<span class="o">|</span>    <span class="mi">0</span> <span class="o">|</span>        <span class="mf">0.010</span> <span class="o">|</span>           <span class="mf">0.40</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">1</span> <span class="o">|</span>        <span class="mf">0.087</span> <span class="o">|</span>          <span class="mf">47.19</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">2</span> <span class="o">|</span>        <span class="mf">0.008</span> <span class="o">|</span>          <span class="o">-</span><span class="mf">0.00</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">3</span> <span class="o">|</span>        <span class="mf">0.177</span> <span class="o">|</span>         <span class="mf">582.07</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">4</span> <span class="o">|</span>        <span class="mf">0.268</span> <span class="o">|</span>         <span class="mf">862.37</span> <span class="o">|</span>    <span class="mi">256</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">5</span> <span class="o">|</span>        <span class="mf">0.166</span> <span class="o">|</span>         <span class="mf">621.13</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">6</span> <span class="o">|</span>        <span class="mf">0.170</span> <span class="o">|</span>         <span class="mf">605.10</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">7</span> <span class="o">|</span>        <span class="mf">0.128</span> <span class="o">|</span>         <span class="mf">403.20</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">8</span> <span class="o">|</span>        <span class="mf">0.189</span> <span class="o">|</span>         <span class="mf">545.71</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">9</span> <span class="o">|</span>        <span class="mf">0.231</span> <span class="o">|</span>        <span class="mf">1001.01</span> <span class="o">|</span>    <span class="mi">448</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">10</span> <span class="o">|</span>        <span class="mf">0.155</span> <span class="o">|</span>         <span class="mf">664.80</span> <span class="o">|</span>    <span class="mi">256</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">11</span> <span class="o">|</span>        <span class="mf">0.155</span> <span class="o">|</span>         <span class="mf">662.86</span> <span class="o">|</span>    <span class="mi">256</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">12</span> <span class="o">|</span>        <span class="mf">0.119</span> <span class="o">|</span>         <span class="mf">434.08</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">13</span> <span class="o">|</span>        <span class="mf">0.199</span> <span class="o">|</span>         <span class="mf">522.13</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">14</span> <span class="o">|</span>        <span class="mf">0.235</span> <span class="o">|</span>         <span class="mf">986.56</span> <span class="o">|</span>    <span class="mi">320</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">15</span> <span class="o">|</span>        <span class="mf">0.149</span> <span class="o">|</span>         <span class="mf">689.13</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">16</span> <span class="o">|</span>        <span class="mf">0.155</span> <span class="o">|</span>         <span class="mf">664.80</span> <span class="o">|</span>    <span class="mi">192</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">17</span> <span class="o">|</span>        <span class="mf">0.151</span> <span class="o">|</span>         <span class="mf">340.64</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">18</span> <span class="o">|</span>        <span class="mf">0.176</span> <span class="o">|</span>         <span class="mf">597.55</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">19</span> <span class="o">|</span>        <span class="mf">0.220</span> <span class="o">|</span>        <span class="mf">1054.37</span> <span class="o">|</span>    <span class="mi">192</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">20</span> <span class="o">|</span>        <span class="mf">0.150</span> <span class="o">|</span>         <span class="mf">686.01</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">21</span> <span class="o">|</span>        <span class="mf">0.159</span> <span class="o">|</span>         <span class="mf">650.88</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">22</span> <span class="o">|</span>        <span class="mf">0.073</span> <span class="o">|</span>         <span class="mf">358.19</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">23</span> <span class="o">|</span>        <span class="mf">0.031</span> <span class="o">|</span>          <span class="mf">70.63</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">24</span> <span class="o">|</span>        <span class="mf">0.251</span> <span class="o">|</span>         <span class="mf">947.73</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">25</span> <span class="o">|</span>        <span class="mf">0.157</span> <span class="o">|</span>         <span class="mf">652.47</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">26</span> <span class="o">|</span>        <span class="mf">0.215</span> <span class="o">|</span>         <span class="mf">954.84</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">27</span> <span class="o">|</span>        <span class="mf">0.237</span> <span class="o">|</span>         <span class="mf">868.92</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">28</span> <span class="o">|</span>        <span class="mf">0.266</span> <span class="o">|</span>         <span class="mf">774.06</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">-------------------------------------------------</span>
-<span class="n">Estimated</span> <span class="n">total</span> <span class="nl">latency</span><span class="p">:</span> <span class="mf">10.016</span> <span class="n">ms</span>      <span class="nl">Trials</span><span class="p">:</span> <span class="mi">3992</span>    <span class="n">Used</span> <span class="nl">time</span> <span class="p">:</span> <span class="mi">1131</span> <span class="n">s</span>      <span class="n">Next</span> <span class="nl">ID</span><span class="p">:</span> <span [...]
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="o">----------------------------------------------------------------------</span><span class="w"></span>
+<span class="o">------------------------------</span><span class="w">  </span><span class="p">[</span><span class="w"> </span><span class="n">Task</span><span class="w"> </span><span class="n">Scheduler</span><span class="w"> </span><span class="p">]</span><span class="w"></span>
+<span class="o">----------------------------------------------------------------------</span><span class="w"></span>
+<span class="o">|</span><span class="w">  </span><span class="n">ID</span><span class="w">  </span><span class="o">|</span><span class="w"> </span><span class="n">Latency</span><span class="w"> </span><span class="p">(</span><span class="n">ms</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">Speed</span><span class="w"> </span><span class="p">(</span><span class="n">GFLOPS</span><span class="p">)</span><span class="w">  [...]
+<span class="o">-------------------------------------------------</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">0</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.010</span><span class="w"> </span><span class="o">|</span><span class="w">           </span><span class="mf">0.40</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">1</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.087</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">47.19</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">2</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.008</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">-0.00</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">3</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.177</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">582.07</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">4</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.268</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">862.37</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">256</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">5</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.166</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">621.13</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">6</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.170</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">605.10</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">7</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.128</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">403.20</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">8</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.189</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">545.71</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">9</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.231</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1001.01</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">448</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">10</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.155</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">664.80</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">256</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">11</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.155</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">662.86</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">256</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">12</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.119</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">434.08</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">13</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.199</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">522.13</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">14</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.235</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">986.56</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">320</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">15</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.149</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">689.13</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">16</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.155</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">664.80</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">192</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">17</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.151</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">340.64</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">18</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.176</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">597.55</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">19</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.220</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1054.37</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">192</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">20</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.150</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">686.01</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">21</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.159</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">650.88</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">22</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.073</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">358.19</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">23</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.031</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">70.63</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">24</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.251</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">947.73</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">25</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.157</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">652.47</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">26</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.215</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">954.84</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">27</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.237</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">868.92</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">28</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.266</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">774.06</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">-------------------------------------------------</span><span class="w"></span>
+<span class="n">Estimated</span><span class="w"> </span><span class="n">total</span><span class="w"> </span><span class="n">latency</span><span class="o">:</span><span class="w"> </span><span class="mf">10.016</span><span class="w"> </span><span class="n">ms</span><span class="w">      </span><span class="n">Trials</span><span class="o">:</span><span class="w"> </span><span class="mi">3992</span><span class="w">    </span><span class="n">Used</span><span class="w"> </span><span class="n" [...]
 </pre></div>
 </div>
 <p>This table lists the latency and (estimated) speed of all tasks.
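As the surrounding hunks note, the tuning log is then read back so the best schedules can be applied at compile time. A minimal sketch of that compilation step, assuming `log_file`, `mod`, `params`, and `target` are defined as in the tutorial:

    import tvm
    from tvm import auto_scheduler, relay

    # Replay the best schedule found for each task during tuning.
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
        ):
            lib = relay.build(mod, target=target, params=params)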
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
index beafa7924..834465a91 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
@@ -362,10 +362,10 @@ get it to run, you will need to wrap the body of this tutorial in a <code class=
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">auto_scheduler</span>
-<span class="kn">from</span> <span class="nn">tvm.relay</span> <span class="k">import</span> <span class="n">data_dep_optimization</span> <span class="k">as</span> <span class="n">ddo</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">auto_scheduler</span>
+<span class="kn">from</span> <span class="nn">tvm.relay</span> <span class="kn">import</span> <span class="n">data_dep_optimization</span> <span class="k">as</span> <span class="n">ddo</span>
 <span class="kn">import</span> <span class="nn">tvm.relay.testing</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">graph_executor</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">graph_executor</span>
 </pre></div>
 </div>
 <div class="section" id="define-a-network">
@@ -428,7 +428,7 @@ You can use <a class="reference internal" href="../../arch/convert_layout.html#c
         <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <a href="../../reference/api/python/relay/testing.html#tvm.relay.testing.inception_v3.get_workload" title="View documentation for tvm.relay.testing.inception_v3.get_workload"><span class="n">relay</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">inception_v3</span><span class="o">.</span><span class="n">get_workload</span></a [...]
     <span class="k">elif</span> <span class="n">name</span> <span class="o">==</span> <span class="s2">&quot;mxnet&quot;</span><span class="p">:</span>
         <span class="c1"># an example for mxnet model</span>
-        <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="k">import</span> <span class="n">get_model</span>
+        <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="kn">import</span> <span class="n">get_model</span>
 
         <span class="k">assert</span> <span class="n">layout</span> <span class="o">==</span> <span class="s2">&quot;NCHW&quot;</span>
 
@@ -447,7 +447,7 @@ You can use <a class="reference internal" href="../../arch/convert_layout.html#c
         <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Network not found.&quot;</span><span class="p">)</span>
 
     <span class="k">if</span> <span class="n">use_sparse</span><span class="p">:</span>
-        <span class="kn">from</span> <span class="nn">tvm.topi.sparse.utils</span> <span class="k">import</span> <span class="n">convert_model_dense_to_sparse</span>
+        <span class="kn">from</span> <span class="nn">tvm.topi.sparse.utils</span> <span class="kn">import</span> <span class="n">convert_model_dense_to_sparse</span>
 
         <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">convert_model_dense_to_sparse</span><span class="p">(</span><span class="n">mod</span><span class="p">,</span> <span class="n">params</span><span class="p">,</span> <span class="n">bs_r</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">random_params</span><span class="o">=</span><span class="kc">True</span><span cl [...]
 
@@ -782,7 +782,7 @@ and do more analyses later.</p></li>
     <span class="p">)</span>
 
     <span class="k">if</span> <span class="n">use_sparse</span><span class="p">:</span>
-        <span class="kn">from</span> <span class="nn">tvm.topi.sparse.utils</span> <span class="k">import</span> <span class="n">sparse_sketch_rules</span>
+        <span class="kn">from</span> <span class="nn">tvm.topi.sparse.utils</span> <span class="kn">import</span> <span class="n">sparse_sketch_rules</span>
 
         <span class="n">search_policy</span> <span class="o">=</span> <span class="p">[</span>
             <a href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.SketchPolicy" title="View documentation for tvm.auto_scheduler.SketchPolicy"><span class="n">auto_scheduler</span><span class="o">.</span><span class="n">SketchPolicy</span></a><span class="p">(</span>
@@ -810,42 +810,42 @@ and do more analyses later.</p></li>
 <p>During tuning, a lot of information will be printed on the console.
 It is used for debugging purposes. The most important information is the
 output of the task scheduler. The following table is a sample output.</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="o">----------------------------------------------------------------------</span>
-<span class="o">------------------------------</span>  <span class="p">[</span> <span class="n">Task</span> <span class="n">Scheduler</span> <span class="p">]</span>
-<span class="o">----------------------------------------------------------------------</span>
-<span class="o">|</span>  <span class="n">ID</span>  <span class="o">|</span> <span class="n">Latency</span> <span class="p">(</span><span class="n">ms</span><span class="p">)</span> <span class="o">|</span> <span class="n">Speed</span> <span class="p">(</span><span class="n">GFLOPS</span><span class="p">)</span> <span class="o">|</span> <span class="n">Trials</span> <span class="o">|</span>
-<span class="o">-------------------------------------------------</span>
-<span class="o">|</span>    <span class="mi">0</span> <span class="o">|</span>        <span class="mf">0.010</span> <span class="o">|</span>           <span class="mf">0.40</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">1</span> <span class="o">|</span>        <span class="mf">0.087</span> <span class="o">|</span>          <span class="mf">47.19</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">2</span> <span class="o">|</span>        <span class="mf">0.008</span> <span class="o">|</span>          <span class="o">-</span><span class="mf">0.00</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">3</span> <span class="o">|</span>        <span class="mf">0.177</span> <span class="o">|</span>         <span class="mf">582.07</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">4</span> <span class="o">|</span>        <span class="mf">0.268</span> <span class="o">|</span>         <span class="mf">862.37</span> <span class="o">|</span>    <span class="mi">256</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">5</span> <span class="o">|</span>        <span class="mf">0.166</span> <span class="o">|</span>         <span class="mf">621.13</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">6</span> <span class="o">|</span>        <span class="mf">0.170</span> <span class="o">|</span>         <span class="mf">605.10</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">7</span> <span class="o">|</span>        <span class="mf">0.128</span> <span class="o">|</span>         <span class="mf">403.20</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">8</span> <span class="o">|</span>        <span class="mf">0.189</span> <span class="o">|</span>         <span class="mf">545.71</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>    <span class="mi">9</span> <span class="o">|</span>        <span class="mf">0.231</span> <span class="o">|</span>        <span class="mf">1001.01</span> <span class="o">|</span>    <span class="mi">448</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">10</span> <span class="o">|</span>        <span class="mf">0.155</span> <span class="o">|</span>         <span class="mf">664.80</span> <span class="o">|</span>    <span class="mi">256</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">11</span> <span class="o">|</span>        <span class="mf">0.155</span> <span class="o">|</span>         <span class="mf">662.86</span> <span class="o">|</span>    <span class="mi">256</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">12</span> <span class="o">|</span>        <span class="mf">0.119</span> <span class="o">|</span>         <span class="mf">434.08</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">13</span> <span class="o">|</span>        <span class="mf">0.199</span> <span class="o">|</span>         <span class="mf">522.13</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">14</span> <span class="o">|</span>        <span class="mf">0.235</span> <span class="o">|</span>         <span class="mf">986.56</span> <span class="o">|</span>    <span class="mi">320</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">15</span> <span class="o">|</span>        <span class="mf">0.149</span> <span class="o">|</span>         <span class="mf">689.13</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">16</span> <span class="o">|</span>        <span class="mf">0.155</span> <span class="o">|</span>         <span class="mf">664.80</span> <span class="o">|</span>    <span class="mi">192</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">17</span> <span class="o">|</span>        <span class="mf">0.151</span> <span class="o">|</span>         <span class="mf">340.64</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">18</span> <span class="o">|</span>        <span class="mf">0.176</span> <span class="o">|</span>         <span class="mf">597.55</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">19</span> <span class="o">|</span>        <span class="mf">0.220</span> <span class="o">|</span>        <span class="mf">1054.37</span> <span class="o">|</span>    <span class="mi">192</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">20</span> <span class="o">|</span>        <span class="mf">0.150</span> <span class="o">|</span>         <span class="mf">686.01</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">21</span> <span class="o">|</span>        <span class="mf">0.159</span> <span class="o">|</span>         <span class="mf">650.88</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">22</span> <span class="o">|</span>        <span class="mf">0.073</span> <span class="o">|</span>         <span class="mf">358.19</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">23</span> <span class="o">|</span>        <span class="mf">0.031</span> <span class="o">|</span>          <span class="mf">70.63</span> <span class="o">|</span>     <span class="mi">64</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">24</span> <span class="o">|</span>        <span class="mf">0.251</span> <span class="o">|</span>         <span class="mf">947.73</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">25</span> <span class="o">|</span>        <span class="mf">0.157</span> <span class="o">|</span>         <span class="mf">652.47</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">26</span> <span class="o">|</span>        <span class="mf">0.215</span> <span class="o">|</span>         <span class="mf">954.84</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">27</span> <span class="o">|</span>        <span class="mf">0.237</span> <span class="o">|</span>         <span class="mf">868.92</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">|</span>   <span class="mi">28</span> <span class="o">|</span>        <span class="mf">0.266</span> <span class="o">|</span>         <span class="mf">774.06</span> <span class="o">|</span>    <span class="mi">128</span> <span class="o">|</span>
-<span class="o">-------------------------------------------------</span>
-<span class="n">Estimated</span> <span class="n">total</span> <span class="nl">latency</span><span class="p">:</span> <span class="mf">10.016</span> <span class="n">ms</span>      <span class="nl">Trials</span><span class="p">:</span> <span class="mi">3992</span>    <span class="n">Used</span> <span class="nl">time</span> <span class="p">:</span> <span class="mi">1131</span> <span class="n">s</span>      <span class="n">Next</span> <span class="nl">ID</span><span class="p">:</span> <span [...]
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="o">----------------------------------------------------------------------</span><span class="w"></span>
+<span class="o">------------------------------</span><span class="w">  </span><span class="p">[</span><span class="w"> </span><span class="n">Task</span><span class="w"> </span><span class="n">Scheduler</span><span class="w"> </span><span class="p">]</span><span class="w"></span>
+<span class="o">----------------------------------------------------------------------</span><span class="w"></span>
+<span class="o">|</span><span class="w">  </span><span class="n">ID</span><span class="w">  </span><span class="o">|</span><span class="w"> </span><span class="n">Latency</span><span class="w"> </span><span class="p">(</span><span class="n">ms</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">Speed</span><span class="w"> </span><span class="p">(</span><span class="n">GFLOPS</span><span class="p">)</span><span class="w">  [...]
+<span class="o">-------------------------------------------------</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">0</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.010</span><span class="w"> </span><span class="o">|</span><span class="w">           </span><span class="mf">0.40</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">1</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.087</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">47.19</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">2</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.008</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">-0.00</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">3</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.177</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">582.07</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">4</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.268</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">862.37</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">256</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">5</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.166</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">621.13</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">6</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.170</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">605.10</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">7</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.128</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">403.20</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">8</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.189</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">545.71</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">    </span><span class="mi">9</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.231</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1001.01</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">448</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">10</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.155</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">664.80</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">256</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">11</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.155</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">662.86</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">256</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">12</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.119</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">434.08</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">13</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.199</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">522.13</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">14</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.235</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">986.56</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">320</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">15</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.149</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">689.13</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">16</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.155</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">664.80</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">192</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">17</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.151</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">340.64</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">18</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.176</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">597.55</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">19</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.220</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">1054.37</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">192</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">20</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.150</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">686.01</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">21</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.159</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">650.88</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">22</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.073</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">358.19</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">23</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.031</span><span class="w"> </span><span class="o">|</span><span class="w">          </span><span class="mf">70.63</span><span class="w"> </span><span class="o">|</span><span class="w">     </span><span class="mi">64</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">24</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.251</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">947.73</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">25</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.157</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">652.47</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">26</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.215</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">954.84</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">27</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.237</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">868.92</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">|</span><span class="w">   </span><span class="mi">28</span><span class="w"> </span><span class="o">|</span><span class="w">        </span><span class="mf">0.266</span><span class="w"> </span><span class="o">|</span><span class="w">         </span><span class="mf">774.06</span><span class="w"> </span><span class="o">|</span><span class="w">    </span><span class="mi">128</span><span class="w"> </span><span class="o">|</span><span class="w"></span>
+<span class="o">-------------------------------------------------</span><span class="w"></span>
+<span class="n">Estimated</span><span class="w"> </span><span class="n">total</span><span class="w"> </span><span class="n">latency</span><span class="o">:</span><span class="w"> </span><span class="mf">10.016</span><span class="w"> </span><span class="n">ms</span><span class="w">      </span><span class="n">Trials</span><span class="o">:</span><span class="w"> </span><span class="mi">3992</span><span class="w">    </span><span class="n">Used</span><span class="w"> </span><span class="n" [...]
 </pre></div>
 </div>
 <p>This table lists the latency and (estimated) speed of all tasks.
@@ -895,7 +895,7 @@ so we can read the log file and load the best schedules.</p>
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  766.3483     762.8012     773.6176     762.6261      5.1407
+  750.3560     751.6426     751.9646     747.4609      2.0514
 </pre></div>
 </div>
 </div>
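The "read the log file and load the best schedules" step referenced above follows the standard auto-scheduler compile flow. A minimal sketch, assuming log_file, mod, params, and target come from the earlier tuning and network-definition steps of this tutorial:

    import tvm
    from tvm import relay, auto_scheduler
    from tvm.contrib import graph_executor

    # Compile the network, replaying the best schedules recorded in log_file.
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
        ):
            lib = relay.build(mod, target=target, params=params)

    # Create a graph executor and measure end-to-end inference time; this is
    # what produces the "Execution time summary" table shown above.
    dev = tvm.cpu()
    module = graph_executor.GraphModule(lib["default"](dev))
    print(module.benchmark(dev, repeat=3, min_repeat_ms=500))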
@@ -917,7 +917,7 @@ to learn how to use the RPC Tracker and RPC Server.
 To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
 with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>, as in the sketch after this list.</p></li>
 </ol>
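A minimal sketch of that substitution; the tracker host, port, device key, and trial count below are illustrative placeholders, not values from this tutorial:

    from tvm import auto_scheduler

    # Measure candidates on remote devices registered with the RPC Tracker
    # instead of on the local machine.
    runner = auto_scheduler.RPCRunner(
        key="my-device-key",   # key used when registering devices with the tracker
        host="127.0.0.1",      # RPC Tracker host
        port=9190,             # RPC Tracker port
        timeout=30,
    )
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,
        runner=runner,
        measure_callbacks=[auto_scheduler.RecordToFile("network.json")],
    )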
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  20.086 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  19.500 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-x86-py">
 <div class="sphx-glr-download docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/e416b94ca1090b0897c0f6e0df95b911/tune_network_x86.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_x86.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
index 6410d1a02..26be528a5 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
@@ -357,10 +357,10 @@ get it to run, you will need to wrap the body of this tutorial in a <code class=
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 <span class="kn">import</span> <span class="nn">tvm</span>
 <span class="kn">import</span> <span class="nn">tvm.testing</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span><span class="p">,</span> <span class="n">auto_scheduler</span><span class="p">,</span> <span class="n">runtime</span><span class="p">,</span> <span class="n">topi</span>
-<span class="kn">from</span> <span class="nn">tvm.auto_scheduler</span> <span class="k">import</span> <span class="n">_ffi_api</span>
-<span class="kn">from</span> <span class="nn">tvm.topi.utils</span> <span class="k">import</span> <span class="n">get_const_tuple</span>
-<span class="kn">from</span> <span class="nn">tvm.topi.sparse.utils</span> <span class="k">import</span> <span class="n">random_bsr_matrix</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span><span class="p">,</span> <span class="n">auto_scheduler</span><span class="p">,</span> <span class="n">runtime</span><span class="p">,</span> <span class="n">topi</span>
+<span class="kn">from</span> <span class="nn">tvm.auto_scheduler</span> <span class="kn">import</span> <span class="n">_ffi_api</span>
+<span class="kn">from</span> <span class="nn">tvm.topi.utils</span> <span class="kn">import</span> <span class="n">get_const_tuple</span>
+<span class="kn">from</span> <span class="nn">tvm.topi.sparse.utils</span> <span class="kn">import</span> <span class="n">random_bsr_matrix</span>
 </pre></div>
 </div>
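As the note above explains, the body of this tutorial needs to be wrapped in a top-level guard to run as a standalone script. A minimal sketch, where main() is a hypothetical wrapper for the code that follows:

    def main():
        # Body of the tutorial: define the sparse workload, create the
        # auto_scheduler.SearchTask, tune, and evaluate the best schedule.
        ...

    if __name__ == "__main__":
        main()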
 <div class="section" id="define-the-computation">
@@ -600,77 +600,31 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
              placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [65536], []),
              compute: Buffer(compute_2: Pointer(float32), float32, [65536], [])}
   buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute}
-  preflattened_buffer_map = {placeholder_9: placeholder_15: Buffer(placeholder_14, float32, [128, 512], []), placeholder_7: placeholder_16: Buffer(placeholder_12, int32, [4916], []), placeholder_8: placeholder_17: Buffer(placeholder_13, int32, [33], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_5: placeholder_18: Buffer(placeholder_10, float32, [128, 256], []), placeholder_6: placeholder_19: Buffer(placeholder_11, float32, [4916, 16, 1], [])} {
-  for (i0.outer.i1.outer.fused: int32, 0, 32) &quot;parallel&quot; {
-    allocate(compute_4: Pointer(global float32), float32, [2048]), storage_scope = global {
-      for (nb_j.inner: int32, 0, 2) {
-        for (i.inner.init: int32, 0, 64) {
-          let cse_var_1: int32 = ((i.inner.init*32) + (nb_j.inner*16))
-           {
-            compute_5: Buffer(compute_4, float32, [2048], [])[cse_var_1] = 0f32
-            compute_5[(cse_var_1 + 1)] = 0f32
-            compute_5[(cse_var_1 + 2)] = 0f32
-            compute_5[(cse_var_1 + 3)] = 0f32
-            compute_5[(cse_var_1 + 4)] = 0f32
-            compute_5[(cse_var_1 + 5)] = 0f32
-            compute_5[(cse_var_1 + 6)] = 0f32
-            compute_5[(cse_var_1 + 7)] = 0f32
-            compute_5[(cse_var_1 + 8)] = 0f32
-            compute_5[(cse_var_1 + 9)] = 0f32
-            compute_5[(cse_var_1 + 10)] = 0f32
-            compute_5[(cse_var_1 + 11)] = 0f32
-            compute_5[(cse_var_1 + 12)] = 0f32
-            compute_5[(cse_var_1 + 13)] = 0f32
-            compute_5[(cse_var_1 + 14)] = 0f32
-            compute_5[(cse_var_1 + 15)] = 0f32
+  preflattened_buffer_map = {compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_9: placeholder_15: Buffer(placeholder_14, float32, [128, 512], []), placeholder_6: placeholder_16: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_7: placeholder_17: Buffer(placeholder_12, int32, [4916], []), placeholder_8: placeholder_18: Buffer(placeholder_13, int32, [33], []), placeholder_5: placeholder_19: Buffer(placeholder_10, float32, [128, 256], [])} {
+  for (i0.outer.i1.outer.fused: int32, 0, 16) &quot;parallel&quot; {
+    allocate(compute_4: Pointer(global float32), float32, [4096]), storage_scope = global {
+      for (i.outer.inner: int32, 0, 32) {
+        for (nb_j.inner: int32, 0, 2) {
+          for (i.inner.init: int32, 0, 4) {
+            for (j.init: int32, 0, 16) {
+              compute_5: Buffer(compute_4, float32, [4096], [])[((((i.outer.inner*128) + (i.inner.init*32)) + (nb_j.inner*16)) + j.init)] = 0f32
+            }
           }
-        }
-        for (elem_idx: int32, 0, let cse_var_2: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) in (placeholder_3[(cse_var_2 + 1)] - placeholder_3[cse_var_2])) {
-          for (i.inner: int32, 0, 64) {
-            let cse_var_21: int32 = (elem_idx*16)
-            let cse_var_20: int32 = ((i.inner*32) + (nb_j.inner*16))
-            let cse_var_19: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)
-            let cse_var_18: int32 = (cse_var_20 + 1)
-            let cse_var_17: int32 = (cse_var_20 + 11)
-            let cse_var_16: int32 = (cse_var_20 + 12)
-            let cse_var_15: int32 = (cse_var_20 + 13)
-            let cse_var_14: int32 = (cse_var_20 + 14)
-            let cse_var_13: int32 = (cse_var_20 + 15)
-            let cse_var_12: int32 = (cse_var_20 + 2)
-            let cse_var_11: int32 = (cse_var_20 + 3)
-            let cse_var_10: int32 = (cse_var_20 + 4)
-            let cse_var_9: int32 = (cse_var_20 + 5)
-            let cse_var_8: int32 = (cse_var_20 + 6)
-            let cse_var_7: int32 = (cse_var_20 + 7)
-            let cse_var_6: int32 = (cse_var_20 + 8)
-            let cse_var_5: int32 = (cse_var_20 + 9)
-            let cse_var_4: int32 = ((floordiv(i0.outer.i1.outer.fused, 16)*16384) + (i.inner*256))
-            let cse_var_3: int32 = (cse_var_20 + 10)
-             {
-              compute_5[cse_var_20] = (compute_5[cse_var_20] + (placeholder_1[((placeholder_3[cse_var_19]*16) + cse_var_21)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_18] = (compute_5[cse_var_18] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 1)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 2)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 3)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 4)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 5)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 6)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 7)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 8)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 9)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 10)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 11)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 12)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 13)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 14)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
-              compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[cse_var_19]*16) + cse_var_21) + 15)]*max(placeholder[(cse_var_4 + placeholder_2[(placeholder_3[cse_var_19] + elem_idx)])], 0f32)))
+          for (elem_idx: int32, 0, let cse_var_1: int32 = ((i0.outer.i1.outer.fused*2) + nb_j.inner) in (placeholder_3[(cse_var_1 + 1)] - placeholder_3[cse_var_1])) {
+            for (i.inner: int32, 0, 4) {
+              for (j: int32, 0, 16) {
+                let cse_var_3: int32 = ((i0.outer.i1.outer.fused*2) + nb_j.inner)
+                let cse_var_2: int32 = ((((i.outer.inner*128) + (i.inner*32)) + (nb_j.inner*16)) + j)
+                compute_5[cse_var_2] = (compute_5[cse_var_2] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + j)]*max(placeholder[(((i.outer.inner*1024) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
+              }
             }
           }
         }
       }
-      for (i0.inner: int32, 0, 64) {
+      for (i0.inner: int32, 0, 128) {
         for (i1.inner: int32, 0, 32) {
-          let cse_var_22: int32 = ((((floordiv(i0.outer.i1.outer.fused, 16)*32768) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32)) + i1.inner)
-          compute[cse_var_22] = max((compute_5[((i0.inner*32) + i1.inner)] + placeholder_4[cse_var_22]), 0f32)
+          let cse_var_4: int32 = (((i0.inner*512) + (i0.outer.i1.outer.fused*32)) + i1.inner)
+          compute[cse_var_4] = max((compute_5[((i0.inner*32) + i1.inner)] + placeholder_4[cse_var_4]), 0f32)
         }
       }
     }
@@ -710,46 +664,46 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.840 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.446 ms
 </pre></div>
 </div>
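The operator time reported above comes from building the best schedule found during tuning and timing it. A minimal sketch of that step, assuming a SearchTask named task, a log file "sparse_dense.json", and prepared input arrays; all of these names are illustrative:

    import numpy as np
    import tvm

    # Replay the best schedule from the tuning log and build it.
    sch, args = task.apply_best("sparse_dense.json")
    func = tvm.build(sch, args, target="llvm")

    # Time the compiled operator; `inputs` holds tvm.nd.array arguments
    # matching `args`.
    dev = tvm.cpu()
    evaluator = func.time_evaluator(func.entry_name, dev, repeat=10, min_repeat_ms=500)
    print("Execution time: %.3f ms" % (np.median(evaluator(*inputs).results) * 1000))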
 <div class="admonition note">
 <p class="admonition-title">Note</p>
 <p>Tuning result example</p>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="o">----------------------------------------------------------------------</span>
-<span class="n">Lowered</span> <span class="nl">TIR</span><span class="p">:</span>
-<span class="n">primfn</span><span class="p">(</span><span class="nl">placeholder_5</span><span class="p">:</span> <span class="n">handle</span><span class="p">,</span> <span class="nl">placeholder_6</span><span class="p">:</span> <span class="n">handle</span><span class="p">,</span> <span class="nl">placeholder_7</span><span class="p">:</span> <span class="n">handle</span><span class="p">,</span> <span class="nl">placeholder_8</span><span class="p">:</span> <span class="n">handle</span> [...]
-  <span class="n">attr</span> <span class="o">=</span> <span class="p">{</span><span class="s">&quot;global_symbol&quot;</span><span class="o">:</span> <span class="s">&quot;main&quot;</span><span class="p">,</span> <span class="s">&quot;tir.noalias&quot;</span><span class="o">:</span> <span class="n">True</span><span class="p">}</span>
-  <span class="n">buffers</span> <span class="o">=</span> <span class="p">{</span><span class="nl">placeholder_2</span><span class="p">:</span> <span class="n">Buffer</span><span class="p">(</span><span class="nl">placeholder_10</span><span class="p">:</span> <span class="n">Pointer</span><span class="p">(</span><span class="n">float32</span><span class="p">),</span> <span class="n">float32</span><span class="p">,</span> <span class="p">[</span><span class="mi">9831</span><span class="p" [...]
-             <span class="nl">placeholder_4</span><span class="p">:</span> <span class="n">Buffer</span><span class="p">(</span><span class="nl">placeholder_11</span><span class="p">:</span> <span class="n">Pointer</span><span class="p">(</span><span class="n">int32</span><span class="p">),</span> <span class="n">int32</span><span class="p">,</span> <span class="p">[</span><span class="mi">33</span><span class="p">],</span> <span class="p">[]),</span>
-             <span class="nl">placeholder_3</span><span class="p">:</span> <span class="n">Buffer</span><span class="p">(</span><span class="nl">placeholder_12</span><span class="p">:</span> <span class="n">Pointer</span><span class="p">(</span><span class="n">float32</span><span class="p">),</span> <span class="n">float32</span><span class="p">,</span> <span class="p">[</span><span class="mi">512</span><span class="p">,</span> <span class="mi">512</span><span class="p">],</span> <span c [...]
-             <span class="nl">compute</span><span class="p">:</span> <span class="n">Buffer</span><span class="p">(</span><span class="nl">compute_2</span><span class="p">:</span> <span class="n">Pointer</span><span class="p">(</span><span class="n">float32</span><span class="p">),</span> <span class="n">float32</span><span class="p">,</span> <span class="p">[</span><span class="mi">512</span><span class="p">,</span> <span class="mi">512</span><span class="p">],</span> <span class="p">[] [...]
-             <span class="nl">placeholder_1</span><span class="p">:</span> <span class="n">Buffer</span><span class="p">(</span><span class="nl">placeholder_13</span><span class="p">:</span> <span class="n">Pointer</span><span class="p">(</span><span class="n">float32</span><span class="p">),</span> <span class="n">float32</span><span class="p">,</span> <span class="p">[</span><span class="mi">512</span><span class="p">,</span> <span class="mi">512</span><span class="p">],</span> <span c [...]
-             <span class="nl">placeholder</span><span class="p">:</span> <span class="n">Buffer</span><span class="p">(</span><span class="nl">placeholder_14</span><span class="p">:</span> <span class="n">Pointer</span><span class="p">(</span><span class="n">int32</span><span class="p">),</span> <span class="n">int32</span><span class="p">,</span> <span class="p">[</span><span class="mi">9831</span><span class="p">],</span> <span class="p">[])}</span>
-  <span class="n">buffer_map</span> <span class="o">=</span> <span class="p">{</span><span class="nl">placeholder_7</span><span class="p">:</span> <span class="n">placeholder</span><span class="p">,</span> <span class="nl">placeholder_9</span><span class="p">:</span> <span class="n">placeholder_1</span><span class="p">,</span> <span class="nl">placeholder_6</span><span class="p">:</span> <span class="n">placeholder_2</span><span class="p">,</span> <span class="nl">compute_1</span><span c [...]
-  <span class="k">for</span> <span class="p">(</span><span class="n">i0</span><span class="p">.</span><span class="n">outer</span><span class="p">.</span><span class="n">i1</span><span class="p">.</span><span class="n">outer</span><span class="p">.</span><span class="nl">fused</span><span class="p">:</span> <span class="n">int32</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1024</span><span class="p">)</span> <span class="s">&quot;paral [...]
-    <span class="n">attr</span> <span class="p">[</span><span class="nl">compute_3</span><span class="p">:</span> <span class="n">Pointer</span><span class="p">(</span><span class="n">float32</span><span class="p">)]</span> <span class="s">&quot;storage_scope&quot;</span> <span class="o">=</span> <span class="s">&quot;global&quot;</span><span class="p">;</span>
-    <span class="n">allocate</span><span class="p">(</span><span class="n">compute_3</span><span class="p">,</span> <span class="n">float32</span><span class="p">,</span> <span class="p">[</span><span class="mi">256</span><span class="p">])</span> <span class="p">{</span>
-      <span class="k">for</span> <span class="p">(</span><span class="n">nb_j</span><span class="p">.</span><span class="nl">inner</span><span class="p">:</span> <span class="n">int32</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span> <span class="p">{</span>
-        <span class="k">for</span> <span class="p">(</span><span class="n">i</span><span class="p">.</span><span class="n">inner</span><span class="p">.</span><span class="nl">init</span><span class="p">:</span> <span class="n">int32</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">8</span><span class="p">)</span> <span class="p">{</span>
-          <span class="k">for</span> <span class="p">(</span><span class="n">j</span><span class="p">.</span><span class="nl">init</span><span class="p">:</span> <span class="n">int32</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">16</span><span class="p">)</span> <span class="p">{</span>
-            <span class="n">compute_3</span><span class="p">[(((</span><span class="n">i</span><span class="p">.</span><span class="n">inner</span><span class="p">.</span><span class="n">init</span><span class="o">*</span><span class="mi">32</span><span class="p">)</span> <span class="o">+</span> <span class="p">(</span><span class="n">nb_j</span><span class="p">.</span><span class="n">inner</span><span class="o">*</span><span class="mi">16</span><span class="p">))</span> <span class="o" [...]
-          <span class="p">}</span>
-        <span class="p">}</span>
-        <span class="k">for</span> <span class="p">(</span><span class="nl">elem_idx</span><span class="p">:</span> <span class="n">int32</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="p">((</span><span class="n">int32</span><span class="o">*</span><span class="p">)</span><span class="n">placeholder_11</span><span class="p">[(((</span><span class="n">floormod</span><span class="p">(</span><span class="n">i0</span><span class="p">.</span><spa [...]
-          <span class="k">for</span> <span class="p">(</span><span class="n">i</span><span class="p">.</span><span class="nl">inner</span><span class="p">:</span> <span class="n">int32</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">8</span><span class="p">)</span> <span class="p">{</span>
-            <span class="k">for</span> <span class="p">(</span><span class="nl">j</span><span class="p">:</span> <span class="n">int32</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">16</span><span class="p">)</span> <span class="p">{</span>
-              <span class="n">compute_3</span><span class="p">[(((</span><span class="n">i</span><span class="p">.</span><span class="n">inner</span><span class="o">*</span><span class="mi">32</span><span class="p">)</span> <span class="o">+</span> <span class="p">(</span><span class="n">nb_j</span><span class="p">.</span><span class="n">inner</span><span class="o">*</span><span class="mi">16</span><span class="p">))</span> <span class="o">+</span> <span class="n">j</span><span class="p" [...]
-            <span class="p">}</span>
-          <span class="p">}</span>
-        <span class="p">}</span>
-      <span class="p">}</span>
-      <span class="k">for</span> <span class="p">(</span><span class="n">i0</span><span class="p">.</span><span class="nl">inner</span><span class="p">:</span> <span class="n">int32</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">8</span><span class="p">)</span> <span class="p">{</span>
-        <span class="n">compute_2</span><span class="p">[</span><span class="n">ramp</span><span class="p">((((</span><span class="n">floordiv</span><span class="p">(</span><span class="n">i0</span><span class="p">.</span><span class="n">outer</span><span class="p">.</span><span class="n">i1</span><span class="p">.</span><span class="n">outer</span><span class="p">.</span><span class="n">fused</span><span class="p">,</span> <span class="mi">16</span><span class="p">)</span><span class="o [...]
-      <span class="p">}</span>
-    <span class="p">}</span>
-  <span class="p">}</span>
-<span class="p">}</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="o">----------------------------------------------------------------------</span><span class="w"></span>
+<span class="n">Lowered</span><span class="w"> </span><span class="n">TIR</span><span class="o">:</span><span class="w"></span>
+<span class="n">primfn</span><span class="p">(</span><span class="n">placeholder_5</span><span class="o">:</span><span class="w"> </span><span class="n">handle</span><span class="p">,</span><span class="w"> </span><span class="n">placeholder_6</span><span class="o">:</span><span class="w"> </span><span class="n">handle</span><span class="p">,</span><span class="w"> </span><span class="n">placeholder_7</span><span class="o">:</span><span class="w"> </span><span class="n">handle</span><spa [...]
+<span class="w">  </span><span class="n">attr</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">{</span><span class="s">&quot;global_symbol&quot;</span><span class="o">:</span><span class="w"> </span><span class="s">&quot;main&quot;</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;tir.noalias&quot;</span><span class="o">:</span><span class="w"> </span><span class="n">True</span><span class="p">}</span><span class="w"></span>
+<span class="w">  </span><span class="n">buffers</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">{</span><span class="n">placeholder_2</span><span class="o">:</span><span class="w"> </span><span class="n">Buffer</span><span class="p">(</span><span class="n">placeholder_10</span><span class="o">:</span><span class="w"> </span><span class="n">Pointer</span><span class="p">(</span><span class="n">float32</span><span class="p">),</span><span clas [...]
+<span class="w">             </span><span class="nl">placeholder_4</span><span class="p">:</span><span class="w"> </span><span class="n">Buffer</span><span class="p">(</span><span class="n">placeholder_11</span><span class="o">:</span><span class="w"> </span><span class="n">Pointer</span><span class="p">(</span><span class="n">int32</span><span class="p">),</span><span class="w"> </span><span class="n">int32</span><span class="p">,</span><span class="w"> </span><span class="p">[</span><s [...]
+<span class="w">             </span><span class="nl">placeholder_3</span><span class="p">:</span><span class="w"> </span><span class="n">Buffer</span><span class="p">(</span><span class="n">placeholder_12</span><span class="o">:</span><span class="w"> </span><span class="n">Pointer</span><span class="p">(</span><span class="n">float32</span><span class="p">),</span><span class="w"> </span><span class="n">float32</span><span class="p">,</span><span class="w"> </span><span class="p">[</spa [...]
+<span class="w">             </span><span class="nl">compute</span><span class="p">:</span><span class="w"> </span><span class="n">Buffer</span><span class="p">(</span><span class="n">compute_2</span><span class="o">:</span><span class="w"> </span><span class="n">Pointer</span><span class="p">(</span><span class="n">float32</span><span class="p">),</span><span class="w"> </span><span class="n">float32</span><span class="p">,</span><span class="w"> </span><span class="p">[</span><span cla [...]
+<span class="w">             </span><span class="nl">placeholder_1</span><span class="p">:</span><span class="w"> </span><span class="n">Buffer</span><span class="p">(</span><span class="n">placeholder_13</span><span class="o">:</span><span class="w"> </span><span class="n">Pointer</span><span class="p">(</span><span class="n">float32</span><span class="p">),</span><span class="w"> </span><span class="n">float32</span><span class="p">,</span><span class="w"> </span><span class="p">[</spa [...]
+<span class="w">             </span><span class="nl">placeholder</span><span class="p">:</span><span class="w"> </span><span class="n">Buffer</span><span class="p">(</span><span class="n">placeholder_14</span><span class="o">:</span><span class="w"> </span><span class="n">Pointer</span><span class="p">(</span><span class="n">int32</span><span class="p">),</span><span class="w"> </span><span class="n">int32</span><span class="p">,</span><span class="w"> </span><span class="p">[</span><spa [...]
+<span class="w">  </span><span class="n">buffer_map</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">{</span><span class="n">placeholder_7</span><span class="o">:</span><span class="w"> </span><span class="n">placeholder</span><span class="p">,</span><span class="w"> </span><span class="n">placeholder_9</span><span class="o">:</span><span class="w"> </span><span class="n">placeholder_1</span><span class="p">,</span><span class="w"> </span><spa [...]
+<span class="w">  </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="n">i0</span><span class="p">.</span><span class="n">outer</span><span class="p">.</span><span class="n">i1</span><span class="p">.</span><span class="n">outer</span><span class="p">.</span><span class="n">fused</span><span class="o">:</span><span class="w"> </span><span class="n">int32</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p [...]
+<span class="w">    </span><span class="n">attr</span><span class="w"> </span><span class="p">[</span><span class="n">compute_3</span><span class="o">:</span><span class="w"> </span><span class="n">Pointer</span><span class="p">(</span><span class="n">float32</span><span class="p">)]</span><span class="w"> </span><span class="s">&quot;storage_scope&quot;</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;global&quot;</span><span class="p"> [...]
+<span class="w">    </span><span class="n">allocate</span><span class="p">(</span><span class="n">compute_3</span><span class="p">,</span><span class="w"> </span><span class="n">float32</span><span class="p">,</span><span class="w"> </span><span class="p">[</span><span class="mi">256</span><span class="p">])</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">      </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="n">nb_j</span><span class="p">.</span><span class="n">inner</span><span class="o">:</span><span class="w"> </span><span class="n">int32</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">        </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="n">i</span><span class="p">.</span><span class="n">inner</span><span class="p">.</span><span class="n">init</span><span class="o">:</span><span class="w"> </span><span class="n">int32</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="mi">8</span><span class="p">)</span><span class="w [...]
+<span class="w">          </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="n">j</span><span class="p">.</span><span class="n">init</span><span class="o">:</span><span class="w"> </span><span class="n">int32</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="mi">16</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w" [...]
+<span class="w">            </span><span class="n">compute_3</span><span class="p">[(((</span><span class="n">i</span><span class="p">.</span><span class="n">inner</span><span class="p">.</span><span class="n">init</span><span class="o">*</span><span class="mi">32</span><span class="p">)</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="n">nb_j</span><span class="p">.</span><span class="n">inner</span><span class="o">*</spa [...]
+<span class="w">          </span><span class="p">}</span><span class="w"></span>
+<span class="w">        </span><span class="p">}</span><span class="w"></span>
+<span class="w">        </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="n">elem_idx</span><span class="o">:</span><span class="w"> </span><span class="n">int32</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="p">((</span><span class="n">int32</span><span class="o">*</span><span class="p">)</span><span class="n">placeholder_11</span><span class="p">[(((< [...]
+<span class="w">          </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="n">i</span><span class="p">.</span><span class="n">inner</span><span class="o">:</span><span class="w"> </span><span class="n">int32</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="mi">8</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w" [...]
+<span class="w">            </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="n">j</span><span class="o">:</span><span class="w"> </span><span class="n">int32</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="mi">16</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">              </span><span class="n">compute_3</span><span class="p">[(((</span><span class="n">i</span><span class="p">.</span><span class="n">inner</span><span class="o">*</span><span class="mi">32</span><span class="p">)</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="p">(</span><span class="n">nb_j</span><span class="p">.</span><span class="n">inner</span><span class="o">*</span><span class="mi">16</span><span class="p">))</s [...]
+<span class="w">            </span><span class="p">}</span><span class="w"></span>
+<span class="w">          </span><span class="p">}</span><span class="w"></span>
+<span class="w">        </span><span class="p">}</span><span class="w"></span>
+<span class="w">      </span><span class="p">}</span><span class="w"></span>
+<span class="w">      </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="n">i0</span><span class="p">.</span><span class="n">inner</span><span class="o">:</span><span class="w"> </span><span class="n">int32</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="mi">8</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">        </span><span class="n">compute_2</span><span class="p">[</span><span class="n">ramp</span><span class="p">((((</span><span class="n">floordiv</span><span class="p">(</span><span class="n">i0</span><span class="p">.</span><span class="n">outer</span><span class="p">.</span><span class="n">i1</span><span class="p">.</span><span class="n">outer</span><span class="p">.</span><span class="n">fused</span><span class="p">,</span><span class="w"> </span><span class="mi">1 [...]
+<span class="w">      </span><span class="p">}</span><span class="w"></span>
+<span class="w">    </span><span class="p">}</span><span class="w"></span>
+<span class="w">  </span><span class="p">}</span><span class="w"></span>
+<span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/tune_with_autotvm/sg_execution_times.html b/docs/how_to/tune_with_autotvm/sg_execution_times.html
index 97f58852b..1663540a5 100644
--- a/docs/how_to/tune_with_autotvm/sg_execution_times.html
+++ b/docs/how_to/tune_with_autotvm/sg_execution_times.html
@@ -300,13 +300,13 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-tune-with-autotvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:43.877</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
+<p><strong>00:44.901</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
 <ul class="simple">
-<li><p><strong>00:43.046</strong>: <a class="reference internal" href="tune_conv2d_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-conv2d-cuda-py"><span class="std std-ref">Tuning High Performance Convolution on NVIDIA GPUs</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_cuda.py</span></code>)</p></li>
-<li><p><strong>00:00.211</strong>: <a class="reference internal" href="tune_relay_x86.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-x86-py"><span class="std std-ref">Auto-tuning a Convolutional Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_x86.py</span></code>)</p></li>
-<li><p><strong>00:00.208</strong>: <a class="reference internal" href="tune_relay_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-cuda-py"><span class="std std-ref">Auto-tuning a Convolutional Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_cuda.py</span></code>)</p></li>
-<li><p><strong>00:00.207</strong>: <a class="reference internal" href="tune_relay_mobile_gpu.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-mobile-gpu-py"><span class="std std-ref">Auto-tuning a Convolutional Network for Mobile GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_mobile_gpu.py</span></code>)</p></li>
-<li><p><strong>00:00.205</strong>: <a class="reference internal" href="tune_relay_arm.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-arm-py"><span class="std std-ref">Auto-tuning a Convolutional Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_arm.py</span></code>)</p></li>
+<li><p><strong>00:44.047</strong>: <a class="reference internal" href="tune_conv2d_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-conv2d-cuda-py"><span class="std std-ref">Tuning High Performance Convolution on NVIDIA GPUs</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_cuda.py</span></code>)</p></li>
+<li><p><strong>00:00.223</strong>: <a class="reference internal" href="tune_relay_x86.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-x86-py"><span class="std std-ref">Auto-tuning a Convolutional Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_x86.py</span></code>)</p></li>
+<li><p><strong>00:00.215</strong>: <a class="reference internal" href="tune_relay_mobile_gpu.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-mobile-gpu-py"><span class="std std-ref">Auto-tuning a Convolutional Network for Mobile GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_mobile_gpu.py</span></code>)</p></li>
+<li><p><strong>00:00.209</strong>: <a class="reference internal" href="tune_relay_arm.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-arm-py"><span class="std std-ref">Auto-tuning a Convolutional Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_arm.py</span></code>)</p></li>
+<li><p><strong>00:00.207</strong>: <a class="reference internal" href="tune_relay_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-cuda-py"><span class="std std-ref">Auto-tuning a Convolutional Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_cuda.py</span></code>)</p></li>
 </ul>
 </div>
 
diff --git a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
index 881397799..9278a5d68 100644
--- a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
+++ b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
@@ -362,11 +362,11 @@ sudo make cython3
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span><span class="p">,</span> <span class="n">topi</span><span class="p">,</span> <span class="n">testing</span>
-<span class="kn">from</span> <span class="nn">tvm.topi.testing</span> <span class="k">import</span> <span class="n">conv2d_nchw_python</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span><span class="p">,</span> <span class="n">topi</span><span class="p">,</span> <span class="n">testing</span>
+<span class="kn">from</span> <span class="nn">tvm.topi.testing</span> <span class="kn">import</span> <span class="n">conv2d_nchw_python</span>
 <span class="kn">import</span> <span class="nn">tvm.testing</span>
 
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">autotvm</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">autotvm</span>
 </pre></div>
 </div>
 </div>
@@ -1142,8 +1142,8 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 854, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 4, 32]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 1, 128]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2885496
-No: 6   GFLOPS: 96.35/96.35     result: MeasureResult(costs=(0.002402670208333333,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.5966920852661133, timestamp=1650540209.386137)        [(&#39;tile_f&#39;, [-1, 1, 1, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 4, 4]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,3754080
-No: 7   GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+No: 6   GFLOPS: 110.06/110.06   result: MeasureResult(costs=(0.0021034644791666666,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.6358001232147217, timestamp=1650555136.586403)       [(&#39;tile_f&#39;, [-1, 1, 1, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 4, 4]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,3754080
+No: 7   GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 571, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 523, in _build_func_common
@@ -1266,7 +1266,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 854, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 16, 32]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 256, 1]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,6225319
-No: 8   GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+No: 8   GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 571, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 523, in _build_func_common
@@ -1389,7 +1389,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 854, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 1, 32]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 8, 64]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,943546
-No: 9   GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+No: 9   GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 571, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 523, in _build_func_common
@@ -1512,7 +1512,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 854, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 16, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 16, 32]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2868708
-No: 10  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+No: 10  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 142, in build
     res = future.result()
   File &quot;/usr/lib/python3.7/concurrent/futures/_base.py&quot;, line 435, in result
@@ -1530,7 +1530,7 @@ No: 10  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
 TimeoutError
 
         [(&#39;tile_f&#39;, [-1, 32, 2, 4]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 4, 2]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,4691833
-No: 11  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+No: 11  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 571, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 523, in _build_func_common
@@ -1653,7 +1653,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 854, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 2, 64]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 4]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,1042124
-No: 12  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+No: 12  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 571, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 523, in _build_func_common
@@ -1776,7 +1776,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 854, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 32, 1, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 32, 16]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,10013405
-No: 13  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+No: 13  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 571, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 523, in _build_func_common
@@ -1899,7 +1899,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 854, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 8, 8, 2]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 4, 32]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,6732082
-No: 14  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+No: 14  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 571, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 523, in _build_func_common
@@ -2022,7 +2022,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 854, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 4, 32]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 128]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 1)],None,7536735
-No: 15  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+No: 15  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 571, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 523, in _build_func_common
@@ -2145,7 +2145,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 854, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 1, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 128, 4]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,482121
-No: 16  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+No: 16  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 571, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 523, in _build_func_common
@@ -2268,7 +2268,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 854, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 1, 16]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 32, 8]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2824525
-No: 17  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+No: 17  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 571, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 523, in _build_func_common
@@ -2391,7 +2391,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 854, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 64, 1, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 8, 8]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,4559286
-No: 18  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+No: 18  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 571, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 523, in _build_func_common
@@ -2514,7 +2514,7 @@ Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 854, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
 tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 32, 16]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 1, 512]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9677544
-No: 19  GFLOPS: 0.00/96.35      result: Traceback (most recent call last):
+No: 19  GFLOPS: 0.00/110.06     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 721, in __call__
     yield remote, remote.load_module(os.path.split(build_result.filename)[1])
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 685, in run_through_rpc
@@ -2602,7 +2602,7 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
   15: _PyEval_EvalFrameDefault
   14: 0x0000000000537c30
   13: _PyObject_FastCallKeywords
-  12: 0x00007facecca0fa2
+  12: 0x00007fcd9bcc9fa2
   11: _ctypes_callproc
   10: ffi_call
   9: ffi_call_unix64
@@ -2667,7 +2667,7 @@ Traceback (most recent call last):
   21: _PyFunction_FastCallKeywords
   20: _PyEval_EvalFrameDefault
   19: _PyFunction_FastCall      [(&#39;tile_f&#39;, [-1, 8, 2, 16]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 1, 1]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,6390073
-No: 20  GFLOPS: 144.26/144.26   result: MeasureResult(costs=(0.00160480132,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4118306636810303, timestamp=1650540235.138862)       [(&#39;tile_f&#39;, [-1, 1, 4, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 1]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9881539
+No: 20  GFLOPS: 143.01/143.01   result: MeasureResult(costs=(0.0016188322999999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4208097457885742, timestamp=1650555162.9244173)      [(&#39;tile_f&#39;, [-1, 1, 4, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 1]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9881539
 </pre></div>
 </div>
 <p>Finally we can inspect the best config from log file, check correctness,
@@ -2706,7 +2706,7 @@ and measure running time.</p>
 <p class="sphx-glr-script-out">Out:</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Best config:
 [(&#39;tile_f&#39;, [-1, 1, 4, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 1]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9881539
-Time cost of this operator: 0.002011
+Time cost of this operator: 0.002010
 </pre></div>
 </div>
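A minimal sketch of this inspect-and-measure step, assuming the tuning records were written to conv2d.log and that the tutorial's task object, conv2d definition (here called conv2d_no_batchnorm) and shape parameters (N, H, W, CO, CI, KH, KW, strides, padding) are still in scope:

import numpy as np
import tvm
from tvm import autotvm

# Query and print the best configuration found during tuning.
dispatch_context = autotvm.apply_history_best("conv2d.log")
best_config = dispatch_context.query(task.target, task.workload)
print("Best config:", best_config)

# Apply the best schedule, build the kernel, and time it on the GPU.
with autotvm.apply_history_best("conv2d.log"):
    with tvm.target.Target("cuda"):
        s, arg_bufs = conv2d_no_batchnorm(N, H, W, CO, CI, KH, KW, strides, padding)
        func = tvm.build(s, arg_bufs)

dev = tvm.cuda(0)
a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
a_tvm = tvm.nd.array(a_np, device=dev)
w_tvm = tvm.nd.array(w_np, device=dev)
c_tvm = tvm.nd.empty([int(x) for x in arg_bufs[2].shape], device=dev)

evaluator = func.time_evaluator(func.entry_name, dev, number=400)
print("Time cost of this operator: %f" % evaluator(a_tvm, w_tvm, c_tvm).mean)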
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autotvm-tune-conv2d-cuda-py">
diff --git a/docs/how_to/tune_with_autotvm/tune_relay_arm.html b/docs/how_to/tune_with_autotvm/tune_relay_arm.html
index 33eed1d17..b47f157ce 100644
--- a/docs/how_to/tune_with_autotvm/tune_relay_arm.html
+++ b/docs/how_to/tune_with_autotvm/tune_relay_arm.html
@@ -375,10 +375,10 @@ sudo make cython3
 
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">autotvm</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">autotvm</span>
 <span class="kn">import</span> <span class="nn">tvm.relay.testing</span>
-<span class="kn">from</span> <span class="nn">tvm.autotvm.tuner</span> <span class="k">import</span> <a href="../../reference/api/python/autotvm.html#tvm.autotvm.tuner.XGBTuner" title="View documentation for tvm.autotvm.tuner.XGBTuner"><span class="n">XGBTuner</span></a><span class="p">,</span> <a href="../../reference/api/python/autotvm.html#tvm.autotvm.tuner.GATuner" title="View documentation for tvm.autotvm.tuner.GATuner"><span class="n">GATuner</span></a><span class="p">,</span> <a h [...]
-<span class="kn">from</span> <span class="nn">tvm.contrib.utils</span> <span class="k">import</span> <a href="../../reference/api/python/contrib.html#tvm.contrib.utils.tempdir" title="View documentation for tvm.contrib.utils.tempdir"><span class="n">tempdir</span></a>
+<span class="kn">from</span> <span class="nn">tvm.autotvm.tuner</span> <span class="kn">import</span> <a href="../../reference/api/python/autotvm.html#tvm.autotvm.tuner.XGBTuner" title="View documentation for tvm.autotvm.tuner.XGBTuner"><span class="n">XGBTuner</span></a><span class="p">,</span> <a href="../../reference/api/python/autotvm.html#tvm.autotvm.tuner.GATuner" title="View documentation for tvm.autotvm.tuner.GATuner"><span class="n">GATuner</span></a><span class="p">,</span> <a  [...]
+<span class="kn">from</span> <span class="nn">tvm.contrib.utils</span> <span class="kn">import</span> <a href="../../reference/api/python/contrib.html#tvm.contrib.utils.tempdir" title="View documentation for tvm.contrib.utils.tempdir"><span class="n">tempdir</span></a>
 <span class="kn">import</span> <span class="nn">tvm.contrib.graph_executor</span> <span class="k">as</span> <span class="nn">runtime</span>
 </pre></div>
 </div>
@@ -414,7 +414,7 @@ We can also load models from MXNet, ONNX and TensorFlow.</p>
         <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <a href="../../reference/api/python/relay/testing.html#tvm.relay.testing.inception_v3.get_workload" title="View documentation for tvm.relay.testing.inception_v3.get_workload"><span class="n">relay</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">inception_v3</span><span class="o">.</span><span class="n">get_workload</span></a [...]
     <span class="k">elif</span> <span class="n">name</span> <span class="o">==</span> <span class="s2">&quot;mxnet&quot;</span><span class="p">:</span>
         <span class="c1"># an example for mxnet model</span>
-        <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="k">import</span> <span class="n">get_model</span>
+        <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="kn">import</span> <span class="n">get_model</span>
 
         <span class="n">block</span> <span class="o">=</span> <span class="n">get_model</span><span class="p">(</span><span class="s2">&quot;resnet18_v1&quot;</span><span class="p">,</span> <span class="n">pretrained</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
         <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <a href="../../reference/api/python/relay/frontend.html#tvm.relay.frontend.from_mxnet" title="View documentation for tvm.relay.frontend.from_mxnet"><span class="n">relay</span><span class="o">.</span><span class="n">frontend</span><span class="o">.</span><span class="n">from_mxnet</span></a><span class="p">(</span><span class="n">block</span><span class="p">,</span> <span cla [...]
@@ -626,7 +626,7 @@ We will introduce a more sophisticated tuning scheduler in the future.</p>
         <span class="c1"># export library</span>
         <span class="n">tmp</span> <span class="o">=</span> <a href="../../reference/api/python/contrib.html#tvm.contrib.utils.tempdir" title="View documentation for tvm.contrib.utils.tempdir"><span class="n">tempdir</span></a><span class="p">()</span>
         <span class="k">if</span> <span class="n">use_android</span><span class="p">:</span>
-            <span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">ndk</span>
+            <span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">ndk</span>
 
             <span class="n">filename</span> <span class="o">=</span> <span class="s2">&quot;net.so&quot;</span>
             <span class="n">lib</span><span class="o">.</span><span class="n">export_library</span><span class="p">(</span><span class="n">tmp</span><span class="o">.</span><span class="n">relpath</span><span class="p">(</span><span class="n">filename</span><span class="p">),</span> <a href="../../reference/api/python/contrib.html#tvm.contrib.ndk.create_shared" title="View documentation for tvm.contrib.ndk.create_shared"><span class="n">ndk</span><span class="o">.</span><span class="n">c [...]
diff --git a/docs/how_to/tune_with_autotvm/tune_relay_cuda.html b/docs/how_to/tune_with_autotvm/tune_relay_cuda.html
index a2ff4b03d..5ca115e49 100644
--- a/docs/how_to/tune_with_autotvm/tune_relay_cuda.html
+++ b/docs/how_to/tune_with_autotvm/tune_relay_cuda.html
@@ -374,9 +374,9 @@ sudo make cython3
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">autotvm</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">autotvm</span>
 <span class="kn">import</span> <span class="nn">tvm.relay.testing</span>
-<span class="kn">from</span> <span class="nn">tvm.autotvm.tuner</span> <span class="k">import</span> <a href="../../reference/api/python/autotvm.html#tvm.autotvm.tuner.XGBTuner" title="View documentation for tvm.autotvm.tuner.XGBTuner"><span class="n">XGBTuner</span></a><span class="p">,</span> <a href="../../reference/api/python/autotvm.html#tvm.autotvm.tuner.GATuner" title="View documentation for tvm.autotvm.tuner.GATuner"><span class="n">GATuner</span></a><span class="p">,</span> <a h [...]
+<span class="kn">from</span> <span class="nn">tvm.autotvm.tuner</span> <span class="kn">import</span> <a href="../../reference/api/python/autotvm.html#tvm.autotvm.tuner.XGBTuner" title="View documentation for tvm.autotvm.tuner.XGBTuner"><span class="n">XGBTuner</span></a><span class="p">,</span> <a href="../../reference/api/python/autotvm.html#tvm.autotvm.tuner.GATuner" title="View documentation for tvm.autotvm.tuner.GATuner"><span class="n">GATuner</span></a><span class="p">,</span> <a  [...]
 <span class="kn">import</span> <span class="nn">tvm.contrib.graph_executor</span> <span class="k">as</span> <span class="nn">runtime</span>
 </pre></div>
 </div>
@@ -412,7 +412,7 @@ We can also load models from MXNet, ONNX and TensorFlow.</p>
         <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <a href="../../reference/api/python/relay/testing.html#tvm.relay.testing.inception_v3.get_workload" title="View documentation for tvm.relay.testing.inception_v3.get_workload"><span class="n">relay</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">inception_v3</span><span class="o">.</span><span class="n">get_workload</span></a [...]
     <span class="k">elif</span> <span class="n">name</span> <span class="o">==</span> <span class="s2">&quot;mxnet&quot;</span><span class="p">:</span>
         <span class="c1"># an example for mxnet model</span>
-        <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="k">import</span> <span class="n">get_model</span>
+        <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="kn">import</span> <span class="n">get_model</span>
 
         <span class="n">block</span> <span class="o">=</span> <span class="n">get_model</span><span class="p">(</span><span class="s2">&quot;resnet18_v1&quot;</span><span class="p">,</span> <span class="n">pretrained</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
         <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <a href="../../reference/api/python/relay/frontend.html#tvm.relay.frontend.from_mxnet" title="View documentation for tvm.relay.frontend.from_mxnet"><span class="n">relay</span><span class="o">.</span><span class="n">frontend</span><span class="o">.</span><span class="n">from_mxnet</span></a><span class="p">(</span><span class="n">block</span><span class="p">,</span> <span cla [...]
diff --git a/docs/how_to/tune_with_autotvm/tune_relay_mobile_gpu.html b/docs/how_to/tune_with_autotvm/tune_relay_mobile_gpu.html
index b4218b8c7..16bbc34c3 100644
--- a/docs/how_to/tune_with_autotvm/tune_relay_mobile_gpu.html
+++ b/docs/how_to/tune_with_autotvm/tune_relay_mobile_gpu.html
@@ -376,10 +376,10 @@ sudo make cython3
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">autotvm</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">autotvm</span>
 <span class="kn">import</span> <span class="nn">tvm.relay.testing</span>
-<span class="kn">from</span> <span class="nn">tvm.autotvm.tuner</span> <span class="k">import</span> <a href="../../reference/api/python/autotvm.html#tvm.autotvm.tuner.XGBTuner" title="View documentation for tvm.autotvm.tuner.XGBTuner"><span class="n">XGBTuner</span></a><span class="p">,</span> <a href="../../reference/api/python/autotvm.html#tvm.autotvm.tuner.GATuner" title="View documentation for tvm.autotvm.tuner.GATuner"><span class="n">GATuner</span></a><span class="p">,</span> <a h [...]
-<span class="kn">from</span> <span class="nn">tvm.contrib.utils</span> <span class="k">import</span> <a href="../../reference/api/python/contrib.html#tvm.contrib.utils.tempdir" title="View documentation for tvm.contrib.utils.tempdir"><span class="n">tempdir</span></a>
+<span class="kn">from</span> <span class="nn">tvm.autotvm.tuner</span> <span class="kn">import</span> <a href="../../reference/api/python/autotvm.html#tvm.autotvm.tuner.XGBTuner" title="View documentation for tvm.autotvm.tuner.XGBTuner"><span class="n">XGBTuner</span></a><span class="p">,</span> <a href="../../reference/api/python/autotvm.html#tvm.autotvm.tuner.GATuner" title="View documentation for tvm.autotvm.tuner.GATuner"><span class="n">GATuner</span></a><span class="p">,</span> <a  [...]
+<span class="kn">from</span> <span class="nn">tvm.contrib.utils</span> <span class="kn">import</span> <a href="../../reference/api/python/contrib.html#tvm.contrib.utils.tempdir" title="View documentation for tvm.contrib.utils.tempdir"><span class="n">tempdir</span></a>
 <span class="kn">import</span> <span class="nn">tvm.contrib.graph_executor</span> <span class="k">as</span> <span class="nn">runtime</span>
 </pre></div>
 </div>
@@ -415,7 +415,7 @@ We can also load models from MXNet, ONNX and TensorFlow.</p>
         <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <a href="../../reference/api/python/relay/testing.html#tvm.relay.testing.inception_v3.get_workload" title="View documentation for tvm.relay.testing.inception_v3.get_workload"><span class="n">relay</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">inception_v3</span><span class="o">.</span><span class="n">get_workload</span></a [...]
     <span class="k">elif</span> <span class="n">name</span> <span class="o">==</span> <span class="s2">&quot;mxnet&quot;</span><span class="p">:</span>
         <span class="c1"># an example for mxnet model</span>
-        <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="k">import</span> <span class="n">get_model</span>
+        <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="kn">import</span> <span class="n">get_model</span>
 
         <span class="n">block</span> <span class="o">=</span> <span class="n">get_model</span><span class="p">(</span><span class="s2">&quot;resnet18_v1&quot;</span><span class="p">,</span> <span class="n">pretrained</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
         <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <a href="../../reference/api/python/relay/frontend.html#tvm.relay.frontend.from_mxnet" title="View documentation for tvm.relay.frontend.from_mxnet"><span class="n">relay</span><span class="o">.</span><span class="n">frontend</span><span class="o">.</span><span class="n">from_mxnet</span></a><span class="p">(</span><span class="n">block</span><span class="p">,</span> <span cla [...]
@@ -618,7 +618,7 @@ We will introduce a more sophisticated tuning scheduler in the future.</p>
         <span class="c1"># export library</span>
         <span class="n">tmp</span> <span class="o">=</span> <a href="../../reference/api/python/contrib.html#tvm.contrib.utils.tempdir" title="View documentation for tvm.contrib.utils.tempdir"><span class="n">tempdir</span></a><span class="p">()</span>
         <span class="k">if</span> <span class="n">use_android</span><span class="p">:</span>
-            <span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">ndk</span>
+            <span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">ndk</span>
 
             <span class="n">filename</span> <span class="o">=</span> <span class="s2">&quot;net.so&quot;</span>
             <span class="n">lib</span><span class="o">.</span><span class="n">export_library</span><span class="p">(</span><span class="n">tmp</span><span class="o">.</span><span class="n">relpath</span><span class="p">(</span><span class="n">filename</span><span class="p">),</span> <a href="../../reference/api/python/contrib.html#tvm.contrib.ndk.create_shared" title="View documentation for tvm.contrib.ndk.create_shared"><span class="n">ndk</span><span class="o">.</span><span class="n">c [...]
diff --git a/docs/how_to/tune_with_autotvm/tune_relay_x86.html b/docs/how_to/tune_with_autotvm/tune_relay_x86.html
index 723723ae7..e380c12d2 100644
--- a/docs/how_to/tune_with_autotvm/tune_relay_x86.html
+++ b/docs/how_to/tune_with_autotvm/tune_relay_x86.html
@@ -346,10 +346,10 @@ get it to run, you will need to wrap the body of this tutorial in a <code class=
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">autotvm</span>
-<span class="kn">from</span> <span class="nn">tvm.relay</span> <span class="k">import</span> <span class="n">testing</span>
-<span class="kn">from</span> <span class="nn">tvm.autotvm.tuner</span> <span class="k">import</span> <a href="../../reference/api/python/autotvm.html#tvm.autotvm.tuner.XGBTuner" title="View documentation for tvm.autotvm.tuner.XGBTuner"><span class="n">XGBTuner</span></a><span class="p">,</span> <a href="../../reference/api/python/autotvm.html#tvm.autotvm.tuner.GATuner" title="View documentation for tvm.autotvm.tuner.GATuner"><span class="n">GATuner</span></a><span class="p">,</span> <a h [...]
-<span class="kn">from</span> <span class="nn">tvm.autotvm.graph_tuner</span> <span class="k">import</span> <span class="n">DPTuner</span><span class="p">,</span> <span class="n">PBQPTuner</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span><span class="p">,</span> <span class="n">autotvm</span>
+<span class="kn">from</span> <span class="nn">tvm.relay</span> <span class="kn">import</span> <span class="n">testing</span>
+<span class="kn">from</span> <span class="nn">tvm.autotvm.tuner</span> <span class="kn">import</span> <a href="../../reference/api/python/autotvm.html#tvm.autotvm.tuner.XGBTuner" title="View documentation for tvm.autotvm.tuner.XGBTuner"><span class="n">XGBTuner</span></a><span class="p">,</span> <a href="../../reference/api/python/autotvm.html#tvm.autotvm.tuner.GATuner" title="View documentation for tvm.autotvm.tuner.GATuner"><span class="n">GATuner</span></a><span class="p">,</span> <a  [...]
+<span class="kn">from</span> <span class="nn">tvm.autotvm.graph_tuner</span> <span class="kn">import</span> <span class="n">DPTuner</span><span class="p">,</span> <span class="n">PBQPTuner</span>
 <span class="kn">import</span> <span class="nn">tvm.contrib.graph_executor</span> <span class="k">as</span> <span class="nn">runtime</span>
 </pre></div>
 </div>
@@ -386,7 +386,7 @@ We can also load models from MXNet, ONNX and TensorFlow.</p>
         <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <a href="../../reference/api/python/relay/testing.html#tvm.relay.testing.inception_v3.get_workload" title="View documentation for tvm.relay.testing.inception_v3.get_workload"><span class="n">relay</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">inception_v3</span><span class="o">.</span><span class="n">get_workload</span></a [...]
     <span class="k">elif</span> <span class="n">name</span> <span class="o">==</span> <span class="s2">&quot;mxnet&quot;</span><span class="p">:</span>
         <span class="c1"># an example for mxnet model</span>
-        <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="k">import</span> <span class="n">get_model</span>
+        <span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="kn">import</span> <span class="n">get_model</span>
 
         <span class="n">block</span> <span class="o">=</span> <span class="n">get_model</span><span class="p">(</span><span class="s2">&quot;resnet18_v1&quot;</span><span class="p">,</span> <span class="n">pretrained</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
         <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <a href="../../reference/api/python/relay/frontend.html#tvm.relay.frontend.from_mxnet" title="View documentation for tvm.relay.frontend.from_mxnet"><span class="n">relay</span><span class="o">.</span><span class="n">frontend</span><span class="o">.</span><span class="n">from_mxnet</span></a><span class="p">(</span><span class="n">block</span><span class="p">,</span> <span cla [...]
@@ -583,7 +583,7 @@ INFO Finished backward pass...
 INFO Finished DPExecutor run.
 INFO Writing optimal schedules to resnet-18_graph_opt.log successfully.
 
-Evaluation of the network compiled in <span class="s1">&#39;default&#39;</span> mode without auto tune:
+Evaluation of the network compiled <span class="k">in</span> <span class="s1">&#39;default&#39;</span> mode without auto tune:
 Compile...
 Evaluate inference <span class="nb">time</span> cost...
 Mean inference <span class="nb">time</span> <span class="o">(</span>std dev<span class="o">)</span>: <span class="m">4</span>.5 ms <span class="o">(</span><span class="m">0</span>.03 ms<span class="o">)</span>
@@ -595,8 +595,8 @@ Mean inference <span class="nb">time</span> <span class="o">(</span>std dev<span
 
 Evaluation of the network been tuned on graph level:
 Compile...
-Config <span class="k">for</span> <span class="nv">target</span><span class="o">=</span>llvm -keys<span class="o">=</span>cpu -link-params<span class="o">=</span><span class="m">0</span>, <span class="nv">workload</span><span class="o">=(</span><span class="s1">&#39;dense_nopack.x86&#39;</span>, <span class="o">(</span><span class="s1">&#39;TENSOR&#39;</span>, <span class="o">(</span><span class="m">1</span>, <span class="m">512</span><span class="o">)</span>, <span class="s1">&#39;float [...]
-Config <span class="k">for</span> <span class="nv">target</span><span class="o">=</span>llvm -keys<span class="o">=</span>cpu -link-params<span class="o">=</span><span class="m">0</span>, <span class="nv">workload</span><span class="o">=(</span><span class="s1">&#39;dense_pack.x86&#39;</span>, <span class="o">(</span><span class="s1">&#39;TENSOR&#39;</span>, <span class="o">(</span><span class="m">1</span>, <span class="m">512</span><span class="o">)</span>, <span class="s1">&#39;float32 [...]
+Config <span class="k">for</span> <span class="nv">target</span><span class="o">=</span>llvm -keys<span class="o">=</span>cpu -link-params<span class="o">=</span><span class="m">0</span>, <span class="nv">workload</span><span class="o">=(</span><span class="s1">&#39;dense_nopack.x86&#39;</span>, <span class="o">(</span><span class="s1">&#39;TENSOR&#39;</span>, <span class="o">(</span><span class="m">1</span>, <span class="m">512</span><span class="o">)</span>, <span class="s1">&#39;float [...]
+Config <span class="k">for</span> <span class="nv">target</span><span class="o">=</span>llvm -keys<span class="o">=</span>cpu -link-params<span class="o">=</span><span class="m">0</span>, <span class="nv">workload</span><span class="o">=(</span><span class="s1">&#39;dense_pack.x86&#39;</span>, <span class="o">(</span><span class="s1">&#39;TENSOR&#39;</span>, <span class="o">(</span><span class="m">1</span>, <span class="m">512</span><span class="o">)</span>, <span class="s1">&#39;float32 [...]
 Evaluate inference <span class="nb">time</span> cost...
 Mean inference <span class="nb">time</span> <span class="o">(</span>std dev<span class="o">)</span>: <span class="m">3</span>.16 ms <span class="o">(</span><span class="m">0</span>.03 ms<span class="o">)</span>
 </pre></div>
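For completeness, a short sketch of how graph-level schedules recorded as above are applied at compile time (assuming mod, params, and target from the tutorial are in scope; the log name is taken from the DPTuner output above):

import tvm
from tvm import autotvm, relay

# Compile with the graph-level optimal schedules written by the DPTuner.
with autotvm.apply_graph_best("resnet-18_graph_opt.log"):
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build_module.build(mod, target=target, params=params)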
diff --git a/docs/how_to/work_with_microtvm/micro_autotune.html b/docs/how_to/work_with_microtvm/micro_autotune.html
index bcd411674..894747b5a 100644
--- a/docs/how_to/work_with_microtvm/micro_autotune.html
+++ b/docs/how_to/work_with_microtvm/micro_autotune.html
@@ -350,7 +350,7 @@
 <span class="kn">import</span> <span class="nn">pathlib</span>
 
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm.relay.backend</span> <span class="k">import</span> <span class="n">Runtime</span>
+<span class="kn">from</span> <span class="nn">tvm.relay.backend</span> <span class="kn">import</span> <span class="n">Runtime</span>
 
 <span class="n">use_physical_hw</span> <span class="o">=</span> <span class="nb">bool</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">getenv</span><span class="p">(</span><span class="s2">&quot;TVM_MICRO_USE_HW&quot;</span><span class="p">))</span>
 </pre></div>
@@ -553,10 +553,10 @@ the tuned operator.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>########## Build without Autotuning ##########
 Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs
 ---------                                     ---                                           --------  -------  -----              ------  -------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  311.4     98.723   (1, 2, 10, 10, 3)  2       1
-tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.128     0.992    (1, 6, 10, 10)     1       1
-tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.901     0.286    (1, 1, 10, 10, 3)  1       1
-Total_time                                    -                                             315.429   -        -                  -       -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  579.8     98.933   (1, 2, 10, 10, 3)  2       1
+tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       4.755     0.811    (1, 6, 10, 10)     1       1
+tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         1.501     0.256    (1, 1, 10, 10, 3)  1       1
+Total_time                                    -                                             586.056   -        -                  -       -
 </pre></div>
 </div>
 </div>
@@ -608,10 +608,10 @@ Total_time                                    -
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>########## Build with Autotuning ##########
 Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs
 ---------                                     ---                                           --------  -------  -----              ------  -------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  89.75     97.082   (1, 6, 10, 10, 1)  2       1
-tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.752     1.895    (1, 6, 10, 10)     1       1
-tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.946     1.024    (1, 1, 10, 10, 3)  1       1
-Total_time                                    -                                             92.448    -        -                  -       -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  211.9     98.68    (1, 1, 10, 10, 6)  2       1
+tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.933     0.9      (1, 6, 10, 10)     1       1
+tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.901     0.42     (1, 3, 10, 10, 1)  1       1
+Total_time                                    -                                             214.734   -        -                  -       -
 </pre></div>
 </div>
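Taking the Total_time rows from the two tables above, the end-to-end improvement from autotuning on this run works out to roughly 2.7x:

# Values copied from the tables above (microseconds).
untuned_total_us = 586.056   # Total_time, build without autotuning
tuned_total_us = 214.734     # Total_time, build with autotuning
print("Speedup from autotuning: %.2fx" % (untuned_total_us / tuned_total_us))  # ~2.73x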
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-autotune-py">
diff --git a/docs/how_to/work_with_microtvm/micro_ethosu.html b/docs/how_to/work_with_microtvm/micro_ethosu.html
index 37e2c9916..bf027f1b4 100644
--- a/docs/how_to/work_with_microtvm/micro_ethosu.html
+++ b/docs/how_to/work_with_microtvm/micro_ethosu.html
@@ -510,25 +510,25 @@ for the output of inference.</p></li>
  <span class="kn">import</span> <span class="nn">re</span>
  <span class="kn">import</span> <span class="nn">sys</span>
  <span class="kn">from</span> <span class="nn">PIL</span> <span class="kn">import</span> <span class="n">Image</span>
- <span class="kn">import</span> <span class="nn">numpy</span> <span class="kn">as</span> <span class="nn">np</span>
+ <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 
 
  <span class="k">def</span> <span class="nf">create_header_file</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">section</span><span class="p">,</span> <span class="n">tensor_name</span><span class="p">,</span> <span class="n">tensor_data</span><span class="p">,</span> <span class="n">output_path</span><span class="p">):</span>
      <span class="sd">&quot;&quot;&quot;</span>
 <span class="sd">     This function generates a header file containing the data from the numpy array provided.</span>
 <span class="sd">     &quot;&quot;&quot;</span>
-     <span class="n">file_path</span> <span class="o">=</span> <span class="n">pathlib</span><span class="o">.</span><span class="n">Path</span><span class="p">(</span><span class="n">f</span><span class="s2">&quot;{output_path}/&quot;</span> <span class="o">+</span> <span class="n">name</span><span class="p">)</span><span class="o">.</span><span class="n">resolve</span><span class="p">()</span>
+     <span class="n">file_path</span> <span class="o">=</span> <span class="n">pathlib</span><span class="o">.</span><span class="n">Path</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">output_path</span><span class="si">}</span><span class="s2">/&quot;</span> <span class="o">+</span> <span class="n">name</span><span class="p">)</span><span class="o">.</span><span class="n">resolve</span><span class="p">()</span>
      <span class="c1"># Create header file with npy_data as a C array</span>
      <span class="n">raw_path</span> <span class="o">=</span> <span class="n">file_path</span><span class="o">.</span><span class="n">with_suffix</span><span class="p">(</span><span class="s2">&quot;.h&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">resolve</span><span class="p">()</span>
      <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">raw_path</span><span class="p">,</span> <span class="s2">&quot;w&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">header_file</span><span class="p">:</span>
          <span class="n">header_file</span><span class="o">.</span><span class="n">write</span><span class="p">(</span>
              <span class="s2">&quot;#include &lt;tvmgen_default.h&gt;</span><span class="se">\n</span><span class="s2">&quot;</span>
-             <span class="o">+</span> <span class="n">f</span><span class="s2">&quot;const size_t {tensor_name}_len = {tensor_data.size};</span><span class="se">\n</span><span class="s2">&quot;</span>
-             <span class="o">+</span> <span class="n">f</span><span class="s1">&#39;uint8_t {tensor_name}[] __attribute__((section(&quot;{section}&quot;), aligned(16))) = &quot;&#39;</span>
+             <span class="o">+</span> <span class="sa">f</span><span class="s2">&quot;const size_t </span><span class="si">{</span><span class="n">tensor_name</span><span class="si">}</span><span class="s2">_len = </span><span class="si">{</span><span class="n">tensor_data</span><span class="o">.</span><span class="n">size</span><span class="si">}</span><span class="s2">;</span><span class="se">\n</span><span class="s2">&quot;</span>
+             <span class="o">+</span> <span class="sa">f</span><span class="s1">&#39;uint8_t </span><span class="si">{</span><span class="n">tensor_name</span><span class="si">}</span><span class="s1">[] __attribute__((section(&quot;</span><span class="si">{</span><span class="n">section</span><span class="si">}</span><span class="s1">&quot;), aligned(16))) = &quot;&#39;</span>
          <span class="p">)</span>
          <span class="n">data_hexstr</span> <span class="o">=</span> <span class="n">tensor_data</span><span class="o">.</span><span class="n">tobytes</span><span class="p">()</span><span class="o">.</span><span class="n">hex</span><span class="p">()</span>
          <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">data_hexstr</span><span class="p">),</span> <span class="mi">2</span><span class="p">):</span>
-             <span class="n">header_file</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">f</span><span class="s2">&quot;</span><span class="se">\\</span><span class="s2">x{data_hexstr[i:i+2]}&quot;</span><span class="p">)</span>
+             <span class="n">header_file</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="se">\\</span><span class="s2">x</span><span class="si">{</span><span class="n">data_hexstr</span><span class="p">[</span><span class="n">i</span><span class="p">:</span><span class="n">i</span><span class="o">+</span><span class="mi">2</span><span class="p">]</span><span class="si">}</span><span clas [...]
          <span class="n">header_file</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s1">&#39;&quot;;</span><span class="se">\n\n</span><span class="s1">&#39;</span><span class="p">)</span>
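As a purely hypothetical usage of the create_header_file helper shown above, one could emit a small uint8 tensor as a C array (the section name, tensor name, and output directory below are illustrative, not taken from the tutorial):

import numpy as np

# Illustrative only: writes ./include/inputs.h containing a 1x4 uint8 array
# named "input" placed in a made-up linker section "input_data_sec".
sample_input = np.zeros((1, 4), dtype=np.uint8)
create_header_file("inputs", "input_data_sec", "input", sample_input, "./include")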
 
 
@@ -536,7 +536,7 @@ for the output of inference.</p></li>
      <span class="sd">&quot;&quot;&quot;</span>
 <span class="sd">     This function generates C header files for the input and output arrays required to run inferences</span>
 <span class="sd">     &quot;&quot;&quot;</span>
-     <span class="n">img_path</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="s2">&quot;./&quot;</span><span class="p">,</span> <span class="n">f</span><span class="s2">&quot;{image_name}&quot;</span><span class="p">)</span>
+     <span class="n">img_path</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="s2">&quot;./&quot;</span><span class="p">,</span> <span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">image_name</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
 
      <span class="c1"># Resize image to 224x224</span>
      <span class="n">resized_image</span> <span class="o">=</span> <span class="n">Image</span><span class="o">.</span><span class="n">open</span><span class="p">(</span><span class="n">img_path</span><span class="p">)</span><span class="o">.</span><span class="n">resize</span><span class="p">((</span><span class="mi">224</span><span class="p">,</span> <span class="mi">224</span><span class="p">))</span>
@@ -588,16 +588,16 @@ our image has been classified as.</p>
 <span class="sd">     This function generates a header file containing the ImageNet labels as an array of strings</span>
 <span class="sd">     &quot;&quot;&quot;</span>
      <span class="n">labels_path</span> <span class="o">=</span> <span class="n">pathlib</span><span class="o">.</span><span class="n">Path</span><span class="p">(</span><span class="n">labels_file</span><span class="p">)</span><span class="o">.</span><span class="n">resolve</span><span class="p">()</span>
-     <span class="n">file_path</span> <span class="o">=</span> <span class="n">pathlib</span><span class="o">.</span><span class="n">Path</span><span class="p">(</span><span class="n">f</span><span class="s2">&quot;{output_path}/labels.h&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">resolve</span><span class="p">()</span>
+     <span class="n">file_path</span> <span class="o">=</span> <span class="n">pathlib</span><span class="o">.</span><span class="n">Path</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">output_path</span><span class="si">}</span><span class="s2">/labels.h&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">resolve</span><span class="p">()</span>
 
      <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">labels_path</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
          <span class="n">labels</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">readlines</span><span class="p">()</span>
 
      <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">file_path</span><span class="p">,</span> <span class="s2">&quot;w&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">header_file</span><span class="p">:</span>
-         <span class="n">header_file</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">f</span><span class="s1">&#39;char* labels[] __attribute__((section(&quot;{section}&quot;), aligned(16))) = {{&#39;</span><span class="p">)</span>
+         <span class="n">header_file</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="sa">f</span><span class="s1">&#39;char* labels[] __attribute__((section(&quot;</span><span class="si">{</span><span class="n">section</span><span class="si">}</span><span class="s1">&quot;), aligned(16))) = </span><span class="se">{{</span><span class="s1">&#39;</span><span class="p">)</span>
 
          <span class="k">for</span> <span class="n">_</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">labels</span><span class="p">):</span>
-             <span class="n">header_file</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">f</span><span class="s1">&#39;&quot;{label.rstrip()}&quot;,&#39;</span><span class="p">)</span>
+             <span class="n">header_file</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="sa">f</span><span class="s1">&#39;&quot;</span><span class="si">{</span><span class="n">label</span><span class="o">.</span><span class="n">rstrip</span><span class="p">()</span><span class="si">}</span><span class="s1">&quot;,&#39;</span><span class="p">)</span>
 
          <span class="n">header_file</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s2">&quot;};</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">)</span>
 
@@ -623,58 +623,58 @@ classified as a “tabby” and the result should be displayed on the console.
 This file should be placed in <code class="docutils literal notranslate"><span class="pre">./src</span></code></p>
 <div class="literal-block-wrapper docutils container" id="demo-c">
 <div class="code-block-caption"><span class="caption-text">demo.c</span><a class="headerlink" href="#demo-c" title="Permalink to this code">¶</a></div>
-<div class="highlight-c notranslate"><div class="highlight"><pre><span></span> <span class="cp">#include</span> <span class="cpf">&lt;stdio.h&gt;</span><span class="cp"></span>
- <span class="cp">#include</span> <span class="cpf">&lt;tvm_runtime.h&gt;</span><span class="cp"></span>
-
- <span class="cp">#include</span> <span class="cpf">&quot;ethosu_mod.h&quot;</span><span class="cp"></span>
- <span class="cp">#include</span> <span class="cpf">&quot;uart.h&quot;</span><span class="cp"></span>
-
- <span class="c1">// Header files generated by convert_image.py and convert_labels.py</span>
- <span class="cp">#include</span> <span class="cpf">&quot;inputs.h&quot;</span><span class="cp"></span>
- <span class="cp">#include</span> <span class="cpf">&quot;labels.h&quot;</span><span class="cp"></span>
- <span class="cp">#include</span> <span class="cpf">&quot;outputs.h&quot;</span><span class="cp"></span>
-
- <span class="kt">int</span> <span class="nf">abs</span><span class="p">(</span><span class="kt">int</span> <span class="n">v</span><span class="p">)</span> <span class="p">{</span> <span class="k">return</span> <span class="n">v</span> <span class="o">*</span> <span class="p">((</span><span class="n">v</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-</span> <span class="p">(</span><span class="n">v</span> <span class="o">&lt;</span>  [...]
-
- <span class="kt">int</span> <span class="nf">main</span><span class="p">(</span><span class="kt">int</span> <span class="n">argc</span><span class="p">,</span> <span class="kt">char</span><span class="o">**</span> <span class="n">argv</span><span class="p">)</span> <span class="p">{</span>
-   <span class="n">uart_init</span><span class="p">();</span>
-   <span class="n">printf</span><span class="p">(</span><span class="s">&quot;Starting Demo</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">);</span>
-   <span class="n">EthosuInit</span><span class="p">();</span>
-
-   <span class="n">printf</span><span class="p">(</span><span class="s">&quot;Allocating memory</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">);</span>
-   <span class="n">StackMemoryManager_Init</span><span class="p">(</span><span class="o">&amp;</span><span class="n">app_workspace</span><span class="p">,</span> <span class="n">g_aot_memory</span><span class="p">,</span> <span class="n">WORKSPACE_SIZE</span><span class="p">);</span>
-
-   <span class="n">printf</span><span class="p">(</span><span class="s">&quot;Running inference</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">);</span>
-   <span class="k">struct</span> <span class="n">tvmgen_default_outputs</span> <span class="n">outputs</span> <span class="o">=</span> <span class="p">{</span>
-       <span class="p">.</span><span class="n">output</span> <span class="o">=</span> <span class="n">output</span><span class="p">,</span>
-   <span class="p">};</span>
-   <span class="k">struct</span> <span class="n">tvmgen_default_inputs</span> <span class="n">inputs</span> <span class="o">=</span> <span class="p">{</span>
-       <span class="p">.</span><span class="n">input</span> <span class="o">=</span> <span class="n">input</span><span class="p">,</span>
-   <span class="p">};</span>
-   <span class="k">struct</span> <span class="n">ethosu_driver</span><span class="o">*</span> <span class="n">driver</span> <span class="o">=</span> <span class="n">ethosu_reserve_driver</span><span class="p">();</span>
-   <span class="k">struct</span> <span class="n">tvmgen_default_devices</span> <span class="n">devices</span> <span class="o">=</span> <span class="p">{</span>
-       <span class="p">.</span><span class="n">ethos_u</span> <span class="o">=</span> <span class="n">driver</span><span class="p">,</span>
-   <span class="p">};</span>
-   <span class="n">tvmgen_default_run</span><span class="p">(</span><span class="o">&amp;</span><span class="n">inputs</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">outputs</span><span class="p">,</span> <span class="o">&amp;</span><span class="n">devices</span><span class="p">);</span>
-   <span class="n">ethosu_release_driver</span><span class="p">(</span><span class="n">driver</span><span class="p">);</span>
-
-   <span class="c1">// Calculate index of max value</span>
-   <span class="kt">uint8_t</span> <span class="n">max_value</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
-   <span class="kt">int32_t</span> <span class="n">max_index</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span><span class="p">;</span>
-   <span class="k">for</span> <span class="p">(</span><span class="kt">unsigned</span> <span class="kt">int</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="n">output_len</span><span class="p">;</span> <span class="o">++</span><span class="n">i</span><span class="p">)</span> <span class="p">{</span>
-     <span class="k">if</span> <span class="p">(</span><span class="n">output</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">&gt;</span> <span class="n">max_value</span><span class="p">)</span> <span class="p">{</span>
-       <span class="n">max_value</span> <span class="o">=</span> <span class="n">output</span><span class="p">[</span><span class="n">i</span><span class="p">];</span>
-       <span class="n">max_index</span> <span class="o">=</span> <span class="n">i</span><span class="p">;</span>
-     <span class="p">}</span>
-   <span class="p">}</span>
-   <span class="n">printf</span><span class="p">(</span><span class="s">&quot;The image has been classified as &#39;%s&#39;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">,</span> <span class="n">labels</span><span class="p">[</span><span class="n">max_index</span><span class="p">]);</span>
-
-   <span class="c1">// The FVP will shut down when it receives &quot;EXITTHESIM&quot; on the UART</span>
-   <span class="n">printf</span><span class="p">(</span><span class="s">&quot;EXITTHESIM</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">);</span>
-   <span class="k">while</span> <span class="p">(</span><span class="mi">1</span> <span class="o">==</span> <span class="mi">1</span><span class="p">)</span>
-     <span class="p">;</span>
-   <span class="k">return</span> <span class="mi">0</span><span class="p">;</span>
- <span class="p">}</span>
+<div class="highlight-c notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;stdio.h&gt;</span><span class="cp"></span>
+<span class="w"> </span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;tvm_runtime.h&gt;</span><span class="cp"></span>
+
+<span class="w"> </span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&quot;ethosu_mod.h&quot;</span><span class="cp"></span>
+<span class="w"> </span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&quot;uart.h&quot;</span><span class="cp"></span>
+
+<span class="w"> </span><span class="c1">// Header files generated by convert_image.py and convert_labels.py</span>
+<span class="w"> </span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&quot;inputs.h&quot;</span><span class="cp"></span>
+<span class="w"> </span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&quot;labels.h&quot;</span><span class="cp"></span>
+<span class="w"> </span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&quot;outputs.h&quot;</span><span class="cp"></span>
+
+<span class="w"> </span><span class="kt">int</span><span class="w"> </span><span class="nf">abs</span><span class="p">(</span><span class="kt">int</span><span class="w"> </span><span class="n">v</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="n">v</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="p">((</span><span class="n">v</sp [...]
+
+<span class="w"> </span><span class="kt">int</span><span class="w"> </span><span class="nf">main</span><span class="p">(</span><span class="kt">int</span><span class="w"> </span><span class="n">argc</span><span class="p">,</span><span class="w"> </span><span class="kt">char</span><span class="o">**</span><span class="w"> </span><span class="n">argv</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">   </span><span class="n">uart_init</span><span class="p">();</span><span class="w"></span>
+<span class="w">   </span><span class="n">printf</span><span class="p">(</span><span class="s">&quot;Starting Demo</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">);</span><span class="w"></span>
+<span class="w">   </span><span class="n">EthosuInit</span><span class="p">();</span><span class="w"></span>
+
+<span class="w">   </span><span class="n">printf</span><span class="p">(</span><span class="s">&quot;Allocating memory</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">);</span><span class="w"></span>
+<span class="w">   </span><span class="n">StackMemoryManager_Init</span><span class="p">(</span><span class="o">&amp;</span><span class="n">app_workspace</span><span class="p">,</span><span class="w"> </span><span class="n">g_aot_memory</span><span class="p">,</span><span class="w"> </span><span class="n">WORKSPACE_SIZE</span><span class="p">);</span><span class="w"></span>
+
+<span class="w">   </span><span class="n">printf</span><span class="p">(</span><span class="s">&quot;Running inference</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">);</span><span class="w"></span>
+<span class="w">   </span><span class="k">struct</span><span class="w"> </span><span class="nc">tvmgen_default_outputs</span><span class="w"> </span><span class="n">outputs</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">       </span><span class="p">.</span><span class="n">output</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">output</span><span class="p">,</span><span class="w"></span>
+<span class="w">   </span><span class="p">};</span><span class="w"></span>
+<span class="w">   </span><span class="k">struct</span><span class="w"> </span><span class="nc">tvmgen_default_inputs</span><span class="w"> </span><span class="n">inputs</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">       </span><span class="p">.</span><span class="n">input</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">input</span><span class="p">,</span><span class="w"></span>
+<span class="w">   </span><span class="p">};</span><span class="w"></span>
+<span class="w">   </span><span class="k">struct</span><span class="w"> </span><span class="nc">ethosu_driver</span><span class="o">*</span><span class="w"> </span><span class="n">driver</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ethosu_reserve_driver</span><span class="p">();</span><span class="w"></span>
+<span class="w">   </span><span class="k">struct</span><span class="w"> </span><span class="nc">tvmgen_default_devices</span><span class="w"> </span><span class="n">devices</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">       </span><span class="p">.</span><span class="n">ethos_u</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">driver</span><span class="p">,</span><span class="w"></span>
+<span class="w">   </span><span class="p">};</span><span class="w"></span>
+<span class="w">   </span><span class="n">tvmgen_default_run</span><span class="p">(</span><span class="o">&amp;</span><span class="n">inputs</span><span class="p">,</span><span class="w"> </span><span class="o">&amp;</span><span class="n">outputs</span><span class="p">,</span><span class="w"> </span><span class="o">&amp;</span><span class="n">devices</span><span class="p">);</span><span class="w"></span>
+<span class="w">   </span><span class="n">ethosu_release_driver</span><span class="p">(</span><span class="n">driver</span><span class="p">);</span><span class="w"></span>
+
+<span class="w">   </span><span class="c1">// Calculate index of max value</span>
+<span class="w">   </span><span class="kt">uint8_t</span><span class="w"> </span><span class="n">max_value</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"></span>
+<span class="w">   </span><span class="kt">int32_t</span><span class="w"> </span><span class="n">max_index</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">-1</span><span class="p">;</span><span class="w"></span>
+<span class="w">   </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="kt">unsigned</span><span class="w"> </span><span class="kt">int</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">&lt;</span><span class="w"> </span><span class="n">o [...]
+<span class="w">     </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">output</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="w"> </span><span class="o">&gt;</span><span class="w"> </span><span class="n">max_value</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"></span>
+<span class="w">       </span><span class="n">max_value</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">output</span><span class="p">[</span><span class="n">i</span><span class="p">];</span><span class="w"></span>
+<span class="w">       </span><span class="n">max_index</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">i</span><span class="p">;</span><span class="w"></span>
+<span class="w">     </span><span class="p">}</span><span class="w"></span>
+<span class="w">   </span><span class="p">}</span><span class="w"></span>
+<span class="w">   </span><span class="n">printf</span><span class="p">(</span><span class="s">&quot;The image has been classified as &#39;%s&#39;</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">,</span><span class="w"> </span><span class="n">labels</span><span class="p">[</span><span class="n">max_index</span><span class="p">]);</span><span class="w"></span>
+
+<span class="w">   </span><span class="c1">// The FVP will shut down when it receives &quot;EXITTHESIM&quot; on the UART</span>
+<span class="w">   </span><span class="n">printf</span><span class="p">(</span><span class="s">&quot;EXITTHESIM</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">);</span><span class="w"></span>
+<span class="w">   </span><span class="k">while</span><span class="w"> </span><span class="p">(</span><span class="mi">1</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="mi">1</span><span class="p">)</span><span class="w"></span>
+<span class="w">     </span><span class="p">;</span><span class="w"></span>
+<span class="w">   </span><span class="k">return</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"></span>
+<span class="w"> </span><span class="p">}</span><span class="w"></span>
 </pre></div>
 </div>
 </div>
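The demo's "inputs.h" and "labels.h" headers are produced offline by convert_image.py and convert_labels.py, as the comment in the C source notes. As a rough illustration of what such a generator might look like (the file name, input resolution, and uint8 passthrough are assumptions for the sketch, not the tutorial's exact script):

# Hypothetical sketch of a convert_image.py-style generator: it serializes an
# image into a C header defining the `input` buffer that main() reads above.
# Preprocessing (size, layout, quantization) must match the compiled model.
import pathlib
import numpy as np
from PIL import Image

def create_header_file(name, tensor_name, data, out_dir):
    # Emit `uint8_t <tensor_name>[] = {...};` plus a length constant.
    raw = data.tobytes()
    path = pathlib.Path(out_dir) / f"{name}.h"
    with open(path, "w") as f:
        f.write(f"const size_t {tensor_name}_len = {len(raw)};\n")
        f.write(f"uint8_t {tensor_name}[] = {{")
        f.write(",".join(str(b) for b in raw))
        f.write("};\n")

img = Image.open("kitten.jpg").resize((224, 224))  # input resolution assumed
create_header_file("inputs", "input", np.asarray(img).astype(np.uint8), ".")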
diff --git a/docs/how_to/work_with_microtvm/micro_tflite.html b/docs/how_to/work_with_microtvm/micro_tflite.html
index b4b024144..6ad26afd0 100644
--- a/docs/how_to/work_with_microtvm/micro_tflite.html
+++ b/docs/how_to/work_with_microtvm/micro_tflite.html
@@ -435,9 +435,9 @@ directory into a buffer</p>
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
 
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span>
 <span class="kn">import</span> <span class="nn">tvm.contrib.utils</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
 
 <span class="n">use_physical_hw</span> <span class="o">=</span> <span class="nb">bool</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">getenv</span><span class="p">(</span><span class="s2">&quot;TVM_MICRO_USE_HW&quot;</span><span class="p">))</span>
 <span class="n">model_url</span> <span class="o">=</span> <span class="s2">&quot;https://people.linaro.org/~tom.gall/sine_model.tflite&quot;</span>
@@ -543,7 +543,7 @@ QEMU VM based on BOARD. In the example below the x86 arch is selected and a x86
 <span class="n">first_few_lines</span> <span class="o">=</span> <span class="n">c_source_code</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">&quot;</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">)[:</span><span class="mi">10</span><span class="p">]</span>
 <span class="k">assert</span> <span class="nb">any</span><span class="p">(</span>
     <span class="n">l</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">&quot;TVM_DLL int32_t tvmgen_default_&quot;</span><span class="p">)</span> <span class="k">for</span> <span class="n">l</span> <span class="ow">in</span> <span class="n">first_few_lines</span>
-<span class="p">),</span> <span class="n">f</span><span class="s2">&quot;tutorial is broken: </span><span class="si">{first_few_lines!r}</span><span class="s2">&quot;</span>
+<span class="p">),</span> <span class="sa">f</span><span class="s2">&quot;tutorial is broken: </span><span class="si">{</span><span class="n">first_few_lines</span><span class="si">!r}</span><span class="s2">&quot;</span>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;</span><span class="se">\n</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">first_few_lines</span><span class="p">))</span>
 
 
@@ -562,7 +562,7 @@ QEMU VM based on BOARD. In the example below the x86 arch is selected and a x86
 <a href="../../reference/api/python/micro.html#tvm.micro.export_model_library_format" title="View documentation for tvm.micro.export_model_library_format"><span class="n">tvm</span><span class="o">.</span><span class="n">micro</span><span class="o">.</span><span class="n">export_model_library_format</span></a><span class="p">(</span><span class="n">module</span><span class="p">,</span> <span class="n">model_library_format_tar_path</span><span class="p">)</span>
 
 <span class="k">with</span> <span class="n">tarfile</span><span class="o">.</span><span class="n">open</span><span class="p">(</span><span class="n">model_library_format_tar_path</span><span class="p">,</span> <span class="s2">&quot;r:*&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">tar_f</span><span class="p">:</span>
-    <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;</span><span class="se">\n</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">f</span><span class="s2">&quot; - </span><span class="si">{m.name}</span><span class="s2">&quot;</span> <span class="k">for</span> <span class="n">m</span> <span class="ow">in</span> <span class="n">tar_f</span><span class="o">.</span><span class="n">getmemb [...]
+    <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;</span><span class="se">\n</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot; - </span><span class="si">{</span><span class="n">m</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s2">&quot;</span> <span class="k">for</span> <span class="n">m</span> <span cl [...]
 
 <span class="c1"># Cleanup for tutorial:</span>
 <span class="n">os</span><span class="o">.</span><span class="n">unlink</span><span class="p">(</span><span class="n">model_library_format_tar_path</span><span class="p">)</span>
diff --git a/docs/how_to/work_with_microtvm/sg_execution_times.html b/docs/how_to/work_with_microtvm/sg_execution_times.html
index a4d5513f9..478247272 100644
--- a/docs/how_to/work_with_microtvm/sg_execution_times.html
+++ b/docs/how_to/work_with_microtvm/sg_execution_times.html
@@ -300,13 +300,13 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-work-with-microtvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:44.271</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
+<p><strong>00:43.890</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
 <ul class="simple">
-<li><p><strong>00:40.200</strong>: <a class="reference internal" href="micro_autotune.html#sphx-glr-how-to-work-with-microtvm-micro-autotune-py"><span class="std std-ref">Autotuning with microTVM</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_autotune.py</span></code>)</p></li>
-<li><p><strong>00:03.503</strong>: <a class="reference internal" href="micro_tflite.html#sphx-glr-how-to-work-with-microtvm-micro-tflite-py"><span class="std std-ref">microTVM with TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_tflite.py</span></code>)</p></li>
-<li><p><strong>00:00.192</strong>: <a class="reference internal" href="micro_tvmc.html#sphx-glr-how-to-work-with-microtvm-micro-tvmc-py"><span class="std std-ref">Executing a Tiny Model with TVMC Micro</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_tvmc.py</span></code>)</p></li>
-<li><p><strong>00:00.188</strong>: <a class="reference internal" href="micro_ethosu.html#sphx-glr-how-to-work-with-microtvm-micro-ethosu-py"><span class="std std-ref">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_ethosu.py</span></code>)</p></li>
-<li><p><strong>00:00.187</strong>: <a class="reference internal" href="micro_reference_vm.html#sphx-glr-how-to-work-with-microtvm-micro-reference-vm-py"><span class="std std-ref">microTVM Reference Virtual Machines</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_reference_vm.py</span></code>)</p></li>
+<li><p><strong>00:39.796</strong>: <a class="reference internal" href="micro_autotune.html#sphx-glr-how-to-work-with-microtvm-micro-autotune-py"><span class="std std-ref">Autotuning with microTVM</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_autotune.py</span></code>)</p></li>
+<li><p><strong>00:03.507</strong>: <a class="reference internal" href="micro_tflite.html#sphx-glr-how-to-work-with-microtvm-micro-tflite-py"><span class="std std-ref">microTVM with TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_tflite.py</span></code>)</p></li>
+<li><p><strong>00:00.197</strong>: <a class="reference internal" href="micro_ethosu.html#sphx-glr-how-to-work-with-microtvm-micro-ethosu-py"><span class="std std-ref">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_ethosu.py</span></code>)</p></li>
+<li><p><strong>00:00.197</strong>: <a class="reference internal" href="micro_reference_vm.html#sphx-glr-how-to-work-with-microtvm-micro-reference-vm-py"><span class="std std-ref">microTVM Reference Virtual Machines</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_reference_vm.py</span></code>)</p></li>
+<li><p><strong>00:00.193</strong>: <a class="reference internal" href="micro_tvmc.html#sphx-glr-how-to-work-with-microtvm-micro-tvmc-py"><span class="std std-ref">Executing a Tiny Model with TVMC Micro</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_tvmc.py</span></code>)</p></li>
 </ul>
 </div>
 
diff --git a/docs/how_to/work_with_relay/build_gcn.html b/docs/how_to/work_with_relay/build_gcn.html
index a6a26032e..89c080532 100644
--- a/docs/how_to/work_with_relay/build_gcn.html
+++ b/docs/how_to/work_with_relay/build_gcn.html
@@ -360,11 +360,11 @@ This part reuses the code from the above example.</p>
 <span class="kn">import</span> <span class="nn">torch.nn.functional</span> <span class="k">as</span> <span class="nn">F</span>
 <span class="kn">import</span> <span class="nn">dgl</span>
 <span class="kn">import</span> <span class="nn">networkx</span> <span class="k">as</span> <span class="nn">nx</span>
-<span class="kn">from</span> <span class="nn">dgl.nn.pytorch</span> <span class="k">import</span> <span class="n">GraphConv</span>
+<span class="kn">from</span> <span class="nn">dgl.nn.pytorch</span> <span class="kn">import</span> <span class="n">GraphConv</span>
 
 
 <span class="k">class</span> <span class="nc">GCN</span><span class="p">(</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span>
-    <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">g</span><span class="p">,</span> <span class="n">n_infeat</span><span class="p">,</span> <span class="n">n_hidden</span><span class="p">,</span> <span class="n">n_classes</span><span class="p">,</span> <span class="n">n_layers</span><span class="p">,</span> <span class="n">activation</span><span class="p">):</span>
+    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">g</span><span class="p">,</span> <span class="n">n_infeat</span><span class="p">,</span> <span class="n">n_hidden</span><span class="p">,</span> <span class="n">n_classes</span><span class="p">,</span> <span class="n">n_layers</span><span class="p">,</span> <span class="n">activation</span><span class="p">):</span>
         <span class="nb">super</span><span class="p">(</span><span class="n">GCN</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
         <span class="bp">self</span><span class="o">.</span><span class="n">g</span> <span class="o">=</span> <span class="n">g</span>
         <span class="bp">self</span><span class="o">.</span><span class="n">layers</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">ModuleList</span><span class="p">()</span>
@@ -392,8 +392,8 @@ This part reuses the code from the above example.</p>
 <div class="section" id="define-the-functions-to-load-dataset-and-evaluate-accuracy">
 <h2>Define the functions to load dataset and evaluate accuracy<a class="headerlink" href="#define-the-functions-to-load-dataset-and-evaluate-accuracy" title="Permalink to this headline">¶</a></h2>
 <p>You may substitute this part with your own dataset; here we load data from DGL.</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">dgl.data</span> <span class="k">import</span> <span class="n">load_data</span>
-<span class="kn">from</span> <span class="nn">collections</span> <span class="k">import</span> <span class="n">namedtuple</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">dgl.data</span> <span class="kn">import</span> <span class="n">load_data</span>
+<span class="kn">from</span> <span class="nn">collections</span> <span class="kn">import</span> <span class="n">namedtuple</span>
 
 
 <span class="k">def</span> <span class="nf">load_dataset</span><span class="p">(</span><span class="n">dataset</span><span class="o">=</span><span class="s2">&quot;cora&quot;</span><span class="p">):</span>
@@ -471,8 +471,8 @@ Done saving data into cached files.
 <div class="section" id="set-up-the-dgl-pytorch-model-and-get-the-golden-results">
 <h2>Set up the DGL-PyTorch model and get the golden results<a class="headerlink" href="#set-up-the-dgl-pytorch-model-and-get-the-golden-results" title="Permalink to this headline">¶</a></h2>
 <p>The weights are trained with <a class="reference external" href="https://github.com/dmlc/dgl/blob/master/examples/pytorch/gcn/train.py">https://github.com/dmlc/dgl/blob/master/examples/pytorch/gcn/train.py</a></p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span>
-<span class="kn">from</span> <span class="nn">dgl</span> <span class="k">import</span> <span class="n">DGLGraph</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">dgl</span> <span class="kn">import</span> <span class="n">DGLGraph</span>
 
 <span class="n">features</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">FloatTensor</span><span class="p">(</span><span class="n">data</span><span class="o">.</span><span class="n">features</span><span class="p">)</span>
 <span class="n">dgl_g</span> <span class="o">=</span> <span class="n">DGLGraph</span><span class="p">(</span><span class="n">g</span><span class="p">)</span>
@@ -531,10 +531,10 @@ this method is temporary and will be updated in next few weeks when we have spar
                             = ((H * W)^t * A^t)^t
                             = ((W^t * H^t) * A^t)^t\]</div>
 </div></blockquote>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">graph_executor</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">graph_executor</span>
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
 
 
 <span class="k">def</span> <span class="nf">GraphConv</span><span class="p">(</span><span class="n">layer_name</span><span class="p">,</span> <span class="n">input_dim</span><span class="p">,</span> <span class="n">output_dim</span><span class="p">,</span> <span class="n">adj</span><span class="p">,</span> <span class="nb">input</span><span class="p">,</span> <span class="n">norm</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">bias</span [...]
diff --git a/docs/how_to/work_with_relay/sg_execution_times.html b/docs/how_to/work_with_relay/sg_execution_times.html
index d61cf38e3..9e261f82f 100644
--- a/docs/how_to/work_with_relay/sg_execution_times.html
+++ b/docs/how_to/work_with_relay/sg_execution_times.html
@@ -300,11 +300,11 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-work-with-relay-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:08.510</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
+<p><strong>00:08.607</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
 <ul class="simple">
-<li><p><strong>00:06.691</strong>: <a class="reference internal" href="using_external_lib.html#sphx-glr-how-to-work-with-relay-using-external-lib-py"><span class="std std-ref">Using External Libraries in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_external_lib.py</span></code>)</p></li>
-<li><p><strong>00:01.615</strong>: <a class="reference internal" href="build_gcn.html#sphx-glr-how-to-work-with-relay-build-gcn-py"><span class="std std-ref">Building a Graph Convolutional Network</span></a> (<code class="docutils literal notranslate"><span class="pre">build_gcn.py</span></code>)</p></li>
-<li><p><strong>00:00.205</strong>: <a class="reference internal" href="using_relay_viz.html#sphx-glr-how-to-work-with-relay-using-relay-viz-py"><span class="std std-ref">Use Relay Visualizer to Visualize Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_relay_viz.py</span></code>)</p></li>
+<li><p><strong>00:06.801</strong>: <a class="reference internal" href="using_external_lib.html#sphx-glr-how-to-work-with-relay-using-external-lib-py"><span class="std std-ref">Using External Libraries in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_external_lib.py</span></code>)</p></li>
+<li><p><strong>00:01.601</strong>: <a class="reference internal" href="build_gcn.html#sphx-glr-how-to-work-with-relay-build-gcn-py"><span class="std std-ref">Building a Graph Convolutional Network</span></a> (<code class="docutils literal notranslate"><span class="pre">build_gcn.py</span></code>)</p></li>
+<li><p><strong>00:00.206</strong>: <a class="reference internal" href="using_relay_viz.html#sphx-glr-how-to-work-with-relay-using-relay-viz-py"><span class="std std-ref">Use Relay Visualizer to Visualize Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_relay_viz.py</span></code>)</p></li>
 </ul>
 </div>
 
diff --git a/docs/how_to/work_with_relay/using_external_lib.html b/docs/how_to/work_with_relay/using_external_lib.html
index 505ed7dff..d51931fe6 100644
--- a/docs/how_to/work_with_relay/using_external_lib.html
+++ b/docs/how_to/work_with_relay/using_external_lib.html
@@ -346,11 +346,11 @@ For Relay users, all we need to do is just to set a target string appropriately.
 For example, to use cuDNN, the USE_CUDNN option in <cite>cmake/config.cmake</cite> needs to be enabled, and the cuDNN include and library directories need to be specified if necessary.</p>
 <p>To begin with, we import Relay and TVM.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">te</span>
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">graph_executor</span> <span class="k">as</span> <span class="n">runtime</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span>
-<span class="kn">from</span> <span class="nn">tvm.relay</span> <span class="k">import</span> <span class="n">testing</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">graph_executor</span> <span class="k">as</span> <span class="n">runtime</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span>
+<span class="kn">from</span> <span class="nn">tvm.relay</span> <span class="kn">import</span> <span class="n">testing</span>
 <span class="kn">import</span> <span class="nn">tvm.testing</span>
 </pre></div>
 </div>
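Selecting cuDNN then comes down to the "-libs" flag on the target string; a minimal sketch, assuming a TVM build with USE_CUDNN enabled and with `mod`/`params` coming from the tutorial's network definition:

# Sketch: asking Relay to offload supported ops (e.g. conv2d) to cuDNN by
# appending -libs=cudnn to the CUDA target string.
import tvm
from tvm import relay

target = tvm.target.Target("cuda -libs=cudnn")
# with tvm.transform.PassContext(opt_level=3):
#     lib = relay.build(mod, target=target, params=params)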
diff --git a/docs/how_to/work_with_relay/using_relay_viz.html b/docs/how_to/work_with_relay/using_relay_viz.html
index ba4527287..0d86ac825 100644
--- a/docs/how_to/work_with_relay/using_relay_viz.html
+++ b/docs/how_to/work_with_relay/using_relay_viz.html
@@ -348,21 +348,21 @@ A default parser is provided. Users can implement their own renderers to render
 It is a lightweight, AST-like visualizer, inspired by <a class="reference external" href="https://clang.llvm.org/docs/IntroductionToTheClangAST.html">clang ast-dump</a>.
 We will introduce how to implement customized parsers and renderers through interface classes.</p>
 <p>For more details, please refer to <a class="reference internal" href="../../reference/api/python/contrib.html#module-tvm.contrib.relay_viz" title="tvm.contrib.relay_viz"><code class="xref py py-mod docutils literal notranslate"><span class="pre">tvm.contrib.relay_viz</span></code></a>.</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">typing</span> <span class="k">import</span> <span class="p">(</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span>
     <span class="n">Dict</span><span class="p">,</span>
     <span class="n">Union</span><span class="p">,</span>
     <span class="n">Tuple</span><span class="p">,</span>
     <span class="n">List</span><span class="p">,</span>
 <span class="p">)</span>
 <span class="kn">import</span> <span class="nn">tvm</span>
-<span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="k">import</span> <span class="n">relay_viz</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib.relay_viz.interface</span> <span class="k">import</span> <span class="p">(</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">relay_viz</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.relay_viz.interface</span> <span class="kn">import</span> <span class="p">(</span>
     <a href="../../reference/api/python/contrib.html#tvm.contrib.relay_viz.interface.VizEdge" title="View documentation for tvm.contrib.relay_viz.interface.VizEdge"><span class="n">VizEdge</span></a><span class="p">,</span>
     <a href="../../reference/api/python/contrib.html#tvm.contrib.relay_viz.interface.VizNode" title="View documentation for tvm.contrib.relay_viz.interface.VizNode"><span class="n">VizNode</span></a><span class="p">,</span>
     <span class="n">VizParser</span><span class="p">,</span>
 <span class="p">)</span>
-<span class="kn">from</span> <span class="nn">tvm.contrib.relay_viz.terminal</span> <span class="k">import</span> <span class="p">(</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.relay_viz.terminal</span> <span class="kn">import</span> <span class="p">(</span>
     <a href="../../reference/api/python/contrib.html#tvm.contrib.relay_viz.terminal.TermGraph" title="View documentation for tvm.contrib.relay_viz.terminal.TermGraph"><span class="n">TermGraph</span></a><span class="p">,</span>
     <span class="n">TermPlotter</span><span class="p">,</span>
     <span class="n">TermVizParser</span><span class="p">,</span>
@@ -423,7 +423,7 @@ It is possible to provide customized parsers as long as it obeys the interface.
 Here we demonstrate how to customize parsers for <code class="docutils literal notranslate"><span class="pre">relay.var</span></code>.
 We need to implement the abstract interface <a class="reference internal" href="../../reference/api/python/contrib.html#tvm.contrib.relay_viz.interface.VizParser" title="tvm.contrib.relay_viz.interface.VizParser"><code class="xref py py-class docutils literal notranslate"><span class="pre">tvm.contrib.relay_viz.interface.VizParser</span></code></a>.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">class</span> <span class="nc">YourAwesomeParser</span><span class="p">(</span><span class="n">VizParser</span><span class="p">):</span>
-    <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
         <span class="bp">self</span><span class="o">.</span><span class="n">_delegate</span> <span class="o">=</span> <span class="n">TermVizParser</span><span class="p">()</span>
 
     <span class="k">def</span> <span class="nf">get_node_edges</span><span class="p">(</span>
@@ -434,7 +434,7 @@ We need to implement abstract interface <a class="reference internal" href="../.
     <span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><a href="../../reference/api/python/contrib.html#tvm.contrib.relay_viz.interface.VizNode" title="View documentation for tvm.contrib.relay_viz.interface.VizNode"><span class="n">VizNode</span></a><span class="p">,</span> <span class="kc">None</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><a hr [...]
 
         <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="n">relay</span><span class="o">.</span><span class="n">Var</span><span class="p">):</span>
-            <span class="n">node</span> <span class="o">=</span> <a href="../../reference/api/python/contrib.html#tvm.contrib.relay_viz.interface.VizNode" title="View documentation for tvm.contrib.relay_viz.interface.VizNode"><span class="n">VizNode</span></a><span class="p">(</span><span class="n">node_to_id</span><span class="p">[</span><span class="n">node</span><span class="p">],</span> <span class="s2">&quot;AwesomeVar&quot;</span><span class="p">,</span> <span class="n">f</span><sp [...]
+            <span class="n">node</span> <span class="o">=</span> <a href="../../reference/api/python/contrib.html#tvm.contrib.relay_viz.interface.VizNode" title="View documentation for tvm.contrib.relay_viz.interface.VizNode"><span class="n">VizNode</span></a><span class="p">(</span><span class="n">node_to_id</span><span class="p">[</span><span class="n">node</span><span class="p">],</span> <span class="s2">&quot;AwesomeVar&quot;</span><span class="p">,</span> <span class="sa">f</span><s [...]
             <span class="c1"># no edge is introduced. So return an empty list.</span>
             <span class="k">return</span> <span class="n">node</span><span class="p">,</span> <span class="p">[]</span>
 
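Once the parser exists, it is handed to the visualizer alongside a plotter; a hedged usage sketch with the names imported earlier (check the tvm.contrib.relay_viz constructor signature for your TVM version):

# Sketch: rendering with the customized parser defined above. `mod` is the
# relay.IRModule built earlier in the tutorial; TermPlotter and
# YourAwesomeParser come from the preceding code.
viz = relay_viz.RelayVisualizer(
    mod,
    plotter=TermPlotter(),
    parser=YourAwesomeParser(),
)
viz.render()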
@@ -478,7 +478,7 @@ We add a hook duplicating above <code class="docutils literal notranslate"><span
         <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">node</span><span class="p">(</span><span class="n">viz_node</span><span class="p">)</span>
         <span class="c1"># if it&#39;s AwesomeVar, duplicate it.</span>
         <span class="k">if</span> <span class="n">viz_node</span><span class="o">.</span><span class="n">type_name</span> <span class="o">==</span> <span class="s2">&quot;AwesomeVar&quot;</span><span class="p">:</span>
-            <span class="n">duplicated_id</span> <span class="o">=</span> <span class="n">f</span><span class="s2">&quot;duplciated_</span><span class="si">{viz_node.identity}</span><span class="s2">&quot;</span>
... 6942 lines suppressed ...