You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by tq...@apache.org on 2022/11/16 22:35:12 UTC

[tvm-site] branch asf-site updated: deploying docs (apache/tvm@a80cdc26e291abc52bbd70c950023d9e0340464d)

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/tvm-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new 27e9ea61fb deploying docs (apache/tvm@a80cdc26e291abc52bbd70c950023d9e0340464d)
27e9ea61fb is described below

commit 27e9ea61fb9d40d594f743368211e9a49c534880
Author: tvm-bot <95...@users.noreply.github.com>
AuthorDate: Wed Nov 16 22:35:04 2022 +0000

    deploying docs (apache/tvm@a80cdc26e291abc52bbd70c950023d9e0340464d)
---
 docs/_images/sphx_glr_micro_train_001.png          |  Bin 324216 -> 332672 bytes
 docs/_images/sphx_glr_micro_train_thumb.png        |  Bin 23634 -> 24174 bytes
 .../how_to/compile_models/from_darknet.rst.txt     |    2 +-
 .../how_to/compile_models/from_keras.rst.txt       |    2 +-
 .../how_to/compile_models/from_mxnet.rst.txt       |    2 +-
 .../how_to/compile_models/from_oneflow.rst.txt     |    2 +-
 .../how_to/compile_models/from_pytorch.rst.txt     |    2 +-
 .../how_to/compile_models/from_tensorflow.rst.txt  |    2 +-
 .../compile_models/sg_execution_times.rst.txt      |   22 +-
 .../deploy_models/deploy_model_on_android.rst.txt  |    2 +-
 .../deploy_object_detection_pytorch.rst.txt        |    4 +-
 .../deploy_models/deploy_prequantized.rst.txt      |    6 +-
 .../deploy_prequantized_tflite.rst.txt             |    4 +-
 .../how_to/deploy_models/deploy_quantized.rst.txt  |    2 +-
 .../deploy_models/deploy_ssd_gluoncv.rst.txt       |    4 +-
 .../deploy_models/sg_execution_times.rst.txt       |   20 +-
 .../extend_tvm/bring_your_own_datatypes.rst.txt    |    2 +-
 .../how_to/extend_tvm/sg_execution_times.rst.txt   |   10 +-
 .../how_to/extend_tvm/use_pass_instrument.rst.txt  |   16 +-
 .../optimize_operators/opt_conv_cuda.rst.txt       |    2 +-
 .../optimize_operators/opt_conv_tensorcore.rst.txt |    2 +-
 .../how_to/optimize_operators/opt_gemm.rst.txt     |   16 +-
 .../optimize_operators/sg_execution_times.rst.txt  |    8 +-
 .../sg_execution_times.rst.txt                     |   14 +-
 .../tune_conv2d_layer_cuda.rst.txt                 | 1038 ++------------
 .../tune_network_cuda.rst.txt                      |    4 +-
 .../tune_network_x86.rst.txt                       |    4 +-
 .../tune_sparse_x86.rst.txt                        |  130 +-
 .../tune_with_autotvm/sg_execution_times.rst.txt   |    6 +-
 .../tune_with_autotvm/tune_conv2d_cuda.rst.txt     |  335 +++--
 .../work_with_microtvm/micro_autotune.rst.txt      |   16 +-
 .../work_with_microtvm/micro_pytorch.rst.txt       |    4 +-
 .../how_to/work_with_microtvm/micro_train.rst.txt  |   18 +-
 .../work_with_microtvm/sg_execution_times.rst.txt  |   14 +-
 .../work_with_relay/sg_execution_times.rst.txt     |    8 +-
 .../how_to/work_with_schedules/intrin_math.rst.txt |    2 +-
 .../work_with_schedules/sg_execution_times.rst.txt |   14 +-
 .../how_to/work_with_schedules/tensorize.rst.txt   |    2 +-
 .../tutorials/autotvm/sg_execution_times.rst.txt   |    6 +-
 .../frontend/deploy_classification.rst.txt         |    2 +-
 .../tutorials/frontend/deploy_detection.rst.txt    |    2 +-
 .../tutorials/frontend/sg_execution_times.rst.txt  |    6 +-
 .../tutorials/optimize/sg_execution_times.rst.txt  |    6 +-
 .../topic/vta/tutorials/sg_execution_times.rst.txt |    6 +-
 .../tutorial/auto_scheduler_matmul_x86.rst.txt     |    4 +-
 docs/_sources/tutorial/autotvm_matmul_x86.rst.txt  |   20 +-
 docs/_sources/tutorial/autotvm_relay_x86.rst.txt   |   57 +-
 .../tutorial/cross_compilation_and_rpc.rst.txt     |    2 +-
 docs/_sources/tutorial/intro_topi.rst.txt          |    2 +-
 docs/_sources/tutorial/sg_execution_times.rst.txt  |   24 +-
 .../tutorial/tensor_expr_get_started.rst.txt       |   49 +-
 docs/commit_hash                                   |    2 +-
 docs/how_to/compile_models/from_darknet.html       |    2 +-
 docs/how_to/compile_models/from_keras.html         |    2 +-
 docs/how_to/compile_models/from_mxnet.html         |    2 +-
 docs/how_to/compile_models/from_oneflow.html       |   14 +-
 docs/how_to/compile_models/from_pytorch.html       |   11 +-
 docs/how_to/compile_models/from_tensorflow.html    |    2 +-
 docs/how_to/compile_models/sg_execution_times.html |   26 +-
 .../deploy_models/deploy_model_on_android.html     |    2 +-
 .../deploy_object_detection_pytorch.html           |   32 +-
 docs/how_to/deploy_models/deploy_prequantized.html |    8 +-
 .../deploy_models/deploy_prequantized_tflite.html  |    4 +-
 docs/how_to/deploy_models/deploy_quantized.html    |    2 +-
 docs/how_to/deploy_models/deploy_ssd_gluoncv.html  |   40 +-
 docs/how_to/deploy_models/sg_execution_times.html  |   20 +-
 .../extend_tvm/bring_your_own_datatypes.html       |    2 +-
 docs/how_to/extend_tvm/sg_execution_times.html     |   10 +-
 docs/how_to/extend_tvm/use_pass_instrument.html    |   16 +-
 docs/how_to/optimize_operators/opt_conv_cuda.html  |    2 +-
 .../optimize_operators/opt_conv_tensorcore.html    |    2 +-
 docs/how_to/optimize_operators/opt_gemm.html       |   16 +-
 .../optimize_operators/sg_execution_times.html     |    8 +-
 .../sg_execution_times.html                        |   14 +-
 .../tune_conv2d_layer_cuda.html                    | 1038 ++------------
 .../tune_with_autoscheduler/tune_network_cuda.html |    4 +-
 .../tune_with_autoscheduler/tune_network_x86.html  |    4 +-
 .../tune_with_autoscheduler/tune_sparse_x86.html   |  130 +-
 .../tune_with_autotvm/sg_execution_times.html      |    6 +-
 .../how_to/tune_with_autotvm/tune_conv2d_cuda.html |  335 +++--
 docs/how_to/work_with_microtvm/micro_autotune.html |   16 +-
 docs/how_to/work_with_microtvm/micro_pytorch.html  |    5 +-
 docs/how_to/work_with_microtvm/micro_train.html    |   16 +-
 .../work_with_microtvm/sg_execution_times.html     |   14 +-
 .../how_to/work_with_relay/sg_execution_times.html |    8 +-
 docs/how_to/work_with_schedules/intrin_math.html   |    2 +-
 .../work_with_schedules/sg_execution_times.html    |   14 +-
 docs/how_to/work_with_schedules/tensorize.html     |    2 +-
 docs/install/nnpack.html                           |   12 +-
 docs/reference/api/doxygen/bias__add_8h__incl.svg  |  188 +--
 .../api/doxygen/broadcast_8h__dep__incl.svg        |   92 +-
 docs/reference/api/doxygen/broadcast_8h__incl.svg  |   20 +-
 .../api/doxygen/constant__utils_8h__dep__incl.svg  |  168 +--
 .../doxygen/detail_2broadcast_8h__dep__incl.svg    |  104 +-
 .../api/doxygen/detail_2broadcast_8h__incl.svg     |    8 +-
 docs/reference/api/doxygen/einsum_8h__incl.svg     |   60 +-
 .../api/doxygen/elemwise_8h__dep__incl.svg         |   48 +-
 docs/reference/api/doxygen/flatten_8h__incl.svg    |   12 +-
 .../api/doxygen/namespacemembers_func_o.html       |   24 +-
 docs/reference/api/doxygen/namespacemembers_o.html |   24 +-
 .../reference/api/doxygen/namespacetvm_1_1tir.html |   40 +-
 docs/reference/api/doxygen/nn_2bnn_8h__incl.svg    |   12 +-
 .../reference/api/doxygen/nn_2pooling_8h__incl.svg |  276 ++--
 .../reference/api/doxygen/nn_2softmax_8h__incl.svg |  228 +--
 .../api/doxygen/reduction_8h__dep__incl.svg        |   40 +-
 docs/reference/api/doxygen/reduction_8h__incl.svg  |  232 +--
 docs/reference/api/doxygen/reorg_8h__incl.svg      |  288 ++--
 docs/reference/api/doxygen/search/all_10.js        |    2 +-
 docs/reference/api/doxygen/search/functions_f.js   |    2 +-
 docs/reference/api/doxygen/stmt_8h_source.html     |    4 +-
 .../api/doxygen/strided__slice_8h__dep__incl.svg   |   72 +-
 docs/reference/api/doxygen/tir_2analysis_8h.html   |    2 +-
 .../api/doxygen/tir_2analysis_8h__dep__incl.svg    |  176 +--
 .../api/doxygen/tir_2analysis_8h__incl.svg         | 1484 ++++++++++----------
 .../api/doxygen/tir_2analysis_8h_source.html       |    2 +-
 .../api/doxygen/tir_2op__attr__types_8h.html       |   11 +-
 .../doxygen/tir_2op__attr__types_8h__dep__incl.svg |  184 +--
 .../api/doxygen/tir_2op__attr__types_8h__incl.svg  | 1082 +++++++-------
 .../doxygen/tir_2op__attr__types_8h_source.html    |    7 +-
 .../api/doxygen/topi_2nn_8h__dep__incl.svg         |   12 +-
 docs/reference/api/doxygen/topi_2nn_8h__incl.svg   |  288 ++--
 .../api/doxygen/topi_2transform_8h__dep__incl.svg  |   64 +-
 .../api/doxygen/topi_2transform_8h__incl.svg       |  100 +-
 docs/reference/api/python/auto_scheduler.html      |    4 +-
 .../api/typedoc/classes/bytestreamreader.html      |   12 +-
 .../api/typedoc/classes/cachedcallstack.html       |   34 +-
 docs/reference/api/typedoc/classes/dldatatype.html |   12 +-
 docs/reference/api/typedoc/classes/dldevice.html   |   10 +-
 .../reference/api/typedoc/classes/environment.html |   12 +-
 docs/reference/api/typedoc/classes/ffilibrary.html |   20 +-
 .../api/typedoc/classes/graphexecutor.html         |   16 +-
 docs/reference/api/typedoc/classes/instance.html   |   40 +-
 docs/reference/api/typedoc/classes/memory.html     |   34 +-
 docs/reference/api/typedoc/classes/module.html     |   10 +-
 docs/reference/api/typedoc/classes/ndarray.html    |   22 +-
 .../api/typedoc/classes/packedfunccell.html        |    6 +-
 docs/reference/api/typedoc/classes/rpcserver.html  |   14 +-
 docs/reference/api/typedoc/classes/scalar.html     |    6 +-
 .../api/typedoc/classes/webgpucontext.html         |   12 +-
 docs/reference/api/typedoc/enums/argtypecode.html  |   30 +-
 .../api/typedoc/enums/aynccallbackcode.html        |    4 +-
 .../api/typedoc/enums/dldatatypecode.html          |    8 +-
 .../api/typedoc/enums/rpcserverstate.html          |   12 +-
 docs/reference/api/typedoc/enums/sizeof.html       |   18 +-
 docs/reference/api/typedoc/index.html              |  112 +-
 .../api/typedoc/interfaces/disposable.html         |    2 +-
 .../api/typedoc/interfaces/functioninfo.html       |    6 +-
 .../api/typedoc/interfaces/libraryprovider.html    |    4 +-
 docs/searchindex.js                                |    2 +-
 .../vta/tutorials/autotvm/sg_execution_times.html  |    6 +-
 .../tutorials/frontend/deploy_classification.html  |    2 +-
 .../vta/tutorials/frontend/deploy_detection.html   |    2 +-
 .../vta/tutorials/frontend/sg_execution_times.html |    6 +-
 .../vta/tutorials/optimize/sg_execution_times.html |    6 +-
 docs/topic/vta/tutorials/sg_execution_times.html   |    6 +-
 docs/tutorial/auto_scheduler_matmul_x86.html       |    4 +-
 docs/tutorial/autotvm_matmul_x86.html              |   20 +-
 docs/tutorial/autotvm_relay_x86.html               |  267 ++--
 docs/tutorial/cross_compilation_and_rpc.html       |    2 +-
 docs/tutorial/intro_topi.html                      |    2 +-
 docs/tutorial/sg_execution_times.html              |   30 +-
 docs/tutorial/tensor_expr_get_started.html         |   45 +-
 162 files changed, 4244 insertions(+), 5744 deletions(-)

diff --git a/docs/_images/sphx_glr_micro_train_001.png b/docs/_images/sphx_glr_micro_train_001.png
index 58230570fb..3f59a37dab 100644
Binary files a/docs/_images/sphx_glr_micro_train_001.png and b/docs/_images/sphx_glr_micro_train_001.png differ
diff --git a/docs/_images/sphx_glr_micro_train_thumb.png b/docs/_images/sphx_glr_micro_train_thumb.png
index c7f45c5bc4..6746d44e45 100644
Binary files a/docs/_images/sphx_glr_micro_train_thumb.png and b/docs/_images/sphx_glr_micro_train_thumb.png differ
diff --git a/docs/_sources/how_to/compile_models/from_darknet.rst.txt b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
index 67f25f0733..75cbab884f 100644
--- a/docs/_sources/how_to/compile_models/from_darknet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
@@ -315,7 +315,7 @@ The process is no different from other examples.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  11.575 seconds)
+   **Total running time of the script:** ( 1 minutes  11.995 seconds)
 
 
 .. _sphx_glr_download_how_to_compile_models_from_darknet.py:
diff --git a/docs/_sources/how_to/compile_models/from_keras.rst.txt b/docs/_sources/how_to/compile_models/from_keras.rst.txt
index d7e4cf10da..60b0949a69 100644
--- a/docs/_sources/how_to/compile_models/from_keras.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_keras.rst.txt
@@ -228,7 +228,7 @@ Look up prediction top 1 index in 1000 class synset.
  .. code-block:: none
 
     Relay top-1 id: 285, class name: Egyptian cat
-
    1/1 [==============================] - ETA: 0s
    1/1 [==============================] - 1s 938ms/step
+
    1/1 [==============================] - ETA: 0s
    1/1 [==============================] - 1s 940ms/step
     Keras top-1 id: 285, class name: Egyptian cat
 
 
diff --git a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
index 228562f5cf..2b5eda5530 100644
--- a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
@@ -115,7 +115,7 @@ In this section, we download a pretrained imagenet model and classify an image.
 
  .. code-block:: none
 
-    Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip9c46c9d6-2dfb-4d52-98f2-171233342729 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+    Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip9ca116cb-bf2a-48a3-912a-847c98f2dcd4 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
     x (1, 3, 224, 224)
 
 
diff --git a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
index c979770699..d60337f1f3 100644
--- a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
@@ -116,7 +116,7 @@ Load a pretrained OneFlow model and save model
  .. code-block:: none
 
     Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
-
      0%|          | 0.00/41.5M [00:00<?, ?B/s]
     15%|#5        | 6.33M/41.5M [00:00<00:00, 51.2MB/s]
     27%|##7       | 11.2M/41.5M [00:00<00:00, 49.7MB/s]
     39%|###8      | 16.0M/41.5M [00:00<00:00, 46.7MB/s]
     58%|#####7    | 24.0M/41.5M [00:00<00:00, 44.4MB/s]
     77%|#######7  | 32.0M/41.5M [00:00<00:00, 54.4MB/s]
     92%|#########2| 38.3M/41.5M [00:00<00:00, 57.5MB/s]
    100%|##########| 41.5M/41.5M [00:00<00:00, 53.9MB/s]
+
      0%|          | 0.00/41.5M [00:00<?, ?B/s]
     19%|#9        | 7.99M/41.5M [00:00<00:00, 43.7MB/s]
     35%|###4      | 14.3M/41.5M [00:00<00:00, 46.6MB/s]
     45%|####5     | 18.8M/41.5M [00:00<00:00, 45.8MB/s]
     58%|#####7    | 24.0M/41.5M [00:00<00:00, 37.1MB/s]
     77%|#######7  | 32.0M/41.5M [00:00<00:00, 43.6MB/s]
     92%|#########2| 38.3M/41.5M [00:01<00:00, 36.3MB/s]
    100%|##########| 41.5M/41.5M [00:01<00:00, 40.7MB/s]
 
 
 
diff --git a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
index bc687dd6a6..646870daa2 100644
--- a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
@@ -98,7 +98,7 @@ Load a pretrained PyTorch model
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=ResNet18_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet18_Weights.DEFAULT` to get the most up-to-date weights.
       warnings.warn(msg)
     Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
-
      0%|          | 0.00/44.7M [00:00<?, ?B/s]
     18%|#7        | 7.99M/44.7M [00:00<00:00, 50.2MB/s]
     33%|###2      | 14.6M/44.7M [00:00<00:00, 59.1MB/s]
     54%|#####3    | 24.0M/44.7M [00:00<00:00, 70.2MB/s]
     72%|#######2  | 32.3M/44.7M [00:00<00:00, 76.0MB/s]
     90%|########9 | 40.1M/44.7M [00:00<00:00, 77.8MB/s]
    100%|##########| 44.7M/44.7M [00:00<00:00, 77.9MB/s]
+
      0%|          | 0.00/44.7M [00:00<?, ?B/s]
     18%|#7        | 7.99M/44.7M [00:00<00:00, 46.8MB/s]
     52%|#####1    | 23.1M/44.7M [00:00<00:00, 96.2MB/s]
     76%|#######5  | 33.8M/44.7M [00:00<00:00, 92.3MB/s]
     99%|#########8| 44.2M/44.7M [00:00<00:00, 61.3MB/s]
    100%|##########| 44.7M/44.7M [00:00<00:00, 67.9MB/s]
 
 
 
diff --git a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
index 46cce7c6fc..a92119d66f 100644
--- a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
@@ -416,7 +416,7 @@ Run the corresponding model on tensorflow
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  10.321 seconds)
+   **Total running time of the script:** ( 1 minutes  15.234 seconds)
 
 
 .. _sphx_glr_download_how_to_compile_models_from_tensorflow.py:
diff --git a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
index 36a74731c4..f6bc7601f8 100644
--- a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
@@ -5,26 +5,26 @@
 
 Computation times
 =================
-**05:44.277** total execution time for **how_to_compile_models** files:
+**05:47.039** total execution time for **how_to_compile_models** files:
 
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``)       | 01:11.575 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:15.234 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:10.321 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``)       | 01:11.995 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``)         | 00:45.942 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``)         | 00:46.704 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``)       | 00:33.204 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``)       | 00:31.920 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``)           | 00:29.863 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``)           | 00:28.777 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``)         | 00:26.164 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``)         | 00:26.959 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``)         | 00:24.637 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``)         | 00:25.130 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``)       | 00:22.423 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``)       | 00:22.065 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``)           | 00:17.770 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``)           | 00:15.885 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``)             | 00:02.380 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``)             | 00:02.370 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
index 0c53ee1b5b..5fc85f8be2 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
@@ -433,7 +433,7 @@ Execute on TVM
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      15.8131      15.7943      15.9751      15.7570       0.0639   
+      15.7907      15.7028      16.4872      15.6577       0.2365   
                
 
 
diff --git a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
index dcbac545a4..dc60122ad6 100644
--- a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
@@ -127,7 +127,7 @@ Load pre-trained maskrcnn from torchvision and do tracing
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=MaskRCNN_ResNet50_FPN_Weights.COCO_V1`. You can also use `weights=MaskRCNN_ResNet50_FPN_Weights.DEFAULT` to get the most up-to-date weights.
       warnings.warn(msg)
     Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
-
      0%|          | 0.00/170M [00:00<?, ?B/s]
      8%|8         | 14.3M/170M [00:00<00:01, 149MB/s]
     17%|#6        | 28.5M/170M [00:00<00:01, 116MB/s]
     24%|##3       | 40.0M/170M [00:00<00:01, 106MB/s]
     30%|##9       | 50.4M/170M [00:00<00:01, 107MB/s]
     36%|###5      | 60.7M/170M [00:00<00:01, 104MB/s]
     42%|####1     | 70.7M/170M [00:00<00:01, 103MB/s]
     47%|####7     | 80.6M/170M [00:00<00:00, 94.1MB/s]
     53%|#####3    | 90.4M/170M [00:00<00:00, 96.3MB/s]
     60%|######    | 102M/170M [00:01<00:00, 105MB/s]  
     66%|######6   | 113M/170M [00:01<00:00, 97.5MB/s]
     72%|#######2  | 123M/170M [00:01<00:00, 95.5MB/s]
     80%|#######9  | 135M/170M [00:01<00:00, 106MB/s] 
     86%|########5 | 145M/170M [00:01<00:00, 80.1MB/s]
     95%|#########5| 162M/170M [00:01<00:00, 98.2MB/s]
    100%|##########| 170M/170M [00:01<00:00, 102MB/s] 
+
      0%|          | 0.00/170M [00:00<?, ?B/s]
      6%|6         | 10.6M/170M [00:00<00:02, 57.1MB/s]
     12%|#2        | 20.4M/170M [00:00<00:02, 76.4MB/s]
     19%|#8        | 32.0M/170M [00:00<00:01, 90.6MB/s]
     28%|##8       | 48.0M/170M [00:00<00:01, 111MB/s] 
     38%|###7      | 64.0M/170M [00:00<00:00, 124MB/s]
     46%|####5     | 77.7M/170M [00:00<00:00, 130MB/s]
     53%|#####3    | 90.3M/170M [00:00<00:00, 113MB/s]
     60%|#####9    | 102M/170M [00:00<00:00, 111MB/s] 
     66%|######6   | 113M/170M [00:01<00:00, 102MB/s]
     72%|#######2  | 123M/170M [00:01<00:00, 99.5MB/s]
     80%|#######9  | 136M/170M [00:01<00:00, 110MB/s] 
     86%|########6 | 147M/170M [00:01<00:00, 100MB/s]
     92%|#########2| 156M/170M [00:01<00:00, 97.9MB/s]
     98%|#########7| 166M/170M [00:01<00:00, 99.0MB/s]
    100%|##########| 170M/170M [00:01<00:00, 98.0MB/s]
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/nn/functional.py:3897: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
       for i in range(dim)
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/detection/anchor_utils.py:124: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -296,7 +296,7 @@ Get boxes with score larger than 0.9
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 3 minutes  10.512 seconds)
+   **Total running time of the script:** ( 3 minutes  11.352 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_object_detection_pytorch.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
index 01d0233a19..8640ea751c 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
@@ -236,7 +236,7 @@ training. Other models require a full post training calibration.
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=MobileNet_V2_Weights.IMAGENET1K_V1`. You can also use `weights=MobileNet_V2_Weights.DEFAULT` to get the most up-to-date weights.
       warnings.warn(msg)
     Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
-
      0%|          | 0.00/13.6M [00:00<?, ?B/s]
     59%|#####9    | 8.06M/13.6M [00:00<00:00, 84.5MB/s]
    100%|##########| 13.6M/13.6M [00:00<00:00, 115MB/s] 
+
      0%|          | 0.00/13.6M [00:00<?, ?B/s]
     59%|#####8    | 7.99M/13.6M [00:00<00:00, 45.3MB/s]
    100%|##########| 13.6M/13.6M [00:00<00:00, 59.0MB/s]
 
 
 
@@ -418,7 +418,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      90.2928      90.1269      96.4289      89.9255       0.6989   
+      90.3788      90.2874      91.6289      89.9409       0.3055   
                
 
 
@@ -467,7 +467,7 @@ TODO
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  5.283 seconds)
+   **Total running time of the script:** ( 1 minutes  5.409 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_prequantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
index b59f4250c8..c899f06c23 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
@@ -432,7 +432,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      119.6372     119.5717     124.3739     118.9210      0.5732   
+      120.5285     120.5633     122.2465     119.1566      0.5380   
                
 
 
@@ -469,7 +469,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 2 minutes  26.314 seconds)
+   **Total running time of the script:** ( 2 minutes  26.101 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_prequantized_tflite.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
index 0955dc582e..15eb189a3b 100644
--- a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
@@ -253,7 +253,7 @@ We create a Relay VM to build and execute the model.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  34.749 seconds)
+   **Total running time of the script:** ( 1 minutes  41.075 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_quantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
index ae71cdc6ff..72d4efc0b8 100644
--- a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
@@ -166,7 +166,7 @@ Convert and compile model for CPU.
             data: None
       input_sym_arg_type = in_param.infer_type()[0]
     Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
-
      0%|          | 0/132723 [00:00<?, ?KB/s]
      4%|3         | 5023/132723 [00:00<00:02, 50217.48KB/s]
      8%|7         | 10045/132723 [00:00<00:03, 35305.58KB/s]
     10%|#         | 13833/132723 [00:00<00:03, 31021.29KB/s]
     14%|#3        | 18364/132723 [00:00<00:03, 35480.39KB/s]
     19%|#9        | 25541/132723 [00:00<00:02, 46671.01KB/s]
     24%|##4       | 31974/132723 [00:00<00:01, 52070.03KB/s]
     28%|##8       | 37453/132723 [00:00<00:02, 43895.77KB/s]
     34%|###3      | 44756/132723 [00:00<00:01, 51613.55KB/s]
     38%|###7      | 50324/132723 [00:01<00:01, 42353.31KB/s]
     44%|####4     | 58441/132723 [00:01<00:01, 51651.95KB/s]
     50%|#####     | 66583/132723 [00:01<00:01, 59240.34KB/s]
     56%|#####6    | 74773/132723 [00:01<00:00, 65287.53KB/s]
     62%|######1   | 82046/132723 [00:01<00:00, 66400.59KB/s]
     68%|######7   | 90044/132723 [00:01<00:00, 70200.92KB/s]
     74%|#######4  | 98288/132723 [00:01<00:00, 69739.41KB/s]
     80%|########  |
  106294/132723 [00:01<00:00, 72612.93KB/s]
     86%|########6 | 114672/132723 [00:01<00:00, 75788.06KB/s]
     92%|#########2| 122380/132723 [00:02<00:00, 65867.14KB/s]
     98%|#########8| 130499/132723 [00:02<00:00, 69896.38KB/s]
    100%|##########| 132723/132723 [00:02<00:00, 58197.62KB/s]
+
      0%|          | 0/132723 [00:00<?, ?KB/s]
      4%|4         | 5343/132723 [00:00<00:02, 53423.68KB/s]
     10%|9         | 12897/132723 [00:00<00:01, 66425.70KB/s]
     16%|#5        | 20616/132723 [00:00<00:01, 71338.58KB/s]
     21%|##1       | 28452/132723 [00:00<00:01, 74099.80KB/s]
     27%|##7       | 36419/132723 [00:00<00:01, 76104.81KB/s]
     33%|###3      | 44387/132723 [00:00<00:01, 77319.13KB/s]
     39%|###9      | 52418/132723 [00:00<00:01, 78294.72KB/s]
     46%|####5     | 60422/132723 [00:00<00:00, 78847.80KB/s]
     51%|#####1    | 68307/132723 [00:00<00:01, 63664.85KB/s]
     57%|#####7    | 76293/132723 [00:01<00:00, 67953.60KB/s]
     63%|######3   | 84258/132723 [00:01<00:00, 71163.86KB/s]
     69%|######9   | 92226/132723 [00:01<00:00, 73561.85KB/s]
     75%|#######5  | 100145/132723 [00:01<00:00, 75175.09KB/s]
     82%|########1 | 108219/132723 [00:01<00:00, 76791.56KB/s]
     87%|########7 | 116023/132723 [00:01<00:00, 68569.23KB/s]
     93%|########
 #3| 124014/132723 [00:01<00:00, 71646.66KB/s]
    100%|#########9| 132161/132723 [00:01<00:00, 74389.55KB/s]
    100%|##########| 132723/132723 [00:01<00:00, 72610.60KB/s]
 
 
 
@@ -242,7 +242,7 @@ Display result
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 3 minutes  2.827 seconds)
+   **Total running time of the script:** ( 2 minutes  59.243 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_ssd_gluoncv.py:
diff --git a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
index d16a964c12..5c9298e44e 100644
--- a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
@@ -5,24 +5,24 @@
 
 Computation times
 =================
-**12:45.094** total execution time for **how_to_deploy_models** files:
+**12:47.834** total execution time for **how_to_deploy_models** files:
 
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 03:10.512 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 03:11.352 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)                           | 03:02.827 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)                           | 02:59.243 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)           | 02:26.314 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)           | 02:26.101 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``)                               | 01:34.749 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``)                               | 01:41.075 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``)                         | 01:05.283 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``)                         | 01:05.409 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``)                 | 00:35.423 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``)                 | 00:35.326 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``)                       | 00:25.200 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``)                       | 00:24.857 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)                       | 00:24.779 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)                       | 00:24.464 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``)                                     | 00:00.006 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``)                                     | 00:00.007 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
index 8ea84ee187..66f81d0ac3 100644
--- a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
@@ -472,7 +472,7 @@ First let us define two helper functions to get the mobilenet model and a cat im
 
  .. code-block:: none
 
-    Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip6669ce90-4254-4ca1-8a7b-de202ad7b39f from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+    Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip8412d4bf-c047-439a-a9c1-17e3480e18d4 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
 
 
 
diff --git a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
index b7c1c849c6..3187779b1a 100644
--- a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
 
 Computation times
 =================
-**00:47.083** total execution time for **how_to_extend_tvm** files:
+**00:47.456** total execution time for **how_to_extend_tvm** files:
 
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:43.638 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:44.053 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``)           | 00:02.411 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``)           | 00:02.385 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``)                     | 00:01.025 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``)                     | 00:01.010 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``)       | 00:00.009 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``)       | 00:00.008 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
index 4d55514fed..a15a33ac37 100644
--- a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
@@ -216,10 +216,10 @@ profile the execution time of each passes.
  .. code-block:: none
 
     Printing results of timing profile...
-    InferType: 7157us [7157us] (46.40%; 46.40%)
-    FoldScaleAxis: 8268us [6us] (53.60%; 53.60%)
-            FoldConstant: 8262us [1673us] (53.56%; 99.93%)
-                    InferType: 6589us [6589us] (42.72%; 79.75%)
+    InferType: 7107us [7107us] (46.22%; 46.22%)
+    FoldScaleAxis: 8271us [7us] (53.78%; 53.78%)
+            FoldConstant: 8264us [1645us] (53.74%; 99.92%)
+                    InferType: 6620us [6620us] (43.05%; 80.10%)
 
 
 
@@ -258,10 +258,10 @@ Refer to following sections and :py:func:`tvm.instrument.pass_instrument` for th
  .. code-block:: none
 
     Printing results of timing profile...
-    InferType: 6620us [6620us] (44.98%; 44.98%)
-    FoldScaleAxis: 8100us [5us] (55.02%; 55.02%)
-            FoldConstant: 8095us [1657us] (54.99%; 99.94%)
-                    InferType: 6438us [6438us] (43.73%; 79.53%)
+    InferType: 6581us [6581us] (45.29%; 45.29%)
+    FoldScaleAxis: 7949us [5us] (54.71%; 54.71%)
+            FoldConstant: 7944us [1616us] (54.67%; 99.94%)
+                    InferType: 6328us [6328us] (43.55%; 79.65%)
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
index eefe03d6b3..0bf84389ef 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
@@ -340,7 +340,7 @@ latency of convolution.
 
  .. code-block:: none
 
-    Convolution: 53.378337 ms
+    Convolution: 50.913246 ms
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
index 50e4f689bf..dbb2e0397e 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
@@ -657,7 +657,7 @@ be able to run on our build server
 
  .. code-block:: none
 
-    conv2d with tensor core: 12.173318 ms
+    conv2d with tensor core: 12.974272 ms
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
index d2caba5f21..dbee1aef6d 100644
--- a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
@@ -143,8 +143,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
 
  .. code-block:: none
 
-    Numpy running time: 0.018595
-    Baseline: 3.199267
+    Numpy running time: 0.018348
+    Baseline: 3.426208
 
 
 
@@ -238,7 +238,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
 
  .. code-block:: none
 
-    Opt1: 0.297510
+    Opt1: 0.290674
 
 
 
@@ -340,7 +340,7 @@ In this tutorial, we chose to vectorize the inner loop row data since it is cach
 
  .. code-block:: none
 
-    Opt2: 0.332023
+    Opt2: 0.333141
 
 
 
@@ -435,7 +435,7 @@ the access pattern for A matrix is more cache friendly.
 
  .. code-block:: none
 
-    Opt3: 0.115810
+    Opt3: 0.117250
 
 
 
@@ -559,7 +559,7 @@ flattening.
 
  .. code-block:: none
 
-    Opt4: 0.109238
+    Opt4: 0.109634
 
 
 
@@ -680,7 +680,7 @@ write to C when all the block results are ready.
 
  .. code-block:: none
 
-    Opt5: 0.111007
+    Opt5: 0.111331
 
 
 
@@ -804,7 +804,7 @@ Furthermore, we can also utilize multi-core processors to do the thread-level pa
 
  .. code-block:: none
 
-    Opt6: 0.146501
+    Opt6: 0.146571
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
index d0c6dadf78..bee4fe02aa 100644
--- a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
 
 Computation times
 =================
-**00:34.117** total execution time for **how_to_optimize_operators** files:
+**00:34.896** total execution time for **how_to_optimize_operators** files:
 
 +-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``)                       | 00:31.586 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``)                       | 00:32.200 | 0.0 MB |
 +-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.464 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.529 | 0.0 MB |
 +-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``)             | 00:01.067 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``)             | 00:01.167 | 0.0 MB |
 +-----------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
index 83c6f39438..3aaf0e3f96 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
@@ -5,18 +5,18 @@
 
 Computation times
 =================
-**09:04.571** total execution time for **how_to_tune_with_autoscheduler** files:
+**08:56.121** total execution time for **how_to_tune_with_autoscheduler** files:
 
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 05:36.615 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 05:33.874 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``)             | 01:32.239 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``)             | 01:31.040 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``)           | 01:03.601 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``)           | 01:00.605 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``)               | 00:28.731 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``)               | 00:27.200 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``)             | 00:12.062 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``)             | 00:12.036 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``)           | 00:11.323 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``)           | 00:11.366 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
index ab6a110449..97529007bb 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
@@ -239,483 +239,86 @@ cooperative fetching, unrolling and operator fusion.
                  bias: Buffer(bias_2: Pointer(float32), float32, [1, 512, 1, 1], []),
                  compute: Buffer(compute_2: Pointer(float32), float32, [1, 512, 7, 7], [])}
       buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute} {
-      attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 28;
-      allocate(conv2d_nchw: Pointer(local float32), float32, [14]), storage_scope = local;
-      allocate(pad_temp.shared: Pointer(shared float32), float32, [72]), storage_scope = shared;
-      allocate(kernel.shared: Pointer(shared float32), float32, [3072]), storage_scope = shared;
-      attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
-        conv2d_nchw_1: Buffer(conv2d_nchw, float32, [14], [], scope="local", align=32)[0] = 0f32
-        conv2d_nchw_1[1] = 0f32
+      attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 16;
+      allocate(conv2d_nchw: Pointer(local float32), float32, [4]), storage_scope = local;
+      allocate(pad_temp.shared: Pointer(shared float32), float32, [1008]), storage_scope = shared;
+      allocate(kernel.shared: Pointer(shared float32), float32, [1536]), storage_scope = shared;
+      attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 392 {
+        conv2d_nchw_1: Buffer(conv2d_nchw, float32, [4], [], scope="local", align=8)[0] = 0f32
         conv2d_nchw_1[2] = 0f32
+        conv2d_nchw_1[1] = 0f32
         conv2d_nchw_1[3] = 0f32
-        conv2d_nchw_1[4] = 0f32
-        conv2d_nchw_1[5] = 0f32
-        conv2d_nchw_1[6] = 0f32
-        conv2d_nchw_1[7] = 0f32
-        conv2d_nchw_1[8] = 0f32
-        conv2d_nchw_1[9] = 0f32
-        conv2d_nchw_1[10] = 0f32
-        conv2d_nchw_1[11] = 0f32
-        conv2d_nchw_1[12] = 0f32
-        conv2d_nchw_1[13] = 0f32
-        for (rc.outer.outer: int32, 0, 64) {
+        for (rc.outer.outer: int32, 0, 32) {
           for (ry.outer.outer: int32, 0, 3) {
-            let cse_var_2: int32 = (rc.outer.outer*72)
+            let cse_var_4: int32 = (rc.outer.outer*784)
+            let cse_var_3: int32 = (rc.outer.outer*144)
+            let cse_var_2: int32 = (ry.outer.outer*7)
             let cse_var_1: int32 = (ry.outer.outer*3)
              {
-              attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
-                if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
-                  pad_temp.shared_1: Buffer(pad_temp.shared, float32, [72], [], scope="shared")[(threadIdx.x_1*4)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1*4), 9))) && (floormod((threadIdx.x_1*4), 9) < 8)), data_3: Buffer(data_2, float32, [25088], [])[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1*4), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + fl [...]
-                }
-                if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
-                  pad_temp.shared_1[((threadIdx.x_1*4) + 1)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 1), 9))) && (floormod(((threadIdx.x_1*4) + 1), 9) < 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 1), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 1), 9)) - 8)], 0f32, dtype=float32)
+              attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 392 {
+                pad_temp.shared_1: Buffer(pad_temp.shared, float32, [1008], [], scope="shared")[(threadIdx.x_1*2)] = @tir.if_then_else(((((1 <= (floordiv(floormod((threadIdx.x_1*2), 63), 9) + ry.outer.outer)) && ((floordiv(floormod((threadIdx.x_1*2), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod((threadIdx.x_1*2), 9))) && (floormod((threadIdx.x_1*2), 9) < 8)), data_3: Buffer(data_2, float32, [25088], [])[((((cse_var_4 + (floordiv((threadIdx.x_1*2), 9)*7)) + cse_var_2) + floormod((thr [...]
+                pad_temp.shared_1[((threadIdx.x_1*2) + 1)] = @tir.if_then_else(((((1 <= (floordiv(floormod(((threadIdx.x_1*2) + 1), 63), 9) + ry.outer.outer)) && ((floordiv(floormod(((threadIdx.x_1*2) + 1), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod(((threadIdx.x_1*2) + 1), 9))) && (floormod(((threadIdx.x_1*2) + 1), 9) < 8)), data_3[((((cse_var_4 + (floordiv(((threadIdx.x_1*2) + 1), 9)*7)) + cse_var_2) + floormod(((threadIdx.x_1*2) + 1), 9)) - 8)], 0f32, dtype=float32)
+              }
+              attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 392 {
+                if @tir.likely((threadIdx.x_1 < 112), dtype=bool) {
+                  pad_temp.shared_1[((threadIdx.x_1*2) + 784)] = @tir.if_then_else(((((1 <= (floordiv(floormod(((threadIdx.x_1*2) + 28), 63), 9) + ry.outer.outer)) && ((floordiv(floormod(((threadIdx.x_1*2) + 28), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod(((threadIdx.x_1*2) + 1), 9))) && (floormod(((threadIdx.x_1*2) + 1), 9) < 8)), data_3[((((cse_var_4 + (floordiv(((threadIdx.x_1*2) + 784), 9)*7)) + cse_var_2) + floormod(((threadIdx.x_1*2) + 1), 9)) - 8)], 0f32, dtype=float32)
                 }
-                if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
-                  pad_temp.shared_1[((threadIdx.x_1*4) + 2)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 2), 9))) && (floormod(((threadIdx.x_1*4) + 2), 9) < 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 2), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 2), 9)) - 8)], 0f32, dtype=float32)
+                if @tir.likely((threadIdx.x_1 < 112), dtype=bool) {
+                  pad_temp.shared_1[((threadIdx.x_1*2) + 785)] = @tir.if_then_else(((((1 <= (floordiv(floormod(((threadIdx.x_1*2) + 29), 63), 9) + ry.outer.outer)) && ((floordiv(floormod(((threadIdx.x_1*2) + 29), 63), 9) + ry.outer.outer) < 8)) && (1 <= floormod(((threadIdx.x_1*2) + 2), 9))) && (floormod(((threadIdx.x_1*2) + 2), 9) < 8)), data_3[((((cse_var_4 + (floordiv(((threadIdx.x_1*2) + 785), 9)*7)) + cse_var_2) + floormod(((threadIdx.x_1*2) + 2), 9)) - 8)], 0f32, dtype=float32)
                 }
-                if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
-                  pad_temp.shared_1[((threadIdx.x_1*4) + 3)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 3), 9))) && (floormod(((threadIdx.x_1*4) + 3), 9) < 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 3), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 3), 9)) - 8)], 0f32, dtype=float32)
+              }
+              attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 392;
+              kernel.shared_1: Buffer(kernel.shared, float32, [1536], [], scope="shared")[threadIdx.x_2] = kernel_3: Buffer(kernel_2, float32, [2359296], [])[((((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 48)*4608)) + cse_var_3) + (floordiv(floormod(threadIdx.x_2, 48), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 392;
+              kernel.shared_1[(threadIdx.x_2 + 392)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 392), 48)*4608)) + cse_var_3) + (floordiv(floormod((threadIdx.x_2 + 8), 48), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 392;
+              kernel.shared_1[(threadIdx.x_2 + 784)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 784), 48)*4608)) + cse_var_3) + (floordiv(floormod((threadIdx.x_2 + 16), 48), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 392;
+              if @tir.likely((threadIdx.x_2 < 360), dtype=bool) {
+                kernel.shared_1[(threadIdx.x_2 + 1176)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1176), 48)*4608)) + cse_var_3) + (floormod((floordiv(threadIdx.x_2, 3) + 8), 16)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3))]
+              }
+              for (rc.outer.inner: int32, 0, 2) {
+                for (rx.outer.inner: int32, 0, 3) {
+                  conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7))]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner)]))
+                  conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7))]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 768)]))
+                  conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7))]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 48)]))
+                  conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7))]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 816)]))
+                  conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 3)]))
+                  conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 771)]))
+                  conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 51)]))
+                  conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 819)]))
+                  conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 6)]))
+                  conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 774)]))
+                  conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 54)]))
+                  conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 822)]))
+                  conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 189)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 9)]))
+                  conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 189)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 777)]))
+                  conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 189)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 57)]))
+                  conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 189)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 825)]))
+                  conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 252)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 12)]))
+                  conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 252)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 780)]))
+                  conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 252)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 60)]))
+                  conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 252)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 828)]))
+                  conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 315)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 15)]))
+                  conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 315)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 783)]))
+                  conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 315)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 63)]))
+                  conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 315)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 831)]))
+                  conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 378)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 18)]))
+                  conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 378)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 786)]))
+                  conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 378)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 66)]))
+                  conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 378)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 834)]))
+                  conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 441)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 21)]))
+                  conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 441)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 789)]))
+                  conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 441)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 69)]))
+                  conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 441)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 837)]))
                 }
               }
-              attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1: Buffer(kernel.shared, float32, [3072], [], scope="shared")[threadIdx.x_2] = kernel_3: Buffer(kernel_2, float32, [2359296], [])[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 64)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 64), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 128)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 128), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 192)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 36864)]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 256)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 256), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 320)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 320), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 384)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 73728)]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 448)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 448), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 512)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 512), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 576)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 110592)]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 640)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 640), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 704)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 704), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 768)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 147456)]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 832)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 832), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 896)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 896), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 960)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 184320)]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 1024)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1024), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 1088)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1088), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 1152)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 221184)]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 1216)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1216), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 1280)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1280), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 258048)]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 1408)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1408), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 1472)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1472), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 1536)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 294912)]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 1600)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1600), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 1664)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1664), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 1728)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 331776)]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1792), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 1856)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1856), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 1920)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 368640)]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 1984)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1984), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 2048)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2048), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 2112)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 405504)]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 2176)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2176), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2240), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 2304)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 442368)]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 2368)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2368), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 2432)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2432), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 2496)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 479232)]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 2560)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2560), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 2624)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2624), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 516096)]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 2752)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2752), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 2816)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2816), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 2880)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 552960)]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 2944)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2944), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-              attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
-              kernel.shared_1[(threadIdx.x_2 + 3008)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 3008), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[0]*kernel.shared_1[(threadIdx.x*48)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 3)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[1]*kernel.shared_1[(threadIdx.x*48)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 3)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[2]*kernel.shared_1[(threadIdx.x*48)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 3)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[3]*kernel.shared_1[(threadIdx.x*48)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 3)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[4]*kernel.shared_1[(threadIdx.x*48)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 3)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[5]*kernel.shared_1[(threadIdx.x*48)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 3)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[6]*kernel.shared_1[(threadIdx.x*48)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 3)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[0]*kernel.shared_1[((threadIdx.x*48) + 24)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 27)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 24)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 27)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 24)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 27)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 24)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 27)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 24)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 27)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 24)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 27)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 24)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 27)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 1)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 4)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 1)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 4)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 1)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 4)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 1)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 4)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 1)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 4)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 1)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 4)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 1)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 4)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 25)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 28)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 25)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 28)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 25)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 28)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 25)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 28)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 25)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 28)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 25)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 28)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 25)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 28)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 2)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 5)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 2)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 5)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 2)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 5)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 2)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 5)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 2)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 5)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 2)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 5)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 2)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 5)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 26)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 29)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 26)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 29)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 26)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 29)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 26)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 29)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 26)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 29)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 26)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 29)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 26)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 29)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 6)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 9)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 6)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 9)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 6)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 9)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 6)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 9)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 6)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 9)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 6)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 9)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 6)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 9)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 30)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 33)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 30)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 33)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 30)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 33)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 30)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 33)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 30)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 33)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 30)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 33)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 30)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 33)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 7)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 10)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 7)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 10)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 7)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 10)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 7)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 10)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 7)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 10)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 7)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 10)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 7)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 10)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 31)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 34)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 31)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 34)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 31)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 34)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 31)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 34)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 31)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 34)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 31)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 34)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 31)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 34)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 8)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 11)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 8)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 11)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 8)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 11)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 8)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 11)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 8)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 11)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 8)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 11)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 8)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 11)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 32)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 35)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 32)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 35)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 32)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 35)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 32)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 35)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 32)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 35)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 32)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 35)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 32)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 35)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 12)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 15)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 12)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 15)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 12)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 15)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 12)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 15)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 12)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 15)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 12)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 15)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 12)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 15)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 36)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 39)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 36)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 39)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 36)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 39)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 36)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 39)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 36)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 39)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 36)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 39)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 36)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 39)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 13)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 16)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 13)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 16)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 13)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 16)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 13)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 16)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 13)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 16)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 13)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 16)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 13)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 16)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 37)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 40)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 37)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 40)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 37)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 40)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 37)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 40)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 37)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 40)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 37)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 40)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 37)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 40)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 14)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 17)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 14)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 17)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 14)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 17)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 14)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 17)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 14)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 17)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 14)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 17)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 14)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 17)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 38)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 41)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 38)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 41)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 38)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 41)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 38)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 41)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 38)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 41)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 38)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 41)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 38)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 41)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 18)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 21)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 18)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 21)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 18)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 21)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 18)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 21)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 18)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 21)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 18)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 21)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 18)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 21)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 42)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 45)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 42)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 45)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 42)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 45)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 42)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 45)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 42)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 45)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 42)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 45)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 42)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 45)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 19)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 22)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 19)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 22)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 19)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 22)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 19)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 22)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 19)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 22)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 19)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 22)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 19)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 22)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 43)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 46)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 43)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 46)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 43)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 46)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 43)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 46)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 43)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 46)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 43)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 46)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 43)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 46)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 20)]))
-              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 23)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 20)]))
-              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 23)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 20)]))
-              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 23)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 20)]))
-              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 23)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 20)]))
-              conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 23)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 20)]))
-              conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 23)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 20)]))
-              conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 23)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 44)]))
-              conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 47)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 44)]))
-              conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 47)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 44)]))
-              conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 47)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 44)]))
-              conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 47)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 44)]))
-              conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 47)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 44)]))
-              conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 47)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 44)]))
-              conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 47)]))
             }
           }
         }
         for (i1.inner: int32, 0, 2) {
-          for (i3.inner: int32, 0, 7) {
-            compute_3: Buffer(compute_2, float32, [25088], [])[(((((floordiv(blockIdx.x, 7)*6272) + (threadIdx.x*98)) + (i1.inner*49)) + (floormod(blockIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[((i1.inner*7) + i3.inner)] + bias_3: Buffer(bias_2, float32, [512], [])[(((floordiv(blockIdx.x, 7)*128) + (threadIdx.x*2)) + i1.inner)]), 0f32)
-          }
+          compute_3: Buffer(compute_2, float32, [25088], [])[((((blockIdx.x*1568) + (floordiv(threadIdx.x, 49)*98)) + (i1.inner*49)) + floormod(threadIdx.x, 49))] = max((conv2d_nchw_1[i1.inner] + bias_3: Buffer(bias_2, float32, [512], [])[(((blockIdx.x*32) + (floordiv(threadIdx.x, 49)*2)) + i1.inner)]), 0f32)
+          compute_3[(((((blockIdx.x*1568) + (floordiv(threadIdx.x, 49)*98)) + (i1.inner*49)) + floormod(threadIdx.x, 49)) + 784)] = max((conv2d_nchw_1[(i1.inner + 2)] + bias_3[((((blockIdx.x*32) + (floordiv(threadIdx.x, 49)*2)) + i1.inner) + 16)]), 0f32)
         }
       }
     }
@@ -770,7 +373,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 0.362 ms
+    Execution time of this operator: 0.315 ms
 
 
 
@@ -818,20 +421,20 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     conv2d_nchw_nn_o_o_i, conv2d_nchw_nn_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_i, factor=1)
     conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
     conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
-    conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
-    conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
-    conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=64)
-    conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
+    conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=2)
+    conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
+    conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=8)
+    conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=2)
     conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
     conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
-    conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
+    conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
     conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
     conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
-    conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
-    conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
+    conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
+    conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
     conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
-    conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
-    conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=4)
+    conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=8)
+    conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=2)
     conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
     conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
     conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
@@ -841,13 +444,13 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
     compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
     compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
-    compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
-    compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
+    compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=8)
+    compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=2)
     compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
-    compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
+    compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
     compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-    compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
-    compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
+    compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
+    compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
     compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
     s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
     s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -867,14 +470,14 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
     kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
     s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=392)
     s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
     pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
-    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
+    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=2)
     s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=392)
     s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
-    s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 512)
+    s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 64)
     s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
 
     CUDA source code:
@@ -892,430 +495,73 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
       #define int64_t long long
       #define uint64_t unsigned long long
     #endif
-    extern "C" __global__ void __launch_bounds__(64) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
-      float conv2d_nchw[14];
-      __shared__ float pad_temp_shared[72];
-      __shared__ float kernel_shared[3072];
+    extern "C" __global__ void __launch_bounds__(392) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+      float conv2d_nchw[4];
+      __shared__ float pad_temp_shared[1008];
+      __shared__ float kernel_shared[1536];
       conv2d_nchw[0] = 0.000000e+00f;
-      conv2d_nchw[1] = 0.000000e+00f;
       conv2d_nchw[2] = 0.000000e+00f;
+      conv2d_nchw[1] = 0.000000e+00f;
       conv2d_nchw[3] = 0.000000e+00f;
-      conv2d_nchw[4] = 0.000000e+00f;
-      conv2d_nchw[5] = 0.000000e+00f;
-      conv2d_nchw[6] = 0.000000e+00f;
-      conv2d_nchw[7] = 0.000000e+00f;
-      conv2d_nchw[8] = 0.000000e+00f;
-      conv2d_nchw[9] = 0.000000e+00f;
-      conv2d_nchw[10] = 0.000000e+00f;
-      conv2d_nchw[11] = 0.000000e+00f;
-      conv2d_nchw[12] = 0.000000e+00f;
-      conv2d_nchw[13] = 0.000000e+00f;
-      for (int rc_outer_outer = 0; rc_outer_outer < 64; ++rc_outer_outer) {
+      for (int rc_outer_outer = 0; rc_outer_outer < 32; ++rc_outer_outer) {
         for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
           __syncthreads();
-          if (((int)threadIdx.x) < 18) {
-            pad_temp_shared[(((int)threadIdx.x) * 4)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) * 4) % 9))) && (((((int)threadIdx.x) * 4) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) * 4) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) * 4) % 9)) - 8)] : 0.000000e+00f);
-          }
-          if (((int)threadIdx.x) < 18) {
-            pad_temp_shared[((((int)threadIdx.x) * 4) + 1)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 1) % 9))) && ((((((int)threadIdx.x) * 4) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 1) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 1) % 9)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[(((int)threadIdx.x) * 2)] = (((((1 <= ((((((int)threadIdx.x) * 2) % 63) / 9) + ry_outer_outer)) && (((((((int)threadIdx.x) * 2) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= ((((int)threadIdx.x) * 2) % 9))) && (((((int)threadIdx.x) * 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) * 2) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) * 2) % 9)) - 8)] : 0.000000e+00f);
+          pad_temp_shared[((((int)threadIdx.x) * 2) + 1)] = (((((1 <= (((((((int)threadIdx.x) * 2) + 1) % 63) / 9) + ry_outer_outer)) && ((((((((int)threadIdx.x) * 2) + 1) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= (((((int)threadIdx.x) * 2) + 1) % 9))) && ((((((int)threadIdx.x) * 2) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 7)) + (ry_outer_outer * 7)) + (((((int)threadIdx.x) * 2) + 1) % 9)) - 8)] : 0.000000e+00f);
+          if (((int)threadIdx.x) < 112) {
+            pad_temp_shared[((((int)threadIdx.x) * 2) + 784)] = (((((1 <= (((((((int)threadIdx.x) * 2) + 28) % 63) / 9) + ry_outer_outer)) && ((((((((int)threadIdx.x) * 2) + 28) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= (((((int)threadIdx.x) * 2) + 1) % 9))) && ((((((int)threadIdx.x) * 2) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 2) + 784) / 9) * 7)) + (ry_outer_outer * 7)) + (((((int)threadIdx.x) * 2) + 1) % 9)) - 8)] : 0.000000e+00f);
           }
-          if (((int)threadIdx.x) < 18) {
-            pad_temp_shared[((((int)threadIdx.x) * 4) + 2)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 2) % 9))) && ((((((int)threadIdx.x) * 4) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 2) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 2) % 9)) - 8)] : 0.000000e+00f);
+          if (((int)threadIdx.x) < 112) {
+            pad_temp_shared[((((int)threadIdx.x) * 2) + 785)] = (((((1 <= (((((((int)threadIdx.x) * 2) + 29) % 63) / 9) + ry_outer_outer)) && ((((((((int)threadIdx.x) * 2) + 29) % 63) / 9) + ry_outer_outer) < 8)) && (1 <= (((((int)threadIdx.x) * 2) + 2) % 9))) && ((((((int)threadIdx.x) * 2) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 2) + 785) / 9) * 7)) + (ry_outer_outer * 7)) + (((((int)threadIdx.x) * 2) + 2) % 9)) - 8)] : 0.000000e+00f);
           }
-          if (((int)threadIdx.x) < 18) {
-            pad_temp_shared[((((int)threadIdx.x) * 4) + 3)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 3) % 9))) && ((((((int)threadIdx.x) * 4) + 3) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 3) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 3) % 9)) - 8)] : 0.000000e+00f);
+          kernel_shared[((int)threadIdx.x)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) % 48) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 392)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 392) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 8) % 48) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 784)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 784) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 16) % 48) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          if (((int)threadIdx.x) < 360) {
+            kernel_shared[(((int)threadIdx.x) + 1176)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1176) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 8) & 15) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
           }
-          kernel_shared[((int)threadIdx.x)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 64)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 64) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 128)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 128) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 192)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 36864)];
-          kernel_shared[(((int)threadIdx.x) + 256)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 256) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 320)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 320) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 384)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 73728)];
-          kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 512)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 512) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 576)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 110592)];
-          kernel_shared[(((int)threadIdx.x) + 640)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 640) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 704)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 704) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 768)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 147456)];
-          kernel_shared[(((int)threadIdx.x) + 832)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 832) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 960)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 184320)];
-          kernel_shared[(((int)threadIdx.x) + 1024)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1024) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1088)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1088) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1152)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 221184)];
-          kernel_shared[(((int)threadIdx.x) + 1216)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1216) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1280)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1280) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 258048)];
-          kernel_shared[(((int)threadIdx.x) + 1408)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1408) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1472)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1472) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1536)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 294912)];
-          kernel_shared[(((int)threadIdx.x) + 1600)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1600) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1664)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1664) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1728)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 331776)];
-          kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1856)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1856) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1920)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 368640)];
-          kernel_shared[(((int)threadIdx.x) + 1984)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1984) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2048)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2048) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2112)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 405504)];
-          kernel_shared[(((int)threadIdx.x) + 2176)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2176) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2304)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 442368)];
-          kernel_shared[(((int)threadIdx.x) + 2368)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2368) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2432)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2432) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2496)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 479232)];
-          kernel_shared[(((int)threadIdx.x) + 2560)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2560) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2624)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2624) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 516096)];
-          kernel_shared[(((int)threadIdx.x) + 2752)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2752) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2816)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2816) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2880)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 552960)];
-          kernel_shared[(((int)threadIdx.x) + 2944)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2944) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 3008)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3008) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
           __syncthreads();
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[0] * kernel_shared[(((int)threadIdx.x) * 48)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[1] * kernel_shared[(((int)threadIdx.x) * 48)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[2] * kernel_shared[(((int)threadIdx.x) * 48)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[3] * kernel_shared[(((int)threadIdx.x) * 48)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[4] * kernel_shared[(((int)threadIdx.x) * 48)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[5] * kernel_shared[(((int)threadIdx.x) * 48)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[6] * kernel_shared[(((int)threadIdx.x) * 48)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[0] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+          for (int rc_outer_inner = 0; rc_outer_inner < 2; ++rc_outer_inner) {
+            for (int rx_outer_inner = 0; rx_outer_inner < 3; ++rx_outer_inner) {
+              conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7))] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner)]));
+              conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7))] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 768)]));
+              conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7))] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 48)]));
+              conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7))] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 816)]));
+              conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 3)]));
+              conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 771)]));
+              conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 51)]));
+              conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 819)]));
+              conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 6)]));
+              conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 774)]));
+              conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 54)]));
+              conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 822)]));
+              conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 189)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 9)]));
+              conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 189)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 777)]));
+              conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 189)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 57)]));
+              conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 189)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 825)]));
+              conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 252)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 12)]));
+              conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 252)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 780)]));
+              conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 252)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 60)]));
+              conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 252)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 828)]));
+              conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 315)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 15)]));
+              conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 315)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 783)]));
+              conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 315)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 63)]));
+              conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 315)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 831)]));
+              conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 378)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 18)]));
+              conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 378)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 786)]));
+              conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 378)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 66)]));
+              conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 378)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 834)]));
+              conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 441)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 21)]));
+              conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 441)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 789)]));
+              conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 441)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 69)]));
+              conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 441)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 837)]));
+            }
+          }
         }
       }
       for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
-        for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
-          compute[((((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 98)) + (i1_inner * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) * 2)) + i1_inner)]), 0.000000e+00f);
-        }
+        compute[((((((int)blockIdx.x) * 1568) + ((((int)threadIdx.x) / 49) * 98)) + (i1_inner * 49)) + (((int)threadIdx.x) % 49))] = max((conv2d_nchw[i1_inner] + bias[(((((int)blockIdx.x) * 32) + ((((int)threadIdx.x) / 49) * 2)) + i1_inner)]), 0.000000e+00f);
+        compute[(((((((int)blockIdx.x) * 1568) + ((((int)threadIdx.x) / 49) * 98)) + (i1_inner * 49)) + (((int)threadIdx.x) % 49)) + 784)] = max((conv2d_nchw[(i1_inner + 2)] + bias[((((((int)blockIdx.x) * 32) + ((((int)threadIdx.x) / 49) * 2)) + i1_inner) + 16)]), 0.000000e+00f);
       }
     }
 
@@ -1377,7 +623,7 @@ In the example below we resume the status and do more 5 trials.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 5 minutes  36.615 seconds)
+   **Total running time of the script:** ( 5 minutes  33.874 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
index 177bc570ea..e900ce2cd9 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
@@ -643,7 +643,7 @@ so we can read the log file and load the best schedules.
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-       7.8919       7.8903       7.8979       7.8877       0.0043   
+       7.8687       7.8667       7.8751       7.8643       0.0046   
                
 
 
@@ -671,7 +671,7 @@ Other Tips
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  3.601 seconds)
+   **Total running time of the script:** ( 1 minutes  0.605 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
index 87a024c5eb..29b8445b98 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
@@ -662,7 +662,7 @@ so we can read the log file and load the best schedules.
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      754.9071     754.6821     755.7237     754.3153      0.5966   
+      755.3878     755.7386     755.9539     754.4709      0.6543   
                
 
 
@@ -690,7 +690,7 @@ Other Tips
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  32.239 seconds)
+   **Total running time of the script:** ( 1 minutes  31.040 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_x86.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
index fcf853eaaa..e192da6536 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
@@ -386,77 +386,75 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
                  placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [128, 512], []),
                  compute: Buffer(compute_2: Pointer(float32), float32, [128, 512], [])}
       buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute} {
-      for (i0.outer.i1.outer.fused: int32, 0, 16) "parallel" {
-        allocate(compute_3: Pointer(global float32), float32, [4096]), storage_scope = global {
-          for (i.outer.inner: int32, 0, 2) {
-            for (nb_j.inner: int32, 0, 2) {
-              for (i.inner.init: int32, 0, 64) {
-                let cse_var_1: int32 = (((i.outer.inner*2048) + (i.inner.init*32)) + (nb_j.inner*16))
-                 {
-                  compute_4: Buffer(compute_3, float32, [4096], [])[cse_var_1] = 0f32
-                  compute_4[(cse_var_1 + 1)] = 0f32
-                  compute_4[(cse_var_1 + 2)] = 0f32
-                  compute_4[(cse_var_1 + 3)] = 0f32
-                  compute_4[(cse_var_1 + 4)] = 0f32
-                  compute_4[(cse_var_1 + 5)] = 0f32
-                  compute_4[(cse_var_1 + 6)] = 0f32
-                  compute_4[(cse_var_1 + 7)] = 0f32
-                  compute_4[(cse_var_1 + 8)] = 0f32
-                  compute_4[(cse_var_1 + 9)] = 0f32
-                  compute_4[(cse_var_1 + 10)] = 0f32
-                  compute_4[(cse_var_1 + 11)] = 0f32
-                  compute_4[(cse_var_1 + 12)] = 0f32
-                  compute_4[(cse_var_1 + 13)] = 0f32
-                  compute_4[(cse_var_1 + 14)] = 0f32
-                  compute_4[(cse_var_1 + 15)] = 0f32
-                }
+      for (i0.outer.i1.outer.fused: int32, 0, 64) "parallel" {
+        allocate(compute_3: Pointer(global float32), float32, [1024]), storage_scope = global {
+          for (nb_j.inner: int32, 0, 2) {
+            for (i.inner.init: int32, 0, 32) {
+              let cse_var_1: int32 = ((i.inner.init*32) + (nb_j.inner*16))
+               {
+                compute_4: Buffer(compute_3, float32, [1024], [])[cse_var_1] = 0f32
+                compute_4[(cse_var_1 + 1)] = 0f32
+                compute_4[(cse_var_1 + 2)] = 0f32
+                compute_4[(cse_var_1 + 3)] = 0f32
+                compute_4[(cse_var_1 + 4)] = 0f32
+                compute_4[(cse_var_1 + 5)] = 0f32
+                compute_4[(cse_var_1 + 6)] = 0f32
+                compute_4[(cse_var_1 + 7)] = 0f32
+                compute_4[(cse_var_1 + 8)] = 0f32
+                compute_4[(cse_var_1 + 9)] = 0f32
+                compute_4[(cse_var_1 + 10)] = 0f32
+                compute_4[(cse_var_1 + 11)] = 0f32
+                compute_4[(cse_var_1 + 12)] = 0f32
+                compute_4[(cse_var_1 + 13)] = 0f32
+                compute_4[(cse_var_1 + 14)] = 0f32
+                compute_4[(cse_var_1 + 15)] = 0f32
               }
-              for (elem_idx: int32, 0, let cse_var_2: int32 = ((i0.outer.i1.outer.fused*2) + nb_j.inner) in (placeholder_15: Buffer(placeholder_13, int32, [33], [])[(cse_var_2 + 1)] - placeholder_15[cse_var_2])) {
-                for (i.inner: int32, 0, 64) {
-                  let cse_var_21: int32 = (elem_idx*16)
-                  let cse_var_20: int32 = ((i0.outer.i1.outer.fused*2) + nb_j.inner)
-                  let cse_var_19: int32 = ((i.outer.inner*16384) + (i.inner*256))
-                  let cse_var_18: int32 = (((i.outer.inner*2048) + (i.inner*32)) + (nb_j.inner*16))
-                  let cse_var_17: int32 = (cse_var_18 + 9)
-                  let cse_var_16: int32 = (cse_var_18 + 8)
-                  let cse_var_15: int32 = (cse_var_18 + 7)
-                  let cse_var_14: int32 = (cse_var_18 + 6)
-                  let cse_var_13: int32 = (cse_var_18 + 5)
-                  let cse_var_12: int32 = (cse_var_18 + 4)
-                  let cse_var_11: int32 = (cse_var_18 + 3)
-                  let cse_var_10: int32 = (cse_var_18 + 2)
-                  let cse_var_9: int32 = (cse_var_18 + 15)
-                  let cse_var_8: int32 = (cse_var_18 + 14)
-                  let cse_var_7: int32 = (cse_var_18 + 13)
-                  let cse_var_6: int32 = (cse_var_18 + 12)
-                  let cse_var_5: int32 = (cse_var_18 + 11)
-                  let cse_var_4: int32 = (cse_var_18 + 10)
-                  let cse_var_3: int32 = (cse_var_18 + 1)
-                   {
-                    compute_4[cse_var_18] = (compute_4[cse_var_18] + (placeholder_16: Buffer(placeholder_11, float32, [78656], [])[((placeholder_15[cse_var_20]*16) + cse_var_21)]*max(placeholder_17: Buffer(placeholder_10, float32, [32768], [])[(cse_var_19 + placeholder_18: Buffer(placeholder_12, int32, [4916], [])[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                    compute_4[cse_var_3] = (compute_4[cse_var_3] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 1)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                    compute_4[cse_var_10] = (compute_4[cse_var_10] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 2)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                    compute_4[cse_var_11] = (compute_4[cse_var_11] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 3)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                    compute_4[cse_var_12] = (compute_4[cse_var_12] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 4)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                    compute_4[cse_var_13] = (compute_4[cse_var_13] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 5)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                    compute_4[cse_var_14] = (compute_4[cse_var_14] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 6)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                    compute_4[cse_var_15] = (compute_4[cse_var_15] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 7)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                    compute_4[cse_var_16] = (compute_4[cse_var_16] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 8)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                    compute_4[cse_var_17] = (compute_4[cse_var_17] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 9)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                    compute_4[cse_var_4] = (compute_4[cse_var_4] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 10)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                    compute_4[cse_var_5] = (compute_4[cse_var_5] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 11)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                    compute_4[cse_var_6] = (compute_4[cse_var_6] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 12)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                    compute_4[cse_var_7] = (compute_4[cse_var_7] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 13)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                    compute_4[cse_var_8] = (compute_4[cse_var_8] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 14)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                    compute_4[cse_var_9] = (compute_4[cse_var_9] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 15)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                  }
+            }
+            for (elem_idx: int32, 0, let cse_var_2: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) in (placeholder_15: Buffer(placeholder_13, int32, [33], [])[(cse_var_2 + 1)] - placeholder_15[cse_var_2])) {
+              for (i.inner: int32, 0, 32) {
+                let cse_var_21: int32 = (elem_idx*16)
+                let cse_var_20: int32 = ((i.inner*32) + (nb_j.inner*16))
+                let cse_var_19: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)
+                let cse_var_18: int32 = ((floordiv(i0.outer.i1.outer.fused, 16)*8192) + (i.inner*256))
+                let cse_var_17: int32 = (cse_var_20 + 9)
+                let cse_var_16: int32 = (cse_var_20 + 8)
+                let cse_var_15: int32 = (cse_var_20 + 7)
+                let cse_var_14: int32 = (cse_var_20 + 6)
+                let cse_var_13: int32 = (cse_var_20 + 5)
+                let cse_var_12: int32 = (cse_var_20 + 4)
+                let cse_var_11: int32 = (cse_var_20 + 3)
+                let cse_var_10: int32 = (cse_var_20 + 2)
+                let cse_var_9: int32 = (cse_var_20 + 15)
+                let cse_var_8: int32 = (cse_var_20 + 14)
+                let cse_var_7: int32 = (cse_var_20 + 13)
+                let cse_var_6: int32 = (cse_var_20 + 12)
+                let cse_var_5: int32 = (cse_var_20 + 11)
+                let cse_var_4: int32 = (cse_var_20 + 10)
+                let cse_var_3: int32 = (cse_var_20 + 1)
+                 {
+                  compute_4[cse_var_20] = (compute_4[cse_var_20] + (placeholder_16: Buffer(placeholder_11, float32, [78656], [])[((placeholder_15[cse_var_19]*16) + cse_var_21)]*max(placeholder_17: Buffer(placeholder_10, float32, [32768], [])[(cse_var_18 + placeholder_18: Buffer(placeholder_12, int32, [4916], [])[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+                  compute_4[cse_var_3] = (compute_4[cse_var_3] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 1)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+                  compute_4[cse_var_10] = (compute_4[cse_var_10] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 2)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+                  compute_4[cse_var_11] = (compute_4[cse_var_11] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 3)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+                  compute_4[cse_var_12] = (compute_4[cse_var_12] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 4)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+                  compute_4[cse_var_13] = (compute_4[cse_var_13] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 5)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+                  compute_4[cse_var_14] = (compute_4[cse_var_14] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 6)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+                  compute_4[cse_var_15] = (compute_4[cse_var_15] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 7)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+                  compute_4[cse_var_16] = (compute_4[cse_var_16] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 8)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+                  compute_4[cse_var_17] = (compute_4[cse_var_17] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 9)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+                  compute_4[cse_var_4] = (compute_4[cse_var_4] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 10)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+                  compute_4[cse_var_5] = (compute_4[cse_var_5] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 11)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+                  compute_4[cse_var_6] = (compute_4[cse_var_6] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 12)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+                  compute_4[cse_var_7] = (compute_4[cse_var_7] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 13)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+                  compute_4[cse_var_8] = (compute_4[cse_var_8] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 14)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+                  compute_4[cse_var_9] = (compute_4[cse_var_9] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 15)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
                 }
               }
             }
           }
-          for (i0.inner: int32, 0, 128) {
+          for (i0.inner: int32, 0, 32) {
             for (i1.inner: int32, 0, 32) {
-              let cse_var_22: int32 = (((i0.inner*512) + (i0.outer.i1.outer.fused*32)) + i1.inner)
+              let cse_var_22: int32 = ((((floordiv(i0.outer.i1.outer.fused, 16)*16384) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32)) + i1.inner)
               compute_5: Buffer(compute_2, float32, [65536], [])[cse_var_22] = max((compute_4[((i0.inner*32) + i1.inner)] + placeholder_19: Buffer(placeholder_14, float32, [65536], [])[cse_var_22]), 0f32)
             }
           }
@@ -514,7 +512,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 1.856 ms
+    Execution time of this operator: 1.735 ms
 
 
 
diff --git a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
index a36006c5fe..c65bba6135 100644
--- a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
 
 Computation times
 =================
-**00:37.080** total execution time for **how_to_tune_with_autotvm** files:
+**00:37.316** total execution time for **how_to_tune_with_autotvm** files:
 
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)           | 00:37.044 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)           | 00:37.281 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)               | 00:00.021 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)               | 00:00.020 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``)             | 00:00.005 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
index 0de9164187..ba24e7a174 100644
--- a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
@@ -387,7 +387,7 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 4, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 128]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9469753
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 256, 1, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 256, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,610123
     No: 2   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
@@ -510,161 +510,149 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 4, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 256]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,767289
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 32, 16, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 256, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9939639
     No: 3   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
-      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 738, in __call__
-        yield remote, remote.load_module(os.path.split(build_result.filename)[1])
-      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 702, in run_through_rpc
-        costs = time_f(*args).results
-      File "/workspace/python/tvm/runtime/module.py", line 357, in evaluator
-        blob = feval(*args)
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 142, in build
+        res = future.result()
+      File "/usr/lib/python3.7/concurrent/futures/_base.py", line 435, in result
+        return self.__get_result()
+      File "/usr/lib/python3.7/concurrent/futures/_base.py", line 384, in __get_result
+        raise self._exception
+      File "/usr/lib/python3.7/concurrent/futures/thread.py", line 57, in run
+        result = self.fn(*self.args, **self.kwargs)
+      File "/workspace/python/tvm/contrib/popen_pool.py", line 432, in <lambda>
+        worker = lambda *args: self._worker_run(*args)
+      File "/workspace/python/tvm/contrib/popen_pool.py", line 401, in _worker_run
+        return proc.recv()
+      File "/workspace/python/tvm/contrib/popen_pool.py", line 309, in recv
+        raise TimeoutError()
+    TimeoutError
+
+            [('tile_f', [-1, 4, 1, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4192266
+    No: 4   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
+        func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
+        func = build(s, args, target_host=task.target_host, runtime=runtime)
+      File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+        input_mod = lower(inputs, args, name=name, binds=binds)
+      File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+        return ffi.lower_schedule(inp, args, name, binds, simple_mode)
       File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
-      File "tvm/_ffi/_cython/./packed_func.pxi", line 262, in tvm._ffi._cy3.core.FuncCall
-      File "tvm/_ffi/_cython/./packed_func.pxi", line 251, in tvm._ffi._cy3.core.FuncCall3
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
       File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
     tvm._ffi.base.TVMError: Traceback (most recent call last):
-      4: TVMFuncCall
+      24: TVMFuncCall
             at ../src/runtime/c_runtime_api.cc:477
-      3: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
-            at ../include/tvm/runtime/packed_func.h:1217
-      2: tvm::runtime::RPCWrappedFunc::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
-            at ../src/runtime/rpc/rpc_module.cc:129
-      1: tvm::runtime::RPCClientSession::CallFunc(void*, TVMValue const*, int const*, int, std::function<void (tvm::runtime::TVMArgs)> const&)
-            at ../src/runtime/rpc/rpc_endpoint.cc:1012
-      0: tvm::runtime::RPCEndpoint::CallFunc(void*, TVMValue const*, int const*, int, std::function<void (tvm::runtime::TVMArgs)>)
-            at ../src/runtime/rpc/rpc_endpoint.cc:804
-      File "../src/runtime/rpc/rpc_endpoint.cc", line 804
-    TVMError: 
-    ---------------------------------------------------------------
-    An error occurred during the execution of TVM.
-    For more information, please see: https://tvm.apache.org/docs/errors.html
-    ---------------------------------------------------------------
-      Check failed: (code == RPCCode::kReturn) is false: code=kShutdown
-
-    During handling of the above exception, another exception occurred:
-
-    Traceback (most recent call last):
-      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 702, in run_through_rpc
-        costs = time_f(*args).results
-      File "/usr/lib/python3.7/contextlib.py", line 130, in __exit__
-        self.gen.throw(type, value, traceback)
-      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 742, in __call__
-        remote.remove(build_result.filename)
-      File "/workspace/python/tvm/rpc/client.py", line 144, in remove
-        self._remote_funcs["remove"] = self.get_function("tvm.rpc.server.remove")
-      File "/workspace/python/tvm/rpc/client.py", line 72, in get_function
-        return self._sess.get_function(name)
-      File "/workspace/python/tvm/runtime/module.py", line 171, in get_function
-        self.handle, c_str(name), ctypes.c_int(query_imports), ctypes.byref(ret_handle)
-      File "/workspace/python/tvm/_ffi/base.py", line 348, in check_call
-        raise get_last_ffi_error()
-    tvm._ffi.base.TVMError: Traceback (most recent call last):
-      52: 0xffffffffffffffff
-      51: _start
-      50: __libc_start_main
-      49: _Py_UnixMain
-      48: 0x0000000000650da0
-      47: 0x0000000000650afa
-      46: _PyFunction_FastCallDict
-      45: _PyEval_EvalCodeWithName
-      44: _PyEval_EvalFrameDefault
-      43: _PyFunction_FastCallKeywords
-      42: _PyEval_EvalCodeWithName
-      41: _PyEval_EvalFrameDefault
-      40: _PyMethodDef_RawFastCallKeywords
-      39: 0x0000000000546369
-      38: _PyEval_EvalCodeWithName
-      37: _PyEval_EvalFrameDefault
-      36: _PyFunction_FastCallKeywords
-      35: _PyEval_EvalCodeWithName
-      34: _PyEval_EvalFrameDefault
-      33: _PyFunction_FastCallDict
-      32: _PyEval_EvalCodeWithName
-      31: _PyEval_EvalFrameDefault
-      30: _PyObject_FastCallDict
-      29: 0x00000000004c06e1
-      28: _PyFunction_FastCallDict
-      27: _PyEval_EvalFrameDefault
-      26: _PyMethodDescr_FastCallKeywords
-      25: 0x00000000005dcb58
-      24: 0x00000000005dc83f
-      23: 0x00000000004ba127
-      22: _PyEval_EvalFrameDefault
-      21: _PyFunction_FastCallKeywords
-      20: _PyEval_EvalFrameDefault
-      19: _PyFunction_FastCallKeywords
-      18: _PyEval_EvalFrameDefault
-      17: _PyFunction_FastCallKeywords
-      16: _PyEval_EvalCodeWithName
-      15: _PyEval_EvalFrameDefault
-      14: 0x0000000000537c30
-      13: _PyObject_FastCallKeywords
-      12: 0x00007fd33478ffa2
-      11: _ctypes_callproc
-      10: ffi_call
-      9: ffi_call_unix64
-      8: TVMModGetFunction
-            at ../src/runtime/c_runtime_api.cc:408
-      7: tvm::runtime::ModuleNode::GetFunction(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool)
-            at ../src/runtime/module.cc:66
-      6: tvm::runtime::RPCModuleNode::GetFunction(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)
-            at ../src/runtime/rpc/rpc_module.cc:185
-      5: tvm::runtime::RPCClientSession::GetFunction(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)
-            at ../src/runtime/rpc/rpc_endpoint.cc:1007
-      4: tvm::runtime::TVMRetValue tvm::runtime::RPCEndpoint::SysCallRemote<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>(tvm::runtime::RPCCode, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)
-            at ../src/runtime/rpc/rpc_endpoint.h:223
-      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>(int&&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) const
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1731
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1671
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1631
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1631
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1631
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1631
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1631
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1646
+      13: operator()
+            at ../src/driver/driver_api.cc:388
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:374
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:269
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:453
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1750
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1694
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
             at ../include/tvm/runtime/packed_func.h:1618
       2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
             at ../include/tvm/runtime/packed_func.h:1217
       1: Call
             at ../include/tvm/runtime/packed_func.h:1213
       0: operator()
-            at ../src/runtime/rpc/rpc_endpoint.cc:684
-      File "../src/runtime/rpc/rpc_endpoint.cc", line 684
-    TVMError: 
-    ---------------------------------------------------------------
-    An error occurred during the execution of TVM.
-    For more information, please see: https://tvm.apache.org/docs/errors.html
-    ---------------------------------------------------------------
-      Check failed: (code == RPCCode::kReturn) is false: code=1
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
 
     Traceback (most recent call last):
-      52: 0xffffffffffffffff
-      51: _start
-      50: __libc_start_main
-      49: _Py_UnixMain
-      48: 0x0000000000650da0
-      47: 0x0000000000650afa
-      46: _PyFunction_FastCallDict
-      45: _PyEval_EvalCodeWithName
-      44: _PyEval_EvalFrameDefault
-      43: _PyFunction_FastCallKeywords
-      42: _PyEval_EvalCodeWithName
-      41: _PyEval_EvalFrameDefault
-      40: _PyMethodDef_RawFastCallKeywords
-      39: 0x0000000000546369
-      38: _PyEval_EvalCodeWithName
-      37: _PyEval_EvalFrameDefault
-      36: _PyFunction_FastCallKeywords
-      35: _PyEval_EvalCodeWithName
-      34: _PyEval_EvalFrameDefault
-      33: _PyFunction_FastCallDict
-      32: _PyEval_EvalCodeWithName
-      31: _PyEval_EvalFrameDefault
-      30: _PyObject_FastCallDict
-      29: 0x00000000004c06e1
-      28: _PyFunction_FastCallDict
-      27: _PyEval_EvalFrameDefault
-      26: _PyMethodDescr_FastCallKeywords
-      25: 0x00000000005dcb58
-      24: 0x00000000005dc83f
-      23: 0x00000000004ba127
-      22: _PyEval_EvalFrameDefault
-      21: _PyFunction_FastCallKeywords
-      20: _PyEval_EvalFrameDefault
-      19: _PyFunction_FastCall      [('tile_f', [-1, 128, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,234317
-    No: 4   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1731
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1671
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1631
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1631
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1631
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1631
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1631
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1646
+      13: operator()
+            at ../src/driver/driver_api.cc:388
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:374
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:269
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:453
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1750
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1694
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1618
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 32, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9871662
+    No: 5   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -786,8 +774,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 1, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 256]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8703999
-    No: 5   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 2, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 8, 64]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8690190
+    No: 6   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -909,8 +897,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 2, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,771685
-    No: 6   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 1, 8]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 64]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4234258
+    No: 7   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1032,9 +1020,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 64, 1, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5535966
-    No: 7   GFLOPS: 7.23/7.23       result: MeasureResult(costs=(0.032004703499999995,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.247319459915161, timestamp=1668621631.5396304)        [('tile_f', [-1, 4, 4, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1212537
-    No: 8   GFLOPS: 0.00/7.23       result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 32, 16]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 128, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6451904
+    No: 8   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1156,9 +1143,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 16, 4, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 16, 32]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9641963
-    No: 9   GFLOPS: 920.13/920.13   result: MeasureResult(costs=(0.0002515955407407407,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.322073221206665, timestamp=1668621633.0297952)       [('tile_f', [-1, 2, 4, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 8, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9305633
-    No: 10  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 16, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4628103
+    No: 9   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1280,8 +1266,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 16, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2687499
-    No: 11  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 16, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5689328
+    No: 10  GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1403,8 +1389,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 8, 8]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 128]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10240277
-    No: 12  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 16, 1, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 128, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2611284
+    No: 11  GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1526,10 +1512,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 2, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6876512
-    No: 13  GFLOPS: 0.83/920.13     result: MeasureResult(costs=(0.27770135475,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.8979127407073975, timestamp=1668621642.613656)       [('tile_f', [-1, 8, 1, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,1757548
-    No: 14  GFLOPS: 3.64/920.13     result: MeasureResult(costs=(0.0636859435,), error_no=MeasureErrorNo.NO_ERROR, all_cost=6.455172538757324, timestamp=1668621644.0596006)        [('tile_f', [-1, 2, 8, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10262495
-    No: 15  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 16, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9699617
+    No: 12  GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1651,8 +1635,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 1, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6344106
-    No: 16  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 4, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2453682
+    No: 13  GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1774,8 +1758,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 2, 256]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 16, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,329338
-    No: 17  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 16, 2, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 128, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9125614
+    No: 14  GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1897,8 +1881,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 4, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 2, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6744780
-    No: 18  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 8, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3179839
+    No: 15  GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2020,8 +2004,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 16, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 256, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4675477
-    No: 19  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 16, 1, 32]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2013189
+    No: 16  GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2143,8 +2127,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 16, 8]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9950760
-    No: 20  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 16, 1, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6925484
+    No: 17  GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2266,7 +2250,10 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 4, 8]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2310591
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 32, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5745872
+    No: 18  GFLOPS: 119.53/119.53   result: MeasureResult(costs=(0.0019367525892857143,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.878584861755371, timestamp=1668632874.2540498)       [('tile_f', [-1, 2, 64, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3604746
+    No: 19  GFLOPS: 133.82/133.82   result: MeasureResult(costs=(0.001729959672413793,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.642688274383545, timestamp=1668632874.896128) [('tile_f', [-1, 16, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9684423
+    No: 20  GFLOPS: 63.65/133.82    result: MeasureResult(costs=(0.0036371566071428567,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.447140693664551, timestamp=1668632875.603516)        [('tile_f', [-1, 4, 1, 8]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3882918
 
 
 
@@ -2321,9 +2308,9 @@ and measure running time.
     Finish loading 20 records
 
     Best config:
-    [('tile_f', [-1, 2, 4, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 8, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9305633
+    [('tile_f', [-1, 16, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9684423
     Finish loading 20 records
-    Time cost of this operator: 0.000535
+    Time cost of this operator: 0.002187
 
 
 
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
index 2e15467889..fbe93955ff 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
@@ -327,10 +327,10 @@ Timing the untuned program
     ########## Build without Autotuning ##########
     Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  Measurements(us)  
     ---------                                     ---                                           --------  -------  -----              ------  -------  ----------------  
-    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  313.7     98.74    (1, 2, 10, 10, 3)  2       1        [313.7]           
-    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.017     0.95     (1, 6, 10, 10)     1       1        [3.017]           
-    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.987     0.311    (1, 1, 10, 10, 3)  1       1        [0.987]           
-    Total_time                                    -                                             317.704   -        -                  -       -        -                 
+    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  310.4     98.711   (1, 2, 10, 10, 3)  2       1        [310.4]           
+    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.08      0.98     (1, 6, 10, 10)     1       1        [3.08]            
+    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.973     0.309    (1, 1, 10, 10, 3)  1       1        [0.973]           
+    Total_time                                    -                                             314.453   -        -                  -       -        -                 
 
 
 
@@ -394,10 +394,10 @@ Timing the tuned program
     ########## Build with Autotuning ##########
     Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  Measurements(us)  
     ---------                                     ---                                           --------  -------  -----              ------  -------  ----------------  
-    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  100.4     97.289   (1, 6, 10, 10, 1)  2       1        [100.4]           
-    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.811     1.755    (1, 6, 10, 10)     1       1        [1.811]           
-    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.986     0.956    (1, 1, 10, 10, 3)  1       1        [0.986]           
-    Total_time                                    -                                             103.197   -        -                  -       -        -                 
+    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  121.3     97.76    (1, 6, 10, 10, 1)  2       1        [121.3]           
+    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.82      1.467    (1, 6, 10, 10)     1       1        [1.82]            
+    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.959     0.773    (1, 1, 10, 10, 3)  1       1        [0.959]           
+    Total_time                                    -                                             124.079   -        -                  -       -        -                 
 
 
 
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
index 95eee03d3b..2b8ba9b29f 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
@@ -109,7 +109,7 @@ download a cat image and preprocess it to use as the model input.
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/ao/quantization/utils.py:281: UserWarning: must run observer before calling calculate_qparams. Returning default values.
       "must run observer before calling calculate_qparams. " +
     Downloading: "https://download.pytorch.org/models/quantized/mobilenet_v2_qnnpack_37f702c5.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2_qnnpack_37f702c5.pth
-
      0%|          | 0.00/3.42M [00:00<?, ?B/s]
    100%|##########| 3.42M/3.42M [00:00<00:00, 85.4MB/s]
+
      0%|          | 0.00/3.42M [00:00<?, ?B/s]
     72%|#######1  | 2.46M/3.42M [00:00<00:00, 23.5MB/s]
    100%|##########| 3.42M/3.42M [00:00<00:00, 30.7MB/s]
     /workspace/python/tvm/relay/frontend/pytorch_utils.py:47: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
       return LooseVersion(torch_ver) > ver
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/setuptools/_distutils/version.py:346: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
@@ -314,7 +314,7 @@ Look up prediction top 1 index in 1000 class synset.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  1.947 seconds)
+   **Total running time of the script:** ( 1 minutes  2.708 seconds)
 
 
 .. _sphx_glr_download_how_to_work_with_microtvm_micro_pytorch.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
index 4a3148e411..1acd348e55 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
@@ -225,7 +225,7 @@ take about **2 minutes** to download the Stanford Cars, while COCO 2017 validati
  .. code-block:: none
 
 
-    '/tmp/tmpl8_9f7vj/images/random'
+    '/tmp/tmp8f79bv0e/images/random'
 
 
 
@@ -316,7 +316,7 @@ objects to other stuff? We can display some examples from our datasets using ``m
 
 
 .. image-sg:: /how_to/work_with_microtvm/images/sphx_glr_micro_train_001.png
-   :alt: [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]
+   :alt: [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]
    :srcset: /how_to/work_with_microtvm/images/sphx_glr_micro_train_001.png
    :class: sphx-glr-single-img
 
@@ -325,8 +325,8 @@ objects to other stuff? We can display some examples from our datasets using ``m
 
  .. code-block:: none
 
-    /tmp/tmpl8_9f7vj/images/target contains 8144 images
-    /tmp/tmpl8_9f7vj/images/random contains 5000 images
+    /tmp/tmp8f79bv0e/images/target contains 8144 images
+    /tmp/tmp8f79bv0e/images/random contains 5000 images
 
 
 
@@ -501,13 +501,13 @@ the time on our validation set).
  .. code-block:: none
 
     Epoch 1/3
-    328/328 - 47s - loss: 0.2049 - accuracy: 0.9255 - val_loss: 0.1223 - val_accuracy: 0.9622 - 47s/epoch - 143ms/step
+    328/328 - 46s - loss: 0.2469 - accuracy: 0.9177 - val_loss: 0.1360 - val_accuracy: 0.9517 - 46s/epoch - 141ms/step
     Epoch 2/3
-    328/328 - 43s - loss: 0.0911 - accuracy: 0.9663 - val_loss: 0.0991 - val_accuracy: 0.9690 - 43s/epoch - 131ms/step
+    328/328 - 43s - loss: 0.1019 - accuracy: 0.9630 - val_loss: 0.1356 - val_accuracy: 0.9547 - 43s/epoch - 131ms/step
     Epoch 3/3
-    328/328 - 43s - loss: 0.0602 - accuracy: 0.9778 - val_loss: 0.1140 - val_accuracy: 0.9630 - 43s/epoch - 131ms/step
+    328/328 - 43s - loss: 0.0745 - accuracy: 0.9730 - val_loss: 0.1347 - val_accuracy: 0.9569 - 43s/epoch - 131ms/step
 
-    <keras.callbacks.History object at 0x7f5326724510>
+    <keras.callbacks.History object at 0x7f69b7684c90>
 
 
 
@@ -864,7 +864,7 @@ Arduino tutorial for how to do that `on GitHub <https://github.com/guberti/tvm-a
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 4 minutes  38.559 seconds)
+   **Total running time of the script:** ( 4 minutes  39.935 seconds)
 
 
 .. _sphx_glr_download_how_to_work_with_microtvm_micro_train.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
index 56387207c8..441634ebcd 100644
--- a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
@@ -5,20 +5,20 @@
 
 Computation times
 =================
-**06:41.491** total execution time for **how_to_work_with_microtvm** files:
+**06:44.796** total execution time for **how_to_work_with_microtvm** files:
 
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``)               | 04:38.559 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``)               | 04:39.935 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_pytorch.py` (``micro_pytorch.py``)           | 01:01.947 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_pytorch.py` (``micro_pytorch.py``)           | 01:02.708 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``)         | 00:49.192 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``)         | 00:49.942 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``)                   | 00:08.118 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``)                   | 00:08.359 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``)             | 00:03.672 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``)             | 00:03.850 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_reference_vm.py` (``micro_reference_vm.py``) | 00:00.001 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_reference_vm.py` (``micro_reference_vm.py``) | 00:00.002 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_work_with_microtvm_micro_ethosu.py` (``micro_ethosu.py``)             | 00:00.001 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
index 083992c8ae..7a85717f95 100644
--- a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
 
 Computation times
 =================
-**00:42.944** total execution time for **how_to_work_with_relay** files:
+**00:43.710** total execution time for **how_to_work_with_relay** files:
 
 +----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:31.089 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:32.016 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``)           | 00:10.213 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``)           | 00:10.162 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``)                             | 00:01.636 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``)                             | 00:01.526 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_work_with_relay_using_relay_viz.py` (``using_relay_viz.py``)                 | 00:00.007 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
index 8baeb9f701..37c7b45358 100644
--- a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
@@ -261,7 +261,7 @@ The following example customizes CUDA lowering rule for :code:`exp`.
  .. code-block:: none
 
 
-    <function my_cuda_math_rule at 0x7f533893d200>
+    <function my_cuda_math_rule at 0x7f6a15288cb0>
 
 
 
diff --git a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
index c57f6544f4..59be57f030 100644
--- a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
@@ -5,22 +5,22 @@
 
 Computation times
 =================
-**00:06.268** total execution time for **how_to_work_with_schedules** files:
+**00:07.284** total execution time for **how_to_work_with_schedules** files:
 
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``)                 | 00:04.007 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``)                 | 00:04.902 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``)                     | 00:00.978 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``)                     | 00:01.041 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``)                     | 00:00.546 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``)                     | 00:00.576 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``)                               | 00:00.529 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``)                               | 00:00.556 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``)                     | 00:00.113 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``)                     | 00:00.114 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``) | 00:00.048 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``)                               | 00:00.028 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``)               | 00:00.019 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``)               | 00:00.018 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
index 40d3eebd15..4b1af09dfd 100644
--- a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
@@ -343,7 +343,7 @@ The importing needs to happen before the tensorized GEMV being executed.
                  B: Buffer(B_2: Pointer(float32), float32, [512, 64], []),
                  C: Buffer(C_2: Pointer(float32), float32, [1024, 512], [])}
       buffer_map = {A_1: A, B_1: B, C_1: C} {
-      attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmp8kh4i1du/input0.cc'\nsource_filename = \"/tmp/tmp8kh4i1du/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = alloca float*, align 8\n  %8 = alloca float*, align 8\n  %9 = alloca floa [...]
+      attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmp_0ju_8iw/input0.cc'\nsource_filename = \"/tmp/tmp_0ju_8iw/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = alloca float*, align 8\n  %8 = alloca float*, align 8\n  %9 = alloca floa [...]
       for (i, 0, 1024) {
         for (j.outer: int32, 0, 32) {
           @tir.call_extern("gemv_update", @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
index fa1e1cbf28..372dc4a5e0 100644
--- a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:26.011** total execution time for **topic_vta_tutorials_autotvm** files:
+**00:27.200** total execution time for **topic_vta_tutorials_autotvm** files:
 
 +---------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:26.005 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:27.193 | 0.0 MB |
 +---------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``)     | 00:00.006 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``)     | 00:00.007 | 0.0 MB |
 +---------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
index 27a997a1dc..896e18cc2d 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
@@ -289,7 +289,7 @@ The compilation steps are:
       DeprecationWarning,
     /workspace/vta/tutorials/frontend/deploy_classification.py:213: DeprecationWarning: legacy graph executor behavior of producing json / lib / params will be removed in the next release. Please see documents of tvm.contrib.graph_executor.GraphModule for the  new recommended usage.
       relay_prog, target=tvm.target.Target(target, host=env.target_host), params=params
-    resnet18_v1 inference graph built in 28.53s!
+    resnet18_v1 inference graph built in 29.28s!
 
 
 
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
index 243351edde..e6a9648446 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
@@ -333,7 +333,7 @@ The compilation steps are:
 
     /workspace/python/tvm/relay/build_module.py:348: DeprecationWarning: Please use input parameter mod (tvm.IRModule) instead of deprecated parameter mod (tvm.relay.function.Function)
       DeprecationWarning,
-    yolov3-tiny inference graph built in 19.39s!
+    yolov3-tiny inference graph built in 19.54s!
 
 
 
diff --git a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
index 3cf60dba8d..739d52e042 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**01:39.922** total execution time for **topic_vta_tutorials_frontend** files:
+**01:40.945** total execution time for **topic_vta_tutorials_frontend** files:
 
 +------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``)           | 00:51.401 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``)           | 00:51.743 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:48.521 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:49.203 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
index 3c2f821255..c5670cacb8 100644
--- a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:03.058** total execution time for **topic_vta_tutorials_optimize** files:
+**00:03.197** total execution time for **topic_vta_tutorials_optimize** files:
 
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``)         | 00:02.634 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``)         | 00:02.717 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.425 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.480 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
index b63910e565..653bd1ad1b 100644
--- a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:00.744** total execution time for **topic_vta_tutorials** files:
+**00:00.821** total execution time for **topic_vta_tutorials** files:
 
 +---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.401 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.436 | 0.0 MB |
 +---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.343 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.385 | 0.0 MB |
 +---------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
index de4805ceae..d1dc3318cb 100644
--- a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
@@ -325,7 +325,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 94.181 ms
+    Execution time of this operator: 93.662 ms
 
 
 
@@ -443,7 +443,7 @@ operations.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  10.892 seconds)
+   **Total running time of the script:** ( 1 minutes  14.468 seconds)
 
 
 .. _sphx_glr_download_tutorial_auto_scheduler_matmul_x86.py:
diff --git a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
index 08690d8d02..97b608b19b 100644
--- a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
@@ -450,16 +450,16 @@ reduce variance, we take 5 measurements and average them.
     waiting for device...
     device available
     Get devices for measurement successfully!
-    No: 1   GFLOPS: 12.40/12.40     result: MeasureResult(costs=(0.0216560844,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4955587387084961, timestamp=1668620275.027664)        [('tile_y', [-1, 2]), ('tile_x', [-1, 512])],None,91
-    No: 2   GFLOPS: 2.13/12.40      result: MeasureResult(costs=(0.1261618544,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.1588635444641113, timestamp=1668620277.934932)        [('tile_y', [-1, 1]), ('tile_x', [-1, 8])],None,30
-    No: 3   GFLOPS: 14.48/14.48     result: MeasureResult(costs=(0.0185341218,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5047709941864014, timestamp=1668620278.394096)        [('tile_y', [-1, 32]), ('tile_x', [-1, 64])],None,65
-    No: 4   GFLOPS: 9.95/14.48      result: MeasureResult(costs=(0.026977757800000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.9039874076843262, timestamp=1668620279.726128)        [('tile_y', [-1, 16]), ('tile_x', [-1, 128])],None,74
-    No: 5   GFLOPS: 4.16/14.48      result: MeasureResult(costs=(0.0645887294,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.1997129917144775, timestamp=1668620281.0469656)       [('tile_y', [-1, 16]), ('tile_x', [-1, 16])],None,44
-    No: 6   GFLOPS: 0.45/14.48      result: MeasureResult(costs=(0.5967149748,), error_no=MeasureErrorNo.NO_ERROR, all_cost=9.683677673339844, timestamp=1668620290.7656863)        [('tile_y', [-1, 512]), ('tile_x', [-1, 1])],None,9
-    No: 7   GFLOPS: 2.81/14.48      result: MeasureResult(costs=(0.0954460474,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.671795129776001, timestamp=1668620293.1846426)        [('tile_y', [-1, 512]), ('tile_x', [-1, 8])],None,39
-    No: 8   GFLOPS: 12.32/14.48     result: MeasureResult(costs=(0.0217822392,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5574278831481934, timestamp=1668620293.7427123)       [('tile_y', [-1, 8]), ('tile_x', [-1, 256])],None,83
-    No: 9   GFLOPS: 10.72/14.48     result: MeasureResult(costs=(0.0250293714,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5174264907836914, timestamp=1668620294.3726704)       [('tile_y', [-1, 2]), ('tile_x', [-1, 64])],None,61
-    No: 10  GFLOPS: 1.91/14.48      result: MeasureResult(costs=(0.1406689588,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.3599817752838135, timestamp=1668620296.7884088)       [('tile_y', [-1, 512]), ('tile_x', [-1, 4])],None,29
+    No: 1   GFLOPS: 10.38/10.38     result: MeasureResult(costs=(0.025861730399999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5724539756774902, timestamp=1668631496.56468) [('tile_y', [-1, 2]), ('tile_x', [-1, 64])],None,61
+    No: 2   GFLOPS: 3.42/10.38      result: MeasureResult(costs=(0.0783984658,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4411935806274414, timestamp=1668631498.7478187)       [('tile_y', [-1, 256]), ('tile_x', [-1, 16])],None,48
+    No: 3   GFLOPS: 0.51/10.38      result: MeasureResult(costs=(0.5274176902,), error_no=MeasureErrorNo.NO_ERROR, all_cost=8.57837200164795, timestamp=1668631508.0970442) [('tile_y', [-1, 128]), ('tile_x', [-1, 1])],None,7
+    No: 4   GFLOPS: 0.89/10.38      result: MeasureResult(costs=(0.30190693999999996,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.96060585975647, timestamp=1668631513.0902867)  [('tile_y', [-1, 256]), ('tile_x', [-1, 2])],None,18
+    No: 5   GFLOPS: 12.42/12.42     result: MeasureResult(costs=(0.021617002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5116605758666992, timestamp=1668631513.718003) [('tile_y', [-1, 32]), ('tile_x', [-1, 128])],None,75
+    No: 6   GFLOPS: 2.29/12.42      result: MeasureResult(costs=(0.11722831859999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.0336122512817383, timestamp=1668631515.7642033)        [('tile_y', [-1, 512]), ('tile_x', [-1, 16])],None,49
+    No: 7   GFLOPS: 12.93/12.93     result: MeasureResult(costs=(0.0207582024,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.49199438095092773, timestamp=1668631517.0051625)      [('tile_y', [-1, 8]), ('tile_x', [-1, 512])],None,93
+    No: 8   GFLOPS: 1.92/12.93      result: MeasureResult(costs=(0.139460038,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.415658473968506, timestamp=1668631519.4398894) [('tile_y', [-1, 2]), ('tile_x', [-1, 4])],None,21
+    No: 9   GFLOPS: 7.96/12.93      result: MeasureResult(costs=(0.0337442142,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6975445747375488, timestamp=1668631520.2507834)       [('tile_y', [-1, 512]), ('tile_x', [-1, 64])],None,69
+    No: 10  GFLOPS: 6.00/12.93      result: MeasureResult(costs=(0.044773432,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.8220565319061279, timestamp=1668631521.1295543)        [('tile_y', [-1, 1]), ('tile_x', [-1, 32])],None,50
 
 
 
diff --git a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
index 71a3a246c4..fb2e4b563a 100644
--- a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
@@ -320,7 +320,7 @@ standard deviation.
 
  .. code-block:: none
 
-    {'mean': 511.1832385599928, 'median': 511.49898314999973, 'std': 0.982361837709333}
+    {'mean': 516.7287188500086, 'median': 516.592596549981, 'std': 2.0230427055862883}
 
 
 
@@ -554,30 +554,29 @@ the tuning data to.
 
  .. code-block:: none
 
-
    [Task  1/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  1/25]  Current/Best:    3.21/  22.51 GFLOPS | Progress: (4/20) | 8.30 s
    [Task  1/25]  Current/Best:   14.26/  22.51 GFLOPS | Progress: (8/20) | 14.89 s
    [Task  1/25]  Current/Best:   23.34/  23.34 GFLOPS | Progress: (12/20) | 18.27 s
    [Task  1/25]  Current/Best:    9.25/  23.34 GFLOPS | Progress: (16/20) | 21.14 s
    [Task  1/25]  Current/Best:    8.39/  23.34 GFLOPS | Progress: (20/20) | 24.82 s Done.
-
    [Task  2/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  2/25]  Current/Best:    7.02/  19.82 GFLOPS | Progress: (4/20) | 2.60 s
    [Task  2/25]  Current/Best:   17.92/  19.82 GFLOPS | Progress: (8/20) | 3.71 s
    [Task  2/25]  Current/Best:   10.08/  19.82 GFLOPS | Progress: (12/20) | 5.37 s
    [Task  2/25]  Current/Best:   11.26/  22.27 GFLOPS | Progress: (16/20) | 6.66 s
    [Task  2/25]  Current/Best:   13.52/  22.27 GFLOPS | Progress: (20/20) | 7.99 s Done.
-
    [Task  3/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  3/25]  Current/Best:   12.99/  16.87 GFLOPS | Progress: (4/20) | 3.13 s
    [Task  3/25]  Current/Best:   24.13/  24.13 GFLOPS | Progress: (8/20) | 4.82 s
    [Task  3/25]  Current/Best:   11.61/  24.13 GFLOPS | Progress: (12/20) | 7.04 s
    [Task  3/25]  Current/Best:   13.63/  24.13 GFLOPS | Progress: (16/20) | 9.81 s
    [Task  3/25]  Current/Best:   13.28/  24.13 GFLOPS | Progress: (20/20) | 12.02 s Done.
-
    [Task  4/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  4/25]  Current/Best:    6.06/  19.72 GFLOPS | Progress: (4/20) | 2.94 s
    [Task  4/25]  Current/Best:   11.94/  19.72 GFLOPS | Progress: (8/20) | 11.46 s
    [Task  4/25]  Current/Best:   12.21/  19.72 GFLOPS | Progress: (12/20) | 13.28 s
    [Task  4/25]  Current/Best:   11.96/  19.72 GFLOPS | Progress: (16/20) | 14.99 s
    [Task  4/25]  Current/Best:    6.45/  20.58 GFLOPS | Progress: (20/20) | 16.74 s Done.
-
    [Task  5/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  5/25]  Current/Best:   14.30/  14.30 GFLOPS | Progress: (4/20) | 4.78 s
    [Task  5/25]  Current/Best:    3.20/  21.70 GFLOPS | Progress: (8/20) | 6.80 s
    [Task  5/25]  Current/Best:   16.02/  21.70 GFLOPS | Progress: (12/20) | 8.51 s
    [Task  5/25]  Current/Best:   14.16/  21.70 GFLOPS | Progress: (16/20) | 10.58 s
    [Task  5/25]  Current/Best:   11.93/  21.70 GFLOPS | Progress: (20/20) | 11.98 s Done.
-
    [Task  6/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  6/25]  Current/Best:   14.33/  14.63 GFLOPS | Progress: (4/20) | 4.32 s
    [Task  6/25]  Current/Best:    4.57/  16.76 GFLOPS | Progress: (8/20) | 7.77 s
    [Task  6/25]  Current/Best:    5.80/  16.76 GFLOPS | Progress: (12/20) | 11.36 s
    [Task  6/25]  Current/Best:   15.95/  21.11 GFLOPS | Progress: (16/20) | 13.31 s
    [Task  6/25]  Current/Best:   18.21/  21.11 GFLOPS | Progress: (20/20) | 15.27 s Done.
-
    [Task  7/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  7/25]  Current/Best:    5.92/  23.30 GFLOPS | Progress: (4/20) | 3.83 s
    [Task  7/25]  Current/Best:   14.87/  23.30 GFLOPS | Progress: (8/20) | 6.95 s
    [Task  7/25]  Current/Best:   12.59/  23.30 GFLOPS | Progress: (12/20) | 9.69 s
    [Task  7/25]  Current/Best:   12.35/  23.30 GFLOPS | Progress: (16/20) | 11.58 s
    [Task  7/25]  Current/Best:   16.76/  23.30 GFLOPS | Progress: (20/20) | 13.55 s Done.
-
    [Task  8/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  8/25]  Current/Best:   15.99/  15.99 GFLOPS | Progress: (4/20) | 3.34 s
    [Task  8/25]  Current/Best:   12.43/  15.99 GFLOPS | Progress: (8/20) | 10.95 s
    [Task  8/25]  Current/Best:    5.22/  15.99 GFLOPS | Progress: (12/20) | 17.15 s
    [Task  8/25]  Current/Best:   23.18/  23.18 GFLOPS | Progress: (16/20) | 19.09 s
    [Task  8/25]  Current/Best:   11.86/  23.18 GFLOPS | Progress: (20/20) | 28.67 s Done.
-
    [Task  9/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  9/25]  Current/Best:   10.00/  16.72 GFLOPS | Progress: (4/20) | 4.04 s
    [Task  9/25]  Current/Best:   17.53/  22.98 GFLOPS | Progress: (8/20) | 5.45 s
    [Task  9/25]  Current/Best:   20.73/  22.98 GFLOPS | Progress: (12/20) | 12.28 s
    [Task  9/25]  Current/Best:    6.14/  22.98 GFLOPS | Progress: (16/20) | 15.55 s
    [Task  9/25]  Current/Best:   14.38/  22.98 GFLOPS | Progress: (20/20) | 16.91 s Done.
-
    [Task 10/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 10/25]  Current/Best:   13.44/  21.20 GFLOPS | Progress: (4/20) | 3.99 s
    [Task 10/25]  Current/Best:   11.06/  21.20 GFLOPS | Progress: (8/20) | 6.56 s
    [Task 10/25]  Current/Best:   10.21/  21.20 GFLOPS | Progress: (12/20) | 8.83 s
    [Task 10/25]  Current/Best:   14.64/  21.20 GFLOPS | Progress: (16/20) | 10.43 s
    [Task 10/25]  Current/Best:   10.05/  21.20 GFLOPS | Progress: (20/20) | 12.28 s Done.
-
    [Task 11/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 11/25]  Current/Best:   19.34/  19.34 GFLOPS | Progress: (4/20) | 3.33 s
    [Task 11/25]  Current/Best:   11.49/  21.14 GFLOPS | Progress: (8/20) | 5.20 s
    [Task 11/25]  Current/Best:    5.90/  21.36 GFLOPS | Progress: (12/20) | 7.51 s
    [Task 11/25]  Current/Best:   14.24/  21.36 GFLOPS | Progress: (16/20) | 10.27 s
    [Task 11/25]  Current/Best:   19.54/  22.50 GFLOPS | Progress: (20/20) | 12.12 s Done.
-
    [Task 12/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 12/25]  Current/Best:   10.51/  11.81 GFLOPS | Progress: (4/20) | 4.54 s
    [Task 12/25]  Current/Best:    5.96/  14.85 GFLOPS | Progress: (8/20) | 8.23 s
    [Task 12/25]  Current/Best:   13.70/  18.05 GFLOPS | Progress: (12/20) | 10.11 s
    [Task 12/25]  Current/Best:    6.03/  18.05 GFLOPS | Progress: (16/20) | 12.33 s
    [Task 12/25]  Current/Best:   19.36/  19.36 GFLOPS | Progress: (20/20) | 14.20 s Done.
-
    [Task 13/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 13/25]  Current/Best:   16.75/  17.96 GFLOPS | Progress: (4/20) | 4.49 s
    [Task 13/25]  Current/Best:    1.57/  17.96 GFLOPS | Progress: (8/20) | 9.26 s
    [Task 13/25]  Current/Best:   11.98/  21.30 GFLOPS | Progress: (12/20) | 11.10 s
    [Task 13/25]  Current/Best:    6.53/  21.30 GFLOPS | Progress: (16/20) | 13.99 s
    [Task 13/25]  Current/Best:    9.00/  21.42 GFLOPS | Progress: (20/20) | 17.13 s Done.
-
    [Task 14/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 14/25]  Current/Best:    9.76/  19.57 GFLOPS | Progress: (4/20) | 5.84 s
    [Task 14/25]  Current/Best:   18.70/  19.57 GFLOPS | Progress: (8/20) | 8.40 s
    [Task 14/25]  Current/Best:   13.25/  19.57 GFLOPS | Progress: (12/20) | 10.83 s
    [Task 14/25]  Current/Best:   13.29/  19.57 GFLOPS | Progress: (16/20) | 13.05 s
    [Task 14/25]  Current/Best:   19.15/  19.57 GFLOPS | Progress: (20/20) | 16.78 s Done.
-
    [Task 15/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 15/25]  Current/Best:   21.29/  21.29 GFLOPS | Progress: (4/20) | 6.77 s
    [Task 15/25]  Current/Best:   14.23/  21.29 GFLOPS | Progress: (8/20) | 8.14 s
    [Task 15/25]  Current/Best:    8.67/  21.29 GFLOPS | Progress: (12/20) | 12.14 s
    [Task 15/25]  Current/Best:   14.74/  21.29 GFLOPS | Progress: (16/20) | 14.22 s
    [Task 15/25]  Current/Best:   16.35/  21.29 GFLOPS | Progress: (20/20) | 20.04 s
    [Task 16/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 16/25]  Current/Best:   18.94/  18.94 GFLOPS | Progress: (4/20) | 4.40 s
    [Task 16/25]  Current/Best:   14.41/  18.94 GFLOPS | Progress: (8/20) | 7.81 s
    [Task 16/25]  Current/Best:    6.72/  18.94 GFLOPS | Progress: (12/20) | 9.99 s
    [Task 16/25]  Current/Best:   10.29/  18.94 GFLOPS | Progress: (16/20) | 11.48 s
    [Task 16/25]  Current/Best:    9.68/  18.94 GFLOPS | Progress: (20/20)
  | 13.55 s Done.
-
    [Task 17/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 17/25]  Current/Best:    6.12/  22.33 GFLOPS | Progress: (4/20) | 3.44 s
    [Task 17/25]  Current/Best:   23.72/  23.72 GFLOPS | Progress: (8/20) | 5.75 s
    [Task 17/25]  Current/Best:   15.40/  23.72 GFLOPS | Progress: (12/20) | 7.38 s
    [Task 17/25]  Current/Best:   17.77/  23.72 GFLOPS | Progress: (16/20) | 9.57 s
    [Task 17/25]  Current/Best:   12.26/  23.72 GFLOPS | Progress: (20/20) | 11.58 s Done.
-
    [Task 18/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 18/25]  Current/Best:    5.81/  20.83 GFLOPS | Progress: (4/20) | 3.28 s
    [Task 18/25]  Current/Best:    7.53/  20.83 GFLOPS | Progress: (8/20) | 5.30 s
    [Task 18/25]  Current/Best:    3.06/  20.83 GFLOPS | Progress: (12/20) | 7.92 s
    [Task 18/25]  Current/Best:   14.86/  20.83 GFLOPS | Progress: (16/20) | 10.80 s
    [Task 18/25]  Current/Best:   13.84/  20.83 GFLOPS | Progress: (20/20) | 12.28 s Done.
-
    [Task 19/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 19/25]  Current/Best:   17.47/  20.39 GFLOPS | Progress: (4/20) | 3.71 s
    [Task 19/25]  Current/Best:   18.04/  20.39 GFLOPS | Progress: (8/20) | 6.69 s
    [Task 19/25]  Current/Best:    1.55/  20.39 GFLOPS | Progress: (12/20) | 10.55 s
    [Task 19/25]  Current/Best:   11.11/  20.39 GFLOPS | Progress: (16/20) | 13.87 s
    [Task 19/25]  Current/Best:   11.69/  20.39 GFLOPS | Progress: (20/20) | 16.85 s Done.
-
    [Task 20/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 20/25]  Current/Best:   14.01/  14.01 GFLOPS | Progress: (4/20) | 5.29 s
    [Task 20/25]  Current/Best:   10.23/  15.47 GFLOPS | Progress: (8/20) | 7.15 s
    [Task 20/25]  Current/Best:   13.32/  16.46 GFLOPS | Progress: (12/20) | 9.81 s
    [Task 20/25]  Current/Best:   11.05/  16.46 GFLOPS | Progress: (16/20) | 12.44 s
    [Task 20/25]  Current/Best:    3.09/  20.41 GFLOPS | Progress: (20/20) | 14.84 s
    [Task 21/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 21/25]  Current/Best:   16.21/  16.21 GFLOPS | Progress: (4/20) | 2.91 s Done.
+
    [Task  1/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  1/25]  Current/Best:   16.72/  23.45 GFLOPS | Progress: (4/20) | 7.03 s
    [Task  1/25]  Current/Best:   23.52/  23.52 GFLOPS | Progress: (8/20) | 10.40 s
    [Task  1/25]  Current/Best:   19.09/  23.52 GFLOPS | Progress: (12/20) | 12.91 s
    [Task  1/25]  Current/Best:   11.11/  23.52 GFLOPS | Progress: (16/20) | 14.80 s
    [Task  1/25]  Current/Best:   10.99/  23.52 GFLOPS | Progress: (20/20) | 17.78 s Done.
+
    [Task  2/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  2/25]  Current/Best:   11.26/  17.02 GFLOPS | Progress: (4/20) | 3.62 s
    [Task  2/25]  Current/Best:    6.28/  17.02 GFLOPS | Progress: (8/20) | 5.03 s
    [Task  2/25]  Current/Best:   11.09/  17.27 GFLOPS | Progress: (12/20) | 6.23 s
    [Task  2/25]  Current/Best:   15.68/  17.27 GFLOPS | Progress: (16/20) | 8.30 s
    [Task  2/25]  Current/Best:    6.42/  17.27 GFLOPS | Progress: (20/20) | 9.91 s Done.
+
    [Task  3/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  3/25]  Current/Best:   12.08/  19.15 GFLOPS | Progress: (4/20) | 3.44 s
    [Task  3/25]  Current/Best:   19.00/  19.15 GFLOPS | Progress: (8/20) | 5.44 s
    [Task  3/25]  Current/Best:   18.55/  19.15 GFLOPS | Progress: (12/20) | 7.30 s
    [Task  3/25]  Current/Best:   13.31/  23.25 GFLOPS | Progress: (16/20) | 8.87 s
    [Task  3/25]  Current/Best:    8.39/  23.25 GFLOPS | Progress: (20/20) | 11.73 s Done.
+
    [Task  4/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  4/25]  Current/Best:   17.40/  17.40 GFLOPS | Progress: (4/20) | 4.28 s
    [Task  4/25]  Current/Best:   10.77/  17.40 GFLOPS | Progress: (8/20) | 12.16 s
    [Task  4/25]  Current/Best:   13.90/  17.40 GFLOPS | Progress: (12/20) | 14.66 s
    [Task  4/25]  Current/Best:   11.62/  17.40 GFLOPS | Progress: (16/20) | 17.03 s
    [Task  4/25]  Current/Best:    7.03/  19.37 GFLOPS | Progress: (20/20) | 19.46 s Done.
+
    [Task  5/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  5/25]  Current/Best:    7.08/  15.24 GFLOPS | Progress: (4/20) | 3.53 s
    [Task  5/25]  Current/Best:    9.22/  15.24 GFLOPS | Progress: (8/20) | 5.06 s
    [Task  5/25]  Current/Best:   10.11/  19.69 GFLOPS | Progress: (12/20) | 6.67 s
    [Task  5/25]  Current/Best:   13.17/  19.92 GFLOPS | Progress: (16/20) | 8.15 s
    [Task  5/25]  Current/Best:    6.40/  19.92 GFLOPS | Progress: (20/20) | 10.17 s Done.
+
    [Task  6/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  6/25]  Current/Best:   11.64/  13.44 GFLOPS | Progress: (4/20) | 4.65 s
    [Task  6/25]  Current/Best:   11.00/  18.45 GFLOPS | Progress: (8/20) | 6.71 s
    [Task  6/25]  Current/Best:   13.45/  18.45 GFLOPS | Progress: (12/20) | 8.53 s
    [Task  6/25]  Current/Best:    4.85/  18.45 GFLOPS | Progress: (16/20) | 11.14 s
    [Task  6/25]  Current/Best:    8.88/  18.45 GFLOPS | Progress: (20/20) | 14.29 s Done.
+
    [Task  7/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  7/25]  Current/Best:   11.71/  22.26 GFLOPS | Progress: (4/20) | 4.83 s
    [Task  7/25]  Current/Best:   15.33/  22.26 GFLOPS | Progress: (8/20) | 6.73 s
    [Task  7/25]  Current/Best:   17.69/  22.38 GFLOPS | Progress: (12/20) | 8.94 s
    [Task  7/25]  Current/Best:   18.63/  22.38 GFLOPS | Progress: (16/20) | 11.25 s
    [Task  7/25]  Current/Best:   13.20/  22.38 GFLOPS | Progress: (20/20) | 13.42 s Done.
+
    [Task  8/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  8/25]  Current/Best:   10.75/  10.75 GFLOPS | Progress: (4/20) | 12.00 s
    [Task  8/25]  Current/Best:   14.32/  14.32 GFLOPS | Progress: (8/20) | 17.52 s
    [Task  8/25]  Current/Best:   18.83/  18.83 GFLOPS | Progress: (12/20) | 19.82 s
    [Task  8/25]  Current/Best:   10.17/  18.83 GFLOPS | Progress: (16/20) | 22.34 s
    [Task  8/25]  Current/Best:   12.94/  18.83 GFLOPS | Progress: (20/20) | 26.46 s
    [Task  9/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  9/25]  Current/Best:   16.65/  16.65 GFLOPS | Progress: (4/20) | 6.80 s
    [Task  9/25]  Current/Best:   18.22/  18.22 GFLOPS | Progress: (8/20) | 14.17 s
    [Task  9/25]  Current/Best:   21.06/  21.06 GFLOPS | Progress: (12/20) | 15.36 s
    [Task  9/25]  Current/Best:    8.90/  21.06 GFLOPS | Progress: (16/20) | 16.74 s
    [Task  9/25]  Current/Best:   11.37/  21.06 GFLOPS | Progress: (20
 /20) | 19.57 s Done.
+
    [Task 10/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 10/25]  Current/Best:   13.17/  13.17 GFLOPS | Progress: (4/20) | 3.51 s
    [Task 10/25]  Current/Best:    6.05/  17.75 GFLOPS | Progress: (8/20) | 5.67 s
    [Task 10/25]  Current/Best:   17.74/  17.75 GFLOPS | Progress: (12/20) | 7.55 s
    [Task 10/25]  Current/Best:   11.62/  17.75 GFLOPS | Progress: (16/20) | 9.23 s
    [Task 10/25]  Current/Best:    9.69/  17.75 GFLOPS | Progress: (20/20) | 11.82 s Done.
+
    [Task 11/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 11/25]  Current/Best:   20.32/  20.32 GFLOPS | Progress: (4/20) | 3.28 s
    [Task 11/25]  Current/Best:   12.35/  20.32 GFLOPS | Progress: (8/20) | 6.00 s
    [Task 11/25]  Current/Best:   16.02/  20.32 GFLOPS | Progress: (12/20) | 8.26 s
    [Task 11/25]  Current/Best:    8.76/  20.32 GFLOPS | Progress: (16/20) | 10.70 s
    [Task 11/25]  Current/Best:   19.26/  20.32 GFLOPS | Progress: (20/20) | 12.38 s Done.
+
    [Task 12/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 12/25]  Current/Best:   13.23/  15.07 GFLOPS | Progress: (4/20) | 3.49 s
    [Task 12/25]  Current/Best:   13.90/  18.92 GFLOPS | Progress: (8/20) | 6.00 s
    [Task 12/25]  Current/Best:    7.76/  18.92 GFLOPS | Progress: (12/20) | 11.19 s
    [Task 12/25]  Current/Best:    8.41/  18.92 GFLOPS | Progress: (16/20) | 13.21 s
    [Task 12/25]  Current/Best:   18.52/  18.92 GFLOPS | Progress: (20/20) | 15.92 s Done.
+
    [Task 13/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 13/25]  Current/Best:   15.60/  21.80 GFLOPS | Progress: (4/20) | 3.61 s
    [Task 13/25]  Current/Best:   18.67/  21.80 GFLOPS | Progress: (8/20) | 6.49 s
    [Task 13/25]  Current/Best:   17.95/  21.80 GFLOPS | Progress: (12/20) | 10.50 s
    [Task 13/25]  Current/Best:   22.29/  22.29 GFLOPS | Progress: (16/20) | 13.50 s
    [Task 13/25]  Current/Best:   10.02/  22.29 GFLOPS | Progress: (20/20) | 15.99 s Done.
+
    [Task 14/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 14/25]  Current/Best:   14.54/  14.54 GFLOPS | Progress: (4/20) | 3.89 s
    [Task 14/25]  Current/Best:   15.09/  15.09 GFLOPS | Progress: (8/20) | 10.44 s
    [Task 14/25]  Current/Best:   16.01/  17.43 GFLOPS | Progress: (12/20) | 12.01 s Done.
+
    [Task 14/25]  Current/Best:    9.41/  17.43 GFLOPS | Progress: (16/20) | 19.15 s
    [Task 14/25]  Current/Best:   20.24/  20.24 GFLOPS | Progress: (20/20) | 22.32 s Done.
+
    [Task 15/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 15/25]  Current/Best:    1.71/  22.30 GFLOPS | Progress: (4/20) | 3.48 s
    [Task 15/25]  Current/Best:   11.39/  22.30 GFLOPS | Progress: (8/20) | 6.26 s
    [Task 15/25]  Current/Best:   20.82/  22.30 GFLOPS | Progress: (12/20) | 10.53 s
    [Task 15/25]  Current/Best:   13.94/  22.30 GFLOPS | Progress: (16/20) | 12.45 s
    [Task 15/25]  Current/Best:    9.05/  22.30 GFLOPS | Progress: (20/20) | 13.91 s
    [Task 16/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 16/25]  Current/Best:   14.38/  14.38 GFLOPS | Progress: (4/20) | 4.88 s
    [Task 16/25]  Current/Best:    6.73/  21.02 GFLOPS | Progress: (8/20) | 6.56 s
    [Task 16/25]  Current/Best:   10.58/  21.02 GFLOPS | Progress: (12/20) | 9.21 s
    [Task 16/25]  Current/Best:   13.48/  21.02 GFLOPS | Progress: (16/20) | 10.74 s
    [Task 16/25]  Current/Best:   15.14/  21.02 GFLOPS | Progress: (20/20)
  | 12.64 s Done.
+
    [Task 17/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 17/25]  Current/Best:   16.83/  18.93 GFLOPS | Progress: (4/20) | 3.13 s
    [Task 17/25]  Current/Best:   10.07/  18.93 GFLOPS | Progress: (8/20) | 5.52 s
    [Task 17/25]  Current/Best:   11.85/  18.93 GFLOPS | Progress: (12/20) | 9.99 s
    [Task 17/25]  Current/Best:   11.98/  18.93 GFLOPS | Progress: (16/20) | 12.32 s
    [Task 17/25]  Current/Best:   19.46/  20.27 GFLOPS | Progress: (20/20) | 15.64 s Done.
+
    [Task 18/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 18/25]  Current/Best:   16.22/  16.91 GFLOPS | Progress: (4/20) | 3.15 s
    [Task 18/25]  Current/Best:   13.39/  16.91 GFLOPS | Progress: (8/20) | 6.01 s
    [Task 18/25]  Current/Best:   13.60/  16.91 GFLOPS | Progress: (12/20) | 7.98 s
    [Task 18/25]  Current/Best:   11.18/  16.91 GFLOPS | Progress: (16/20) | 11.09 s
    [Task 18/25]  Current/Best:    8.77/  20.60 GFLOPS | Progress: (20/20) | 13.02 s Done.
+
    [Task 19/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 19/25]  Current/Best:   18.19/  20.13 GFLOPS | Progress: (4/20) | 3.88 s
    [Task 19/25]  Current/Best:   11.65/  22.14 GFLOPS | Progress: (8/20) | 6.90 s
    [Task 19/25]  Current/Best:   10.71/  22.14 GFLOPS | Progress: (12/20) | 11.38 s
    [Task 19/25]  Current/Best:    7.99/  22.14 GFLOPS | Progress: (16/20) | 14.94 s
    [Task 19/25]  Current/Best:   15.71/  22.14 GFLOPS | Progress: (20/20) | 17.54 s Done.
+
    [Task 20/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 20/25]  Current/Best:    6.89/  10.62 GFLOPS | Progress: (4/20) | 3.89 s
    [Task 20/25]  Current/Best:    4.75/  22.72 GFLOPS | Progress: (8/20) | 6.29 s
    [Task 20/25]  Current/Best:    9.76/  22.72 GFLOPS | Progress: (12/20) | 10.42 s
    [Task 20/25]  Current/Best:   12.45/  22.72 GFLOPS | Progress: (16/20) | 13.65 s
    [Task 20/25]  Current/Best:   14.85/  22.72 GFLOPS | Progress: (20/20) | 15.43 s
    [Task 21/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 21/25]  Current/Best:   16.52/  20.13 GFLOPS | Progress: (4/20) | 3.61 s Done.
      Done.
-
    [Task 21/25]  Current/Best:   20.04/  20.04 GFLOPS | Progress: (8/20) | 9.86 s
    [Task 21/25]  Current/Best:   13.14/  20.04 GFLOPS | Progress: (12/20) | 11.98 s
    [Task 21/25]  Current/Best:    5.48/  20.04 GFLOPS | Progress: (16/20) | 13.93 s
    [Task 21/25]  Current/Best:   17.36/  20.04 GFLOPS | Progress: (20/20) | 15.45 s
    [Task 22/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 22/25]  Current/Best:    4.43/  17.44 GFLOPS | Progress: (4/20) | 3.23 s
    [Task 22/25]  Current/Best:   18.86/  18.86 GFLOPS | Progress: (8/20) | 5.76 s
    [Task 22/25]  Current/Best:    6.74/  18.86 GFLOPS | Progress: (12/20) | 8.99 s
    [Task 22/25]  Current/Best:   16.04/  18.86 GFLOPS | Progress: (16/20) | 10.40 s
    [Task 22/25]  Current/Best:   14.99/  18.86 GFLOPS | Progress: (20/20) | 12.55 s Done.
-
    [Task 23/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 23/25]  Current/Best:   12.40/  20.70 GFLOPS | Progress: (4/20) | 4.30 s
    [Task 23/25]  Current/Best:   19.44/  20.97 GFLOPS | Progress: (8/20) | 5.95 s
    [Task 23/25]  Current/Best:   17.59/  20.97 GFLOPS | Progress: (12/20) | 8.41 s
    [Task 23/25]  Current/Best:   18.26/  20.97 GFLOPS | Progress: (16/20) | 11.73 s
    [Task 23/25]  Current/Best:   10.69/  22.78 GFLOPS | Progress: (20/20) | 14.39 s Done.
-
    [Task 24/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 24/25]  Current/Best:    5.48/   5.48 GFLOPS | Progress: (4/20) | 7.03 s
    [Task 24/25]  Current/Best:    2.88/   5.48 GFLOPS | Progress: (8/20) | 17.80 s
    [Task 24/25]  Current/Best:    2.53/   5.48 GFLOPS | Progress: (12/20) | 29.09 s
    [Task 24/25]  Current/Best:    2.64/  10.05 GFLOPS | Progress: (16/20) | 40.74 s
    [Task 24/25]  Current/Best:    7.97/  10.05 GFLOPS | Progress: (20/20) | 51.20 s
    [Task 25/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s Done.
-
    [Task 25/25]  Current/Best:    1.55/   5.75 GFLOPS | Progress: (4/20) | 3.33 s
    [Task 25/25]  Current/Best:    3.02/   5.75 GFLOPS | Progress: (8/20) | 14.02 s
    [Task 25/25]  Current/Best:    8.74/   8.80 GFLOPS | Progress: (12/20) | 15.76 s
    [Task 25/25]  Current/Best:    2.80/   8.80 GFLOPS | Progress: (16/20) | 26.50 s
    [Task 25/25]  Current/Best:    3.34/   8.80 GFLOPS | Progress: (20/20) | 37.20 s
+
    [Task 21/25]  Current/Best:   19.30/  20.13 GFLOPS | Progress: (8/20) | 5.91 s
    [Task 21/25]  Current/Best:   22.17/  22.17 GFLOPS | Progress: (12/20) | 7.75 s
    [Task 21/25]  Current/Best:   14.62/  22.17 GFLOPS | Progress: (16/20) | 9.76 s
    [Task 21/25]  Current/Best:   10.65/  22.17 GFLOPS | Progress: (20/20) | 11.38 s
    [Task 22/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 22/25]  Current/Best:    2.69/  17.77 GFLOPS | Progress: (4/20) | 4.55 s
    [Task 22/25]  Current/Best:   10.94/  17.77 GFLOPS | Progress: (8/20) | 6.01 s
    [Task 22/25]  Current/Best:    6.86/  17.77 GFLOPS | Progress: (12/20) | 9.33 s
    [Task 22/25]  Current/Best:   16.68/  17.77 GFLOPS | Progress: (16/20) | 10.89 s
    [Task 22/25]  Current/Best:    6.51/  17.77 GFLOPS | Progress: (20/20) | 12.66 s Done.
+
    [Task 23/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 23/25]  Current/Best:   18.09/  22.46 GFLOPS | Progress: (4/20) | 3.46 s
    [Task 23/25]  Current/Best:   10.08/  22.46 GFLOPS | Progress: (8/20) | 6.29 s
    [Task 23/25]  Current/Best:   18.10/  22.46 GFLOPS | Progress: (12/20) | 11.41 s
    [Task 23/25]  Current/Best:   11.62/  22.46 GFLOPS | Progress: (16/20) | 14.35 s
    [Task 23/25]  Current/Best:   10.40/  22.46 GFLOPS | Progress: (20/20) | 19.43 s Done.
+
    [Task 24/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 24/25]  Current/Best:   10.46/  10.46 GFLOPS | Progress: (4/20) | 12.21 s
    [Task 24/25]  Current/Best:    2.85/  10.46 GFLOPS | Progress: (8/20) | 23.42 s
    [Task 24/25]  Current/Best:    1.18/  10.46 GFLOPS | Progress: (12/20) | 35.00 s
    [Task 24/25]  Current/Best:    2.86/  10.46 GFLOPS | Progress: (16/20) | 45.69 s
    [Task 24/25]  Current/Best:    3.98/  10.46 GFLOPS | Progress: (20/20) | 56.41 s
    [Task 25/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 25/25]  Current/Best:    9.47/   9.47 GFLOPS | Progress: (4/20) | 5.55 s
    [Task 25/25]  Current/Best:    4.40/   9.47 GFLOPS | Progress: (8/20) | 16.27 s
    [Task 25/25]  Current/Best:    6.25/   9.47 GFLOPS | Progress: (12/20) | 19.67 s
    [Task 25/25]  Current/Best:    1.52/   9.47 GFLOPS | Progress: (16/20) | 30.40 s
    [Task 25/25]  Current/Best:    5.02/   9.47 GFLOPS | Progress: (20
 /20) | 41.10 s
 
 
 
@@ -673,8 +672,8 @@ Verify that the optimized model runs and produces the same results:
 
  .. code-block:: none
 
-    class='n02123045 tabby, tabby cat' with probability=0.621104
-    class='n02123159 tiger cat' with probability=0.356378
+    class='n02123045 tabby, tabby cat' with probability=0.621103
+    class='n02123159 tiger cat' with probability=0.356379
     class='n02124075 Egyptian cat' with probability=0.019712
     class='n02129604 tiger, Panthera tigris' with probability=0.001215
     class='n04040759 radiator' with probability=0.000262
@@ -731,8 +730,8 @@ improvement in comparing the optimized model to the unoptimized model.
 
  .. code-block:: none
 
-    optimized: {'mean': 411.14162294999915, 'median': 411.1788299499949, 'std': 1.5243749138132034}
-    unoptimized: {'mean': 511.1832385599928, 'median': 511.49898314999973, 'std': 0.982361837709333}
+    optimized: {'mean': 422.4504965399865, 'median': 422.1017261999805, 'std': 1.6927660884575157}
+    unoptimized: {'mean': 516.7287188500086, 'median': 516.592596549981, 'std': 2.0230427055862883}
 
 
 
@@ -755,7 +754,7 @@ profiling/benchmarking.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 10 minutes  47.282 seconds)
+   **Total running time of the script:** ( 10 minutes  58.976 seconds)
 
 
 .. _sphx_glr_download_tutorial_autotvm_relay_x86.py:
diff --git a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
index 9f5a50d7fd..27f447e088 100644
--- a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
+++ b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
@@ -270,7 +270,7 @@ device and returns the measured cost. Network overhead is excluded.
 
  .. code-block:: none
 
-    1.263e-07 secs/op
+    1.254e-07 secs/op
 
 
 
diff --git a/docs/_sources/tutorial/intro_topi.rst.txt b/docs/_sources/tutorial/intro_topi.rst.txt
index cf50859544..4dd8807578 100644
--- a/docs/_sources/tutorial/intro_topi.rst.txt
+++ b/docs/_sources/tutorial/intro_topi.rst.txt
@@ -260,7 +260,7 @@ As you can see, scheduled stages of computation have been accumulated and we can
 
  .. code-block:: none
 
-    [stage(a, placeholder(a, 0x19edd990)), stage(b, placeholder(b, 0x8652c50)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min [...]
+    [stage(a, placeholder(a, 0x10b91a30)), stage(b, placeholder(b, 0xd9dbd30)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min [...]
 
 
 
diff --git a/docs/_sources/tutorial/sg_execution_times.rst.txt b/docs/_sources/tutorial/sg_execution_times.rst.txt
index c3a5da7a5f..7fccd92079 100644
--- a/docs/_sources/tutorial/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorial/sg_execution_times.rst.txt
@@ -5,32 +5,32 @@
 
 Computation times
 =================
-**14:00.313** total execution time for **tutorial** files:
+**14:18.944** total execution time for **tutorial** files:
 
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``)                 | 10:47.282 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``)                 | 10:58.976 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:10.892 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:14.468 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)     | 00:58.523 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)     | 01:01.148 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``)                 | 00:35.650 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``)                 | 00:33.589 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``)               | 00:26.419 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``)               | 00:28.922 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``)                               | 00:00.758 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``)       | 00:00.895 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``)       | 00:00.633 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``)                               | 00:00.763 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.149 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.174 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_tutorial_introduction.py` (``introduction.py``)                           | 00:00.005 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_uma.py` (``uma.py``)                                             | 00:00.001 | 0.0 MB |
-+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tvmc_python.py` (``tvmc_python.py``)                             | 00:00.001 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_uma.py` (``uma.py``)                                             | 00:00.002 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_tutorial_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``)   | 00:00.001 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_tutorial_install.py` (``install.py``)                                     | 00:00.001 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_tutorial_tvmc_python.py` (``tvmc_python.py``)                             | 00:00.001 | 0.0 MB |
++------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
index 83b8c64db3..abea810e8e 100644
--- a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
+++ b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
@@ -294,8 +294,8 @@ helper function to run a profile of the TVM generated code.
 
  .. code-block:: none
 
-    Numpy running time: 0.000008
-    naive: 0.000007
+    Numpy running time: 0.000007
+    naive: 0.000008
 
 
 
@@ -448,7 +448,7 @@ factor to be the number of threads on your CPU.
 
  .. code-block:: none
 
-    vector: 0.000026
+    vector: 0.000025
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [n: int32], [stride: int32], type="auto"),
@@ -499,10 +499,10 @@ We can now compare the different schedules
  .. code-block:: none
 
                 Operator                  Timing             Performance
-                   numpy    7.568420001007325e-06                    1.0
-                   naive              6.6944e-06      0.8845175081600919
-                parallel    6.962899999999999e-06     0.9199938691395649
-                  vector             2.64708e-05       3.497533170262333
+                   numpy    7.123659997887444e-06                    1.0
+                   naive              7.5702e-06      1.0626840700208853
+                parallel              7.2365e-06      1.0158401723476442
+                  vector             2.45008e-05       3.439355613163154
 
 
 
@@ -923,7 +923,7 @@ matrix multiplication.
 
  .. code-block:: none
 
-    Numpy running time: 0.018356
+    Numpy running time: 0.019168
 
 
 
@@ -981,7 +981,7 @@ optimizations.
 
  .. code-block:: none
 
-    none: 3.221829
+    none: 3.428157
 
 
 
@@ -1083,7 +1083,7 @@ schedule.
 
  .. code-block:: none
 
-    blocking: 0.297802
+    blocking: 0.294709
 
 
 
@@ -1178,7 +1178,7 @@ already cache friendly from our previous optimizations.
 
  .. code-block:: none
 
-    vectorization: 0.333530
+    vectorization: 0.332103
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1251,7 +1251,7 @@ more cache friendly.
 
  .. code-block:: none
 
-    loop permutation: 0.115401
+    loop permutation: 0.115070
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1349,7 +1349,7 @@ optimized schedule.
 
  .. code-block:: none
 
-    array packing: 0.108338
+    array packing: 0.108467
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1441,7 +1441,7 @@ to `C` when all the block results are ready.
 
  .. code-block:: none
 
-    block caching: 0.110008
+    block caching: 0.110431
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1526,7 +1526,7 @@ of thread-level parallelization.
 
  .. code-block:: none
 
-    parallelization: 0.146702
+    parallelization: 0.146028
     @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
       attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
       buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
@@ -1606,13 +1606,13 @@ working, we can compare the results.
  .. code-block:: none
 
                 Operator                  Timing             Performance
-                    none            3.2218294001                     1.0
-                blocking     0.29780162839999996     0.09243246349131855
-           vectorization     0.33353031019999996     0.10352202701659119
-        loop permutation     0.11540077819999998     0.03581840124632861
-           array packing     0.10833812840000001    0.033626277169311755
-           block caching            0.1100077873    0.034144510350729795
-         parallelization     0.14670205719999999    0.045533775685157825
+                    none            3.4281573111                     1.0
+                blocking     0.29470882499999995     0.08596712410068373
+           vectorization            0.3321031136     0.09687510912194323
+        loop permutation     0.11506966199999999    0.033566038999265566
+           array packing            0.1084667718     0.03163996338464294
+           block caching            0.1104306749     0.03221283764967188
+         parallelization     0.14602832780000002     0.04259674062423454
 
 
 
@@ -1652,6 +1652,11 @@ operations with tunable parameters that allows you to automatically optimize
 the computation for specific platforms.
 
 
+.. rst-class:: sphx-glr-timing
+
+   **Total running time of the script:** ( 1 minutes  1.148 seconds)
+
+
 .. _sphx_glr_download_tutorial_tensor_expr_get_started.py:
 
 .. only:: html
diff --git a/docs/commit_hash b/docs/commit_hash
index eff1bf8f81..bd08d725ad 100644
--- a/docs/commit_hash
+++ b/docs/commit_hash
@@ -1 +1 @@
-b4d4b82dbb9be2e4d0954f9dfd8e1c46079b66ee
+a80cdc26e291abc52bbd70c950023d9e0340464d
diff --git a/docs/how_to/compile_models/from_darknet.html b/docs/how_to/compile_models/from_darknet.html
index a36e4824a6..dcf7a653a2 100644
--- a/docs/how_to/compile_models/from_darknet.html
+++ b/docs/how_to/compile_models/from_darknet.html
@@ -585,7 +585,7 @@ class:[&#39;truck 0.9266&#39;] left:471 top:83 right:689 bottom:169
 class:[&#39;bicycle 0.9984&#39;] left:111 top:113 right:577 bottom:447
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  11.575 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  11.995 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-darknet-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7716f96385bd5abb6e822041e285be54/from_darknet.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_darknet.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/from_keras.html b/docs/how_to/compile_models/from_keras.html
index 38054e2b72..e924dd3082 100644
--- a/docs/how_to/compile_models/from_keras.html
+++ b/docs/how_to/compile_models/from_keras.html
@@ -506,7 +506,7 @@ pip install -U tensorflow --user
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Relay top-1 id: 285, class name: Egyptian cat
 
 1/1 [==============================] - ETA: 0s
-1/1 [==============================] - 1s 938ms/step
+1/1 [==============================] - 1s 940ms/step
 Keras top-1 id: 285, class name: Egyptian cat
 </pre></div>
 </div>
diff --git a/docs/how_to/compile_models/from_mxnet.html b/docs/how_to/compile_models/from_mxnet.html
index 8c539600c8..fa4e21009e 100644
--- a/docs/how_to/compile_models/from_mxnet.html
+++ b/docs/how_to/compile_models/from_mxnet.html
@@ -440,7 +440,7 @@ to download the full example code</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;x&quot;</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#tuple" title="builtins.tuple" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">x</span><span class="o">.</span><span class="n">shape</span></a><span class="p">)</span>
 </pre></div>
 </div>
-<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip9c46c9d6-2dfb-4d52-98f2-171233342729 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip9ca116cb-bf2a-48a3-912a-847c98f2dcd4 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
 x (1, 3, 224, 224)
 </pre></div>
 </div>
diff --git a/docs/how_to/compile_models/from_oneflow.html b/docs/how_to/compile_models/from_oneflow.html
index 0134e75c6c..bf44f36302 100644
--- a/docs/how_to/compile_models/from_oneflow.html
+++ b/docs/how_to/compile_models/from_oneflow.html
@@ -448,13 +448,13 @@ Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdo
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: &quot;https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip&quot; to /workspace/.oneflow/flowvision_cache/resnet18.zip
 
   0%|          | 0.00/41.5M [00:00&lt;?, ?B/s]
- 15%|#5        | 6.33M/41.5M [00:00&lt;00:00, 51.2MB/s]
- 27%|##7       | 11.2M/41.5M [00:00&lt;00:00, 49.7MB/s]
- 39%|###8      | 16.0M/41.5M [00:00&lt;00:00, 46.7MB/s]
- 58%|#####7    | 24.0M/41.5M [00:00&lt;00:00, 44.4MB/s]
- 77%|#######7  | 32.0M/41.5M [00:00&lt;00:00, 54.4MB/s]
- 92%|#########2| 38.3M/41.5M [00:00&lt;00:00, 57.5MB/s]
-100%|##########| 41.5M/41.5M [00:00&lt;00:00, 53.9MB/s]
+ 19%|#9        | 7.99M/41.5M [00:00&lt;00:00, 43.7MB/s]
+ 35%|###4      | 14.3M/41.5M [00:00&lt;00:00, 46.6MB/s]
+ 45%|####5     | 18.8M/41.5M [00:00&lt;00:00, 45.8MB/s]
+ 58%|#####7    | 24.0M/41.5M [00:00&lt;00:00, 37.1MB/s]
+ 77%|#######7  | 32.0M/41.5M [00:00&lt;00:00, 43.6MB/s]
+ 92%|#########2| 38.3M/41.5M [00:01&lt;00:00, 36.3MB/s]
+100%|##########| 41.5M/41.5M [00:01&lt;00:00, 40.7MB/s]
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/compile_models/from_pytorch.html b/docs/how_to/compile_models/from_pytorch.html
index 9d7797ad9b..3fcbead8d4 100644
--- a/docs/how_to/compile_models/from_pytorch.html
+++ b/docs/how_to/compile_models/from_pytorch.html
@@ -431,12 +431,11 @@ be unstable.</p>
 Downloading: &quot;https://download.pytorch.org/models/resnet18-f37072fd.pth&quot; to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
 
   0%|          | 0.00/44.7M [00:00&lt;?, ?B/s]
- 18%|#7        | 7.99M/44.7M [00:00&lt;00:00, 50.2MB/s]
- 33%|###2      | 14.6M/44.7M [00:00&lt;00:00, 59.1MB/s]
- 54%|#####3    | 24.0M/44.7M [00:00&lt;00:00, 70.2MB/s]
- 72%|#######2  | 32.3M/44.7M [00:00&lt;00:00, 76.0MB/s]
- 90%|########9 | 40.1M/44.7M [00:00&lt;00:00, 77.8MB/s]
-100%|##########| 44.7M/44.7M [00:00&lt;00:00, 77.9MB/s]
+ 18%|#7        | 7.99M/44.7M [00:00&lt;00:00, 46.8MB/s]
+ 52%|#####1    | 23.1M/44.7M [00:00&lt;00:00, 96.2MB/s]
+ 76%|#######5  | 33.8M/44.7M [00:00&lt;00:00, 92.3MB/s]
+ 99%|#########8| 44.2M/44.7M [00:00&lt;00:00, 61.3MB/s]
+100%|##########| 44.7M/44.7M [00:00&lt;00:00, 67.9MB/s]
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/compile_models/from_tensorflow.html b/docs/how_to/compile_models/from_tensorflow.html
index 0058b9db02..e9e41dd0ea 100644
--- a/docs/how_to/compile_models/from_tensorflow.html
+++ b/docs/how_to/compile_models/from_tensorflow.html
@@ -645,7 +645,7 @@ banana (score = 0.00022)
 desk (score = 0.00019)
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  10.321 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  15.234 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-tensorflow-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7f1d3d1b878694c201c614c807cdebc8/from_tensorflow.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_tensorflow.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/sg_execution_times.html b/docs/how_to/compile_models/sg_execution_times.html
index ab8401faaf..5632ff5a5c 100644
--- a/docs/how_to/compile_models/sg_execution_times.html
+++ b/docs/how_to/compile_models/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-compile-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:44.277</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
+<p><strong>05:47.039</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 81%" />
@@ -348,44 +348,44 @@
 <col style="width: 8%" />
 </colgroup>
 <tbody>
-<tr class="row-odd"><td><p><a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></td>
-<td><p>01:11.575</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></td>
+<td><p>01:15.234</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></td>
-<td><p>01:10.321</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></td>
+<td><p>01:11.995</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_paddle.html#sphx-glr-how-to-compile-models-from-paddle-py"><span class="std std-ref">Compile PaddlePaddle Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_paddle.py</span></code>)</p></td>
-<td><p>00:45.942</p></td>
+<td><p>00:46.704</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_oneflow.html#sphx-glr-how-to-compile-models-from-oneflow-py"><span class="std std-ref">Compile OneFlow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_oneflow.py</span></code>)</p></td>
-<td><p>00:33.204</p></td>
+<td><p>00:31.920</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></td>
-<td><p>00:29.863</p></td>
+<td><p>00:28.777</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></td>
-<td><p>00:26.164</p></td>
+<td><p>00:26.959</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></td>
-<td><p>00:24.637</p></td>
+<td><p>00:25.130</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_pytorch.html#sphx-glr-how-to-compile-models-from-pytorch-py"><span class="std std-ref">Compile PyTorch Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_pytorch.py</span></code>)</p></td>
-<td><p>00:22.423</p></td>
+<td><p>00:22.065</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_keras.html#sphx-glr-how-to-compile-models-from-keras-py"><span class="std std-ref">Compile Keras Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_keras.py</span></code>)</p></td>
-<td><p>00:17.770</p></td>
+<td><p>00:15.885</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_onnx.html#sphx-glr-how-to-compile-models-from-onnx-py"><span class="std std-ref">Compile ONNX Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_onnx.py</span></code>)</p></td>
-<td><p>00:02.380</p></td>
+<td><p>00:02.370</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/deploy_models/deploy_model_on_android.html b/docs/how_to/deploy_models/deploy_model_on_android.html
index f890aebf69..2b5d02a073 100644
--- a/docs/how_to/deploy_models/deploy_model_on_android.html
+++ b/docs/how_to/deploy_models/deploy_model_on_android.html
@@ -661,7 +661,7 @@ to the remote android device.</p>
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  15.8131      15.7943      15.9751      15.7570       0.0639
+  15.7907      15.7028      16.4872      15.6577       0.2365
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
index 0928810bbd..fefb705509 100644
--- a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
+++ b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
@@ -453,21 +453,21 @@ be unstable.</p>
 Downloading: &quot;https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth&quot; to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
 
   0%|          | 0.00/170M [00:00&lt;?, ?B/s]
-  8%|8         | 14.3M/170M [00:00&lt;00:01, 149MB/s]
- 17%|#6        | 28.5M/170M [00:00&lt;00:01, 116MB/s]
- 24%|##3       | 40.0M/170M [00:00&lt;00:01, 106MB/s]
- 30%|##9       | 50.4M/170M [00:00&lt;00:01, 107MB/s]
- 36%|###5      | 60.7M/170M [00:00&lt;00:01, 104MB/s]
- 42%|####1     | 70.7M/170M [00:00&lt;00:01, 103MB/s]
- 47%|####7     | 80.6M/170M [00:00&lt;00:00, 94.1MB/s]
- 53%|#####3    | 90.4M/170M [00:00&lt;00:00, 96.3MB/s]
- 60%|######    | 102M/170M [00:01&lt;00:00, 105MB/s]
- 66%|######6   | 113M/170M [00:01&lt;00:00, 97.5MB/s]
- 72%|#######2  | 123M/170M [00:01&lt;00:00, 95.5MB/s]
- 80%|#######9  | 135M/170M [00:01&lt;00:00, 106MB/s]
- 86%|########5 | 145M/170M [00:01&lt;00:00, 80.1MB/s]
- 95%|#########5| 162M/170M [00:01&lt;00:00, 98.2MB/s]
-100%|##########| 170M/170M [00:01&lt;00:00, 102MB/s]
+  6%|6         | 10.6M/170M [00:00&lt;00:02, 57.1MB/s]
+ 12%|#2        | 20.4M/170M [00:00&lt;00:02, 76.4MB/s]
+ 19%|#8        | 32.0M/170M [00:00&lt;00:01, 90.6MB/s]
+ 28%|##8       | 48.0M/170M [00:00&lt;00:01, 111MB/s]
+ 38%|###7      | 64.0M/170M [00:00&lt;00:00, 124MB/s]
+ 46%|####5     | 77.7M/170M [00:00&lt;00:00, 130MB/s]
+ 53%|#####3    | 90.3M/170M [00:00&lt;00:00, 113MB/s]
+ 60%|#####9    | 102M/170M [00:00&lt;00:00, 111MB/s]
+ 66%|######6   | 113M/170M [00:01&lt;00:00, 102MB/s]
+ 72%|#######2  | 123M/170M [00:01&lt;00:00, 99.5MB/s]
+ 80%|#######9  | 136M/170M [00:01&lt;00:00, 110MB/s]
+ 86%|########6 | 147M/170M [00:01&lt;00:00, 100MB/s]
+ 92%|#########2| 156M/170M [00:01&lt;00:00, 97.9MB/s]
+ 98%|#########7| 166M/170M [00:01&lt;00:00, 99.0MB/s]
+100%|##########| 170M/170M [00:01&lt;00:00, 98.0MB/s]
 /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/nn/functional.py:3897: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
   for i in range(dim)
 /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/detection/anchor_utils.py:124: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the &#39;trunc&#39; function NOT &#39;floor&#39;). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode=&#39;trunc&#39;), or for actual floor division, use torch.div(a, b, rounding_mode=& [...]
@@ -565,7 +565,7 @@ torchvision rcnn models.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Get 9 valid boxes
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  10.512 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  11.352 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-object-detection-pytorch-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7795da4b258c8feff986668b95ef57ad/deploy_object_detection_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_object_detection_pytorch.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized.html b/docs/how_to/deploy_models/deploy_prequantized.html
index 3bd6a62d77..253c3d20f3 100644
--- a/docs/how_to/deploy_models/deploy_prequantized.html
+++ b/docs/how_to/deploy_models/deploy_prequantized.html
@@ -497,8 +497,8 @@ training. Other models require a full post training calibration.</p>
 Downloading: &quot;https://download.pytorch.org/models/mobilenet_v2-b0353104.pth&quot; to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
 
   0%|          | 0.00/13.6M [00:00&lt;?, ?B/s]
- 59%|#####9    | 8.06M/13.6M [00:00&lt;00:00, 84.5MB/s]
-100%|##########| 13.6M/13.6M [00:00&lt;00:00, 115MB/s]
+ 59%|#####8    | 7.99M/13.6M [00:00&lt;00:00, 45.3MB/s]
+100%|##########| 13.6M/13.6M [00:00&lt;00:00, 59.0MB/s]
 </pre></div>
 </div>
 </div>
@@ -589,7 +589,7 @@ output values are identical out of 1000 outputs from mobilenet v2.</p>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  90.2928      90.1269      96.4289      89.9255       0.6989
+  90.3788      90.2874      91.6289      89.9409       0.3055
 </pre></div>
 </div>
 <div class="admonition note">
@@ -628,7 +628,7 @@ This includes support for the VNNI 8 bit dot product instruction (CascadeLake or
 <div class="section" id="deploy-a-quantized-tflite-model">
 <h2>Deploy a quantized TFLite Model<a class="headerlink" href="#deploy-a-quantized-tflite-model" title="Permalink to this headline">¶</a></h2>
 <p>TODO</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  5.283 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  5.409 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/fb8217c13f4351224c6cf3aacf1a87fc/deploy_prequantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized_tflite.html b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
index 36e5aa6943..69b71b6118 100644
--- a/docs/how_to/deploy_models/deploy_prequantized_tflite.html
+++ b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
@@ -582,7 +582,7 @@ TFLite Top-5 labels: [387 102 386 341 349]
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  119.6372     119.5717     124.3739     118.9210      0.5732
+  120.5285     120.5633     122.2465     119.1566      0.5380
 </pre></div>
 </div>
 <div class="admonition note">
@@ -610,7 +610,7 @@ network for ARM CPU</span></a>.</p></li>
 </ul>
 </div></blockquote>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  26.314 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  26.101 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-tflite-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/56691c7a27d45da61d112276334640d3/deploy_prequantized_tflite.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized_tflite.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_quantized.html b/docs/how_to/deploy_models/deploy_quantized.html
index e69ac0ee0f..4d5dabb937 100644
--- a/docs/how_to/deploy_models/deploy_quantized.html
+++ b/docs/how_to/deploy_models/deploy_quantized.html
@@ -520,7 +520,7 @@ for calibration. But the accuracy might be impacted.</p>
   DeprecationWarning,
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  34.749 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  41.075 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-quantized-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7810ecf51bfc05f7d5e8a400ac3e815d/deploy_quantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_quantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
index e54c3e7a52..e379f1110f 100644
--- a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
+++ b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
@@ -462,26 +462,24 @@ to your device.</p>
 Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
 
   0%|          | 0/132723 [00:00&lt;?, ?KB/s]
-  4%|3         | 5023/132723 [00:00&lt;00:02, 50217.48KB/s]
-  8%|7         | 10045/132723 [00:00&lt;00:03, 35305.58KB/s]
- 10%|#         | 13833/132723 [00:00&lt;00:03, 31021.29KB/s]
- 14%|#3        | 18364/132723 [00:00&lt;00:03, 35480.39KB/s]
- 19%|#9        | 25541/132723 [00:00&lt;00:02, 46671.01KB/s]
- 24%|##4       | 31974/132723 [00:00&lt;00:01, 52070.03KB/s]
- 28%|##8       | 37453/132723 [00:00&lt;00:02, 43895.77KB/s]
- 34%|###3      | 44756/132723 [00:00&lt;00:01, 51613.55KB/s]
- 38%|###7      | 50324/132723 [00:01&lt;00:01, 42353.31KB/s]
- 44%|####4     | 58441/132723 [00:01&lt;00:01, 51651.95KB/s]
- 50%|#####     | 66583/132723 [00:01&lt;00:01, 59240.34KB/s]
- 56%|#####6    | 74773/132723 [00:01&lt;00:00, 65287.53KB/s]
- 62%|######1   | 82046/132723 [00:01&lt;00:00, 66400.59KB/s]
- 68%|######7   | 90044/132723 [00:01&lt;00:00, 70200.92KB/s]
- 74%|#######4  | 98288/132723 [00:01&lt;00:00, 69739.41KB/s]
- 80%|########  | 106294/132723 [00:01&lt;00:00, 72612.93KB/s]
- 86%|########6 | 114672/132723 [00:01&lt;00:00, 75788.06KB/s]
- 92%|#########2| 122380/132723 [00:02&lt;00:00, 65867.14KB/s]
- 98%|#########8| 130499/132723 [00:02&lt;00:00, 69896.38KB/s]
-100%|##########| 132723/132723 [00:02&lt;00:00, 58197.62KB/s]
+  4%|4         | 5343/132723 [00:00&lt;00:02, 53423.68KB/s]
+ 10%|9         | 12897/132723 [00:00&lt;00:01, 66425.70KB/s]
+ 16%|#5        | 20616/132723 [00:00&lt;00:01, 71338.58KB/s]
+ 21%|##1       | 28452/132723 [00:00&lt;00:01, 74099.80KB/s]
+ 27%|##7       | 36419/132723 [00:00&lt;00:01, 76104.81KB/s]
+ 33%|###3      | 44387/132723 [00:00&lt;00:01, 77319.13KB/s]
+ 39%|###9      | 52418/132723 [00:00&lt;00:01, 78294.72KB/s]
+ 46%|####5     | 60422/132723 [00:00&lt;00:00, 78847.80KB/s]
+ 51%|#####1    | 68307/132723 [00:00&lt;00:01, 63664.85KB/s]
+ 57%|#####7    | 76293/132723 [00:01&lt;00:00, 67953.60KB/s]
+ 63%|######3   | 84258/132723 [00:01&lt;00:00, 71163.86KB/s]
+ 69%|######9   | 92226/132723 [00:01&lt;00:00, 73561.85KB/s]
+ 75%|#######5  | 100145/132723 [00:01&lt;00:00, 75175.09KB/s]
+ 82%|########1 | 108219/132723 [00:01&lt;00:00, 76791.56KB/s]
+ 87%|########7 | 116023/132723 [00:01&lt;00:00, 68569.23KB/s]
+ 93%|#########3| 124014/132723 [00:01&lt;00:00, 71646.66KB/s]
+100%|#########9| 132161/132723 [00:01&lt;00:00, 74389.55KB/s]
+100%|##########| 132723/132723 [00:01&lt;00:00, 72610.60KB/s]
 </pre></div>
 </div>
 <p>Create TVM runtime and do inference
@@ -520,7 +518,7 @@ Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from h
 <span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
 </pre></div>
 </div>
-<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  2.827 seconds)</p>
+<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  59.243 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-ssd-gluoncv-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/cccb17d28e5e8b2e94ea8cd5ec59f6ed/deploy_ssd_gluoncv.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_ssd_gluoncv.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/sg_execution_times.html b/docs/how_to/deploy_models/sg_execution_times.html
index 4e79d0266d..16de6acd7f 100644
--- a/docs/how_to/deploy_models/sg_execution_times.html
+++ b/docs/how_to/deploy_models/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-deploy-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>12:45.094</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
+<p><strong>12:47.834</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 86%" />
@@ -349,39 +349,39 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_object_detection_pytorch.html#sphx-glr-how-to-deploy-models-deploy-object-detection-pytorch-py"><span class="std std-ref">Compile PyTorch Object Detection Models</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_object_detection_pytorch.py</span></code>)</p></td>
-<td><p>03:10.512</p></td>
+<td><p>03:11.352</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_ssd_gluoncv.html#sphx-glr-how-to-deploy-models-deploy-ssd-gluoncv-py"><span class="std std-ref">Deploy Single Shot Multibox Detector(SSD) model</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_ssd_gluoncv.py</span></code>)</p></td>
-<td><p>03:02.827</p></td>
+<td><p>02:59.243</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized_tflite.html#sphx-glr-how-to-deploy-models-deploy-prequantized-tflite-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized_tflite.py</span></code>)</p></td>
-<td><p>02:26.314</p></td>
+<td><p>02:26.101</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_quantized.html#sphx-glr-how-to-deploy-models-deploy-quantized-py"><span class="std std-ref">Deploy a Quantized Model on Cuda</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_quantized.py</span></code>)</p></td>
-<td><p>01:34.749</p></td>
+<td><p>01:41.075</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized.html#sphx-glr-how-to-deploy-models-deploy-prequantized-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized.py</span></code>)</p></td>
-<td><p>01:05.283</p></td>
+<td><p>01:05.409</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_android.html#sphx-glr-how-to-deploy-models-deploy-model-on-android-py"><span class="std std-ref">Deploy the Pretrained Model on Android</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_android.py</span></code>)</p></td>
-<td><p>00:35.423</p></td>
+<td><p>00:35.326</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_model_on_nano.html#sphx-glr-how-to-deploy-models-deploy-model-on-nano-py"><span class="std std-ref">Deploy the Pretrained Model on Jetson Nano</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_nano.py</span></code>)</p></td>
-<td><p>00:25.200</p></td>
+<td><p>00:24.857</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_rasp.html#sphx-glr-how-to-deploy-models-deploy-model-on-rasp-py"><span class="std std-ref">Deploy the Pretrained Model on Raspberry Pi</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_rasp.py</span></code>)</p></td>
-<td><p>00:24.779</p></td>
+<td><p>00:24.464</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_sparse.html#sphx-glr-how-to-deploy-models-deploy-sparse-py"><span class="std std-ref">Deploy a Hugging Face Pruned Model on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_sparse.py</span></code>)</p></td>
-<td><p>00:00.006</p></td>
+<td><p>00:00.007</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/extend_tvm/bring_your_own_datatypes.html b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
index cb2b4bd6a8..c1a6f36da6 100644
--- a/docs/how_to/extend_tvm/bring_your_own_datatypes.html
+++ b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
@@ -621,7 +621,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
 <span class="n">module</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#dict" title="builtins.dict" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">params</span></a> <span class="o">=</span> <span class="n">get_mobilenet</span><span class="p">()</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip6669ce90-4254-4ca1-8a7b-de202ad7b39f from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip8412d4bf-c047-439a-a9c1-17e3480e18d4 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
 </pre></div>
 </div>
 <p>It’s easy to execute MobileNet with native TVM:</p>
diff --git a/docs/how_to/extend_tvm/sg_execution_times.html b/docs/how_to/extend_tvm/sg_execution_times.html
index 0f0550e2f5..55cb5ab967 100644
--- a/docs/how_to/extend_tvm/sg_execution_times.html
+++ b/docs/how_to/extend_tvm/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-extend-tvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:47.083</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
+<p><strong>00:47.456</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 84%" />
@@ -349,19 +349,19 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="bring_your_own_datatypes.html#sphx-glr-how-to-extend-tvm-bring-your-own-datatypes-py"><span class="std std-ref">Bring Your Own Datatypes to TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">bring_your_own_datatypes.py</span></code>)</p></td>
-<td><p>00:43.638</p></td>
+<td><p>00:44.053</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="use_pass_instrument.html#sphx-glr-how-to-extend-tvm-use-pass-instrument-py"><span class="std std-ref">How to Use TVM Pass Instrument</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_instrument.py</span></code>)</p></td>
-<td><p>00:02.411</p></td>
+<td><p>00:02.385</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="use_pass_infra.html#sphx-glr-how-to-extend-tvm-use-pass-infra-py"><span class="std std-ref">How to Use TVM Pass Infra</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_infra.py</span></code>)</p></td>
-<td><p>00:01.025</p></td>
+<td><p>00:01.010</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="low_level_custom_pass.html#sphx-glr-how-to-extend-tvm-low-level-custom-pass-py"><span class="std std-ref">Writing a Customized Pass</span></a> (<code class="docutils literal notranslate"><span class="pre">low_level_custom_pass.py</span></code>)</p></td>
-<td><p>00:00.009</p></td>
+<td><p>00:00.008</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/extend_tvm/use_pass_instrument.html b/docs/how_to/extend_tvm/use_pass_instrument.html
index 81936b46af..9042daa25a 100644
--- a/docs/how_to/extend_tvm/use_pass_instrument.html
+++ b/docs/how_to/extend_tvm/use_pass_instrument.html
@@ -525,10 +525,10 @@ profile the execution time of each passes.</p>
 </pre></div>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 7157us [7157us] (46.40%; 46.40%)
-FoldScaleAxis: 8268us [6us] (53.60%; 53.60%)
-        FoldConstant: 8262us [1673us] (53.56%; 99.93%)
-                InferType: 6589us [6589us] (42.72%; 79.75%)
+InferType: 7107us [7107us] (46.22%; 46.22%)
+FoldScaleAxis: 8271us [7us] (53.78%; 53.78%)
+        FoldConstant: 8264us [1645us] (53.74%; 99.92%)
+                InferType: 6620us [6620us] (43.05%; 80.10%)
 </pre></div>
 </div>
 </div>
@@ -550,10 +550,10 @@ Refer to following sections and <a class="reference internal" href="../../refere
 </pre></div>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 6620us [6620us] (44.98%; 44.98%)
-FoldScaleAxis: 8100us [5us] (55.02%; 55.02%)
-        FoldConstant: 8095us [1657us] (54.99%; 99.94%)
-                InferType: 6438us [6438us] (43.73%; 79.53%)
+InferType: 6581us [6581us] (45.29%; 45.29%)
+FoldScaleAxis: 7949us [5us] (54.71%; 54.71%)
+        FoldConstant: 7944us [1616us] (54.67%; 99.94%)
+                InferType: 6328us [6328us] (43.55%; 79.65%)
 </pre></div>
 </div>
 <p>Register empty list to clear existing instruments.</p>
diff --git a/docs/how_to/optimize_operators/opt_conv_cuda.html b/docs/how_to/optimize_operators/opt_conv_cuda.html
index 2de017e734..4cdad6656e 100644
--- a/docs/how_to/optimize_operators/opt_conv_cuda.html
+++ b/docs/how_to/optimize_operators/opt_conv_cuda.html
@@ -577,7 +577,7 @@ latency of convolution.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Convolution: </span><span class="si">%f</span><span class="s2"> ms&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">b</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">*</span> <span cl [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 53.378337 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 50.913246 ms
 </pre></div>
 </div>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-optimize-operators-opt-conv-cuda-py">
diff --git a/docs/how_to/optimize_operators/opt_conv_tensorcore.html b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
index bca7bc46bc..b1e534c47b 100644
--- a/docs/how_to/optimize_operators/opt_conv_tensorcore.html
+++ b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
@@ -914,7 +914,7 @@ be able to run on our build server</p>
     <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;conv2d with tensor core: </span><span class="si">%f</span><span class="s2"> ms&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">* [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 12.173318 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 12.974272 ms
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/optimize_operators/opt_gemm.html b/docs/how_to/optimize_operators/opt_gemm.html
index fc3bef1647..cf4fe5c86d 100644
--- a/docs/how_to/optimize_operators/opt_gemm.html
+++ b/docs/how_to/optimize_operators/opt_gemm.html
@@ -474,8 +474,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Baseline: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018595
-Baseline: 3.199267
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018348
+Baseline: 3.426208
 </pre></div>
 </div>
 <p>In TVM, we can always inspect lower level IR to debug or optimize our schedule.
@@ -534,7 +534,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt1: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.297510
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.290674
 </pre></div>
 </div>
 <p>Here is the generated IR after blocking.</p>
@@ -600,7 +600,7 @@ vastly.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt2: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.332023
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.333141
 </pre></div>
 </div>
 <p>Here is the generated IR after vectorization.</p>
@@ -660,7 +660,7 @@ the access pattern for A matrix is more cache friendly.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt3: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.115810
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.117250
 </pre></div>
 </div>
 <p>Here is the generated IR after loop permutation.</p>
@@ -742,7 +742,7 @@ flattening.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt4: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.109238
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.109634
 </pre></div>
 </div>
 <p>Here is the generated IR after array packing.</p>
@@ -827,7 +827,7 @@ write to C when all the block results are ready.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt5: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111007
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111331
 </pre></div>
 </div>
 <p>Here is the generated IR after blocking.</p>
@@ -916,7 +916,7 @@ write to C when all the block results are ready.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt6: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">opt6_time</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.146501
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.146571
 </pre></div>
 </div>
 <p>Here is the generated IR after parallelization.</p>
diff --git a/docs/how_to/optimize_operators/sg_execution_times.html b/docs/how_to/optimize_operators/sg_execution_times.html
index ac99d06699..f02fd213e3 100644
--- a/docs/how_to/optimize_operators/sg_execution_times.html
+++ b/docs/how_to/optimize_operators/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-optimize-operators-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:34.117</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
+<p><strong>00:34.896</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 83%" />
@@ -349,15 +349,15 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="opt_gemm.html#sphx-glr-how-to-optimize-operators-opt-gemm-py"><span class="std std-ref">How to optimize GEMM on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_gemm.py</span></code>)</p></td>
-<td><p>00:31.586</p></td>
+<td><p>00:32.200</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="opt_conv_tensorcore.html#sphx-glr-how-to-optimize-operators-opt-conv-tensorcore-py"><span class="std std-ref">How to optimize convolution using TensorCores</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_tensorcore.py</span></code>)</p></td>
-<td><p>00:01.464</p></td>
+<td><p>00:01.529</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="opt_conv_cuda.html#sphx-glr-how-to-optimize-operators-opt-conv-cuda-py"><span class="std std-ref">How to optimize convolution on GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_cuda.py</span></code>)</p></td>
-<td><p>00:01.067</p></td>
+<td><p>00:01.167</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
index 4c3d31ecd9..4433c7838c 100644
--- a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
+++ b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-tune-with-autoscheduler-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>09:04.571</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
+<p><strong>08:56.121</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 85%" />
@@ -349,27 +349,27 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_layer_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py"><span class="std std-ref">Auto-scheduling a Convolution Layer for GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_layer_cuda.py</span></code>)</p></td>
-<td><p>05:36.615</p></td>
+<td><p>05:33.874</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_network_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-x86-py"><span class="std std-ref">Auto-scheduling a Neural Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_x86.py</span></code>)</p></td>
-<td><p>01:32.239</p></td>
+<td><p>01:31.040</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py"><span class="std std-ref">Auto-scheduling a Neural Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_cuda.py</span></code>)</p></td>
-<td><p>01:03.601</p></td>
+<td><p>01:00.605</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_sparse_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-sparse-x86-py"><span class="std std-ref">Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_sparse_x86.py</span></code>)</p></td>
-<td><p>00:28.731</p></td>
+<td><p>00:27.200</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_network_arm.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-arm-py"><span class="std std-ref">Auto-scheduling a Neural Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_arm.py</span></code>)</p></td>
-<td><p>00:12.062</p></td>
+<td><p>00:12.036</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_network_mali.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-mali-py"><span class="std std-ref">Auto-scheduling a Neural Network for mali GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_mali.py</span></code>)</p></td>
-<td><p>00:11.323</p></td>
+<td><p>00:11.366</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
index 86b7f901a8..bb430a5f0c 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
@@ -503,483 +503,86 @@ cooperative fetching, unrolling and operator fusion.</p>
              bias: Buffer(bias_2: Pointer(float32), float32, [1, 512, 1, 1], []),
              compute: Buffer(compute_2: Pointer(float32), float32, [1, 512, 7, 7], [])}
   buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute} {
-  attr [IterVar(blockIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;blockIdx.x&quot;)] &quot;thread_extent&quot; = 28;
-  allocate(conv2d_nchw: Pointer(local float32), float32, [14]), storage_scope = local;
-  allocate(pad_temp.shared: Pointer(shared float32), float32, [72]), storage_scope = shared;
-  allocate(kernel.shared: Pointer(shared float32), float32, [3072]), storage_scope = shared;
-  attr [IterVar(threadIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64 {
-    conv2d_nchw_1: Buffer(conv2d_nchw, float32, [14], [], scope=&quot;local&quot;, align=32)[0] = 0f32
-    conv2d_nchw_1[1] = 0f32
+  attr [IterVar(blockIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;blockIdx.x&quot;)] &quot;thread_extent&quot; = 16;
+  allocate(conv2d_nchw: Pointer(local float32), float32, [4]), storage_scope = local;
+  allocate(pad_temp.shared: Pointer(shared float32), float32, [1008]), storage_scope = shared;
+  allocate(kernel.shared: Pointer(shared float32), float32, [1536]), storage_scope = shared;
+  attr [IterVar(threadIdx.x: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 392 {
+    conv2d_nchw_1: Buffer(conv2d_nchw, float32, [4], [], scope=&quot;local&quot;, align=8)[0] = 0f32
     conv2d_nchw_1[2] = 0f32
+    conv2d_nchw_1[1] = 0f32
     conv2d_nchw_1[3] = 0f32
-    conv2d_nchw_1[4] = 0f32
-    conv2d_nchw_1[5] = 0f32
-    conv2d_nchw_1[6] = 0f32
-    conv2d_nchw_1[7] = 0f32
-    conv2d_nchw_1[8] = 0f32
-    conv2d_nchw_1[9] = 0f32
-    conv2d_nchw_1[10] = 0f32
-    conv2d_nchw_1[11] = 0f32
-    conv2d_nchw_1[12] = 0f32
-    conv2d_nchw_1[13] = 0f32
-    for (rc.outer.outer: int32, 0, 64) {
+    for (rc.outer.outer: int32, 0, 32) {
       for (ry.outer.outer: int32, 0, 3) {
-        let cse_var_2: int32 = (rc.outer.outer*72)
+        let cse_var_4: int32 = (rc.outer.outer*784)
+        let cse_var_3: int32 = (rc.outer.outer*144)
+        let cse_var_2: int32 = (ry.outer.outer*7)
         let cse_var_1: int32 = (ry.outer.outer*3)
          {
-          attr [IterVar(threadIdx.x_1: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64 {
-            if @tir.likely((threadIdx.x_1 &lt; 18), dtype=bool) {
-              pad_temp.shared_1: Buffer(pad_temp.shared, float32, [72], [], scope=&quot;shared&quot;)[(threadIdx.x_1*4)] = @tir.if_then_else(((((1 &lt;= (ry.outer.outer + floormod(blockIdx.x, 7))) &amp;&amp; ((ry.outer.outer + floormod(blockIdx.x, 7)) &lt; 8)) &amp;&amp; (1 &lt;= floormod((threadIdx.x_1*4), 9))) &amp;&amp; (floormod((threadIdx.x_1*4), 9) &lt; 8)), data_3: Buffer(data_2, float32, [25088], [])[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1*4), 9)*49)) + (ry.outer.out [...]
-            }
-            if @tir.likely((threadIdx.x_1 &lt; 18), dtype=bool) {
-              pad_temp.shared_1[((threadIdx.x_1*4) + 1)] = @tir.if_then_else(((((1 &lt;= (ry.outer.outer + floormod(blockIdx.x, 7))) &amp;&amp; ((ry.outer.outer + floormod(blockIdx.x, 7)) &lt; 8)) &amp;&amp; (1 &lt;= floormod(((threadIdx.x_1*4) + 1), 9))) &amp;&amp; (floormod(((threadIdx.x_1*4) + 1), 9) &lt; 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 1), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 1), 9)) - 8)], [...]
+          attr [IterVar(threadIdx.x_1: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 392 {
+            pad_temp.shared_1: Buffer(pad_temp.shared, float32, [1008], [], scope=&quot;shared&quot;)[(threadIdx.x_1*2)] = @tir.if_then_else(((((1 &lt;= (floordiv(floormod((threadIdx.x_1*2), 63), 9) + ry.outer.outer)) &amp;&amp; ((floordiv(floormod((threadIdx.x_1*2), 63), 9) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod((threadIdx.x_1*2), 9))) &amp;&amp; (floormod((threadIdx.x_1*2), 9) &lt; 8)), data_3: Buffer(data_2, float32, [25088], [])[((((cse_var_4 + (floordiv((threadIdx.x [...]
+            pad_temp.shared_1[((threadIdx.x_1*2) + 1)] = @tir.if_then_else(((((1 &lt;= (floordiv(floormod(((threadIdx.x_1*2) + 1), 63), 9) + ry.outer.outer)) &amp;&amp; ((floordiv(floormod(((threadIdx.x_1*2) + 1), 63), 9) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(((threadIdx.x_1*2) + 1), 9))) &amp;&amp; (floormod(((threadIdx.x_1*2) + 1), 9) &lt; 8)), data_3[((((cse_var_4 + (floordiv(((threadIdx.x_1*2) + 1), 9)*7)) + cse_var_2) + floormod(((threadIdx.x_1*2) + 1), 9)) - 8)],  [...]
+          }
+          attr [IterVar(threadIdx.x_1, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 392 {
+            if @tir.likely((threadIdx.x_1 &lt; 112), dtype=bool) {
+              pad_temp.shared_1[((threadIdx.x_1*2) + 784)] = @tir.if_then_else(((((1 &lt;= (floordiv(floormod(((threadIdx.x_1*2) + 28), 63), 9) + ry.outer.outer)) &amp;&amp; ((floordiv(floormod(((threadIdx.x_1*2) + 28), 63), 9) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(((threadIdx.x_1*2) + 1), 9))) &amp;&amp; (floormod(((threadIdx.x_1*2) + 1), 9) &lt; 8)), data_3[((((cse_var_4 + (floordiv(((threadIdx.x_1*2) + 784), 9)*7)) + cse_var_2) + floormod(((threadIdx.x_1*2) + 1), 9)) [...]
             }
-            if @tir.likely((threadIdx.x_1 &lt; 18), dtype=bool) {
-              pad_temp.shared_1[((threadIdx.x_1*4) + 2)] = @tir.if_then_else(((((1 &lt;= (ry.outer.outer + floormod(blockIdx.x, 7))) &amp;&amp; ((ry.outer.outer + floormod(blockIdx.x, 7)) &lt; 8)) &amp;&amp; (1 &lt;= floormod(((threadIdx.x_1*4) + 2), 9))) &amp;&amp; (floormod(((threadIdx.x_1*4) + 2), 9) &lt; 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 2), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 2), 9)) - 8)], [...]
+            if @tir.likely((threadIdx.x_1 &lt; 112), dtype=bool) {
+              pad_temp.shared_1[((threadIdx.x_1*2) + 785)] = @tir.if_then_else(((((1 &lt;= (floordiv(floormod(((threadIdx.x_1*2) + 29), 63), 9) + ry.outer.outer)) &amp;&amp; ((floordiv(floormod(((threadIdx.x_1*2) + 29), 63), 9) + ry.outer.outer) &lt; 8)) &amp;&amp; (1 &lt;= floormod(((threadIdx.x_1*2) + 2), 9))) &amp;&amp; (floormod(((threadIdx.x_1*2) + 2), 9) &lt; 8)), data_3[((((cse_var_4 + (floordiv(((threadIdx.x_1*2) + 785), 9)*7)) + cse_var_2) + floormod(((threadIdx.x_1*2) + 2), 9)) [...]
             }
-            if @tir.likely((threadIdx.x_1 &lt; 18), dtype=bool) {
-              pad_temp.shared_1[((threadIdx.x_1*4) + 3)] = @tir.if_then_else(((((1 &lt;= (ry.outer.outer + floormod(blockIdx.x, 7))) &amp;&amp; ((ry.outer.outer + floormod(blockIdx.x, 7)) &lt; 8)) &amp;&amp; (1 &lt;= floormod(((threadIdx.x_1*4) + 3), 9))) &amp;&amp; (floormod(((threadIdx.x_1*4) + 3), 9) &lt; 8)), data_3[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 3), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 3), 9)) - 8)], [...]
+          }
+          attr [IterVar(threadIdx.x_2: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 392;
+          kernel.shared_1: Buffer(kernel.shared, float32, [1536], [], scope=&quot;shared&quot;)[threadIdx.x_2] = kernel_3: Buffer(kernel_2, float32, [2359296], [])[((((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 48)*4608)) + cse_var_3) + (floordiv(floormod(threadIdx.x_2, 48), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 392;
+          kernel.shared_1[(threadIdx.x_2 + 392)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 392), 48)*4608)) + cse_var_3) + (floordiv(floormod((threadIdx.x_2 + 8), 48), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 392;
+          kernel.shared_1[(threadIdx.x_2 + 784)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 784), 48)*4608)) + cse_var_3) + (floordiv(floormod((threadIdx.x_2 + 16), 48), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
+          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 392;
+          if @tir.likely((threadIdx.x_2 &lt; 360), dtype=bool) {
+            kernel.shared_1[(threadIdx.x_2 + 1176)] = kernel_3[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1176), 48)*4608)) + cse_var_3) + (floormod((floordiv(threadIdx.x_2, 3) + 8), 16)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3))]
+          }
+          for (rc.outer.inner: int32, 0, 2) {
+            for (rx.outer.inner: int32, 0, 3) {
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7))]*kernel.shared_1[(((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7))]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 768)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7))]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 48)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7))]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 816)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 3)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 771)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 51)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 819)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 6)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 774)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 54)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 822)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 189)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 9)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 189)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 777)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 189)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 57)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 189)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 825)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 252)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 12)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 252)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 780)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 252)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 60)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 252)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 828)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 315)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 15)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 315)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 783)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 315)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 63)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 315)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 831)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 378)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 18)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 378)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 786)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 378)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 66)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 378)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 834)]))
+              conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 441)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 21)]))
+              conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 441)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 789)]))
+              conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 441)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 69)]))
+              conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((((rc.outer.inner*504) + (floordiv(floormod(threadIdx.x, 49), 7)*9)) + rx.outer.inner) + floormod(threadIdx.x, 7)) + 441)]*kernel.shared_1[((((floordiv(threadIdx.x, 49)*96) + (rc.outer.inner*24)) + rx.outer.inner) + 837)]))
             }
           }
-          attr [IterVar(threadIdx.x_2: int32, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1: Buffer(kernel.shared, float32, [3072], [], scope=&quot;shared&quot;)[threadIdx.x_2] = kernel_3: Buffer(kernel_2, float32, [2359296], [])[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 64)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 64), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 128)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 128), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 192)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 36864)]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 256)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 256), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 320)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 320), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 384)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 73728)]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 448)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 448), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 512)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 512), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 576)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 110592)]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 640)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 640), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 704)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 704), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 768)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 147456)]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 832)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 832), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 896)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 896), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 960)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 184320)]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 1024)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1024), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 1088)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1088), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 1152)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 221184)]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 1216)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1216), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 1280)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1280), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 258048)]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 1408)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1408), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 1472)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1472), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 1536)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 294912)]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 1600)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1600), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 1664)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1664), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 1728)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 331776)]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1792), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 1856)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1856), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 1920)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 368640)]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 1984)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1984), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 2048)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2048), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 2112)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 405504)]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 2176)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2176), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2240), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 2304)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 442368)]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 2368)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2368), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 2432)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2432), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 2496)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 479232)]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 2560)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2560), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 2624)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2624), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 516096)]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 2752)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2752), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 2816)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2816), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 2880)] = kernel_3[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 552960)]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 2944)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2944), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
-          attr [IterVar(threadIdx.x_2, (nullptr), &quot;ThreadIndex&quot;, &quot;threadIdx.x&quot;)] &quot;thread_extent&quot; = 64;
-          kernel.shared_1[(threadIdx.x_2 + 3008)] = kernel_3[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 3008), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[0]*kernel.shared_1[(threadIdx.x*48)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 3)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[1]*kernel.shared_1[(threadIdx.x*48)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 3)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[2]*kernel.shared_1[(threadIdx.x*48)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 3)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[3]*kernel.shared_1[(threadIdx.x*48)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 3)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[4]*kernel.shared_1[(threadIdx.x*48)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 3)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[5]*kernel.shared_1[(threadIdx.x*48)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 3)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[6]*kernel.shared_1[(threadIdx.x*48)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 3)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[0]*kernel.shared_1[((threadIdx.x*48) + 24)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 27)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 24)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 27)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 24)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 27)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 24)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 27)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 24)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 27)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 24)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 27)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 24)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 27)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 1)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 4)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 1)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 4)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 1)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 4)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 1)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 4)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 1)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 4)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 1)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 4)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 1)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 4)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 25)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 28)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 25)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 28)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 25)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 28)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 25)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 28)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 25)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 28)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 25)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 28)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 25)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 28)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 2)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 5)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 2)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 5)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 2)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 5)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 2)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 5)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 2)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 5)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 2)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 5)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 2)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 5)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 26)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 29)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 26)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 29)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 26)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 29)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 26)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 29)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 26)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 29)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 26)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 29)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 26)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 29)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 6)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 9)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 6)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 9)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 6)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 9)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 6)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 9)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 6)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 9)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 6)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 9)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 6)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 9)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 30)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 33)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 30)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 33)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 30)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 33)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 30)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 33)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 30)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 33)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 30)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 33)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 30)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 33)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 7)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 10)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 7)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 10)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 7)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 10)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 7)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 10)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 7)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 10)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 7)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 10)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 7)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 10)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 31)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 34)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 31)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 34)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 31)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 34)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 31)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 34)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 31)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 34)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 31)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 34)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 31)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 34)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 8)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 11)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 8)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 11)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 8)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 11)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 8)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 11)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 8)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 11)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 8)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 11)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 8)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 11)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 32)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 35)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 32)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 35)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 32)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 35)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 32)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 35)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 32)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 35)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 32)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 35)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 32)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 35)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 12)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 15)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 12)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 15)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 12)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 15)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 12)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 15)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 12)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 15)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 12)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 15)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 12)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 15)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 36)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 39)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 36)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 39)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 36)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 39)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 36)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 39)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 36)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 39)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 36)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 39)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 36)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 39)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 13)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 16)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 13)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 16)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 13)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 16)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 13)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 16)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 13)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 16)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 13)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 16)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 13)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 16)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 37)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 40)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 37)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 40)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 37)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 40)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 37)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 40)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 37)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 40)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 37)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 40)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 37)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 40)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 14)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 17)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 14)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 17)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 14)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 17)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 14)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 17)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 14)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 17)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 14)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 17)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 14)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 17)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 38)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 41)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 38)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 41)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 38)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 41)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 38)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 41)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 38)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 41)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 38)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 41)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 38)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 41)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 18)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 21)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 18)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 21)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 18)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 21)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 18)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 21)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 18)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 21)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 18)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 21)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 18)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 21)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 42)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 45)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 42)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 45)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 42)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 45)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 42)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 45)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 42)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 45)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 42)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 45)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 42)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 45)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 19)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 22)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 19)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 22)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 19)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 22)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 19)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 22)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 19)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 22)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 19)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 22)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 19)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 22)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 43)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 46)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 43)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 46)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 43)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 46)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 43)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 46)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 43)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 46)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 43)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 46)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 43)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 46)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 20)]))
-          conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 23)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 20)]))
-          conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 23)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 20)]))
-          conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 23)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 20)]))
-          conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 23)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 20)]))
-          conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 23)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 20)]))
-          conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 23)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 20)]))
-          conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 23)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 44)]))
-          conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 47)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 44)]))
-          conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 47)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 44)]))
-          conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 47)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 44)]))
-          conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 47)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 44)]))
-          conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 47)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 44)]))
-          conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 47)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 44)]))
-          conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 47)]))
         }
       }
     }
     for (i1.inner: int32, 0, 2) {
-      for (i3.inner: int32, 0, 7) {
-        compute_3: Buffer(compute_2, float32, [25088], [])[(((((floordiv(blockIdx.x, 7)*6272) + (threadIdx.x*98)) + (i1.inner*49)) + (floormod(blockIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[((i1.inner*7) + i3.inner)] + bias_3: Buffer(bias_2, float32, [512], [])[(((floordiv(blockIdx.x, 7)*128) + (threadIdx.x*2)) + i1.inner)]), 0f32)
-      }
+      compute_3: Buffer(compute_2, float32, [25088], [])[((((blockIdx.x*1568) + (floordiv(threadIdx.x, 49)*98)) + (i1.inner*49)) + floormod(threadIdx.x, 49))] = max((conv2d_nchw_1[i1.inner] + bias_3: Buffer(bias_2, float32, [512], [])[(((blockIdx.x*32) + (floordiv(threadIdx.x, 49)*2)) + i1.inner)]), 0f32)
+      compute_3[(((((blockIdx.x*1568) + (floordiv(threadIdx.x, 49)*98)) + (i1.inner*49)) + floormod(threadIdx.x, 49)) + 784)] = max((conv2d_nchw_1[(i1.inner + 2)] + bias_3[((((blockIdx.x*32) + (floordiv(threadIdx.x, 49)*2)) + i1.inner) + 16)]), 0f32)
     }
   }
 }
@@ -1016,7 +619,7 @@ cooperative fetching, unrolling and operator fusion.</p>
 <span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.362 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.315 ms
 </pre></div>
 </div>
 </div>
@@ -1045,20 +648,20 @@ conv2d_nchw_nn_o_i, conv2d_nchw_nn_i = s[conv2d_nchw].split(conv2d_nchw_nn, fact
 conv2d_nchw_nn_o_o_i, conv2d_nchw_nn_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_i, factor=1)
 conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
 conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
-conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
-conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
-conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=64)
-conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
+conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=2)
+conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
+conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=8)
+conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=2)
 conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
 conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
-conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
+conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
 conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
 conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
-conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
-conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
+conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
+conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
 conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
-conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
-conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=4)
+conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=8)
+conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=2)
 conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
 conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
 conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
@@ -1068,13 +671,13 @@ compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
 compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
 compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
 compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
-compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
-compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
+compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=8)
+compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=2)
 compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
-compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
+compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
 compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
-compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
+compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
+compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
 compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
 s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
 s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -1094,14 +697,14 @@ s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread
 kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
 kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
 s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=392)
 s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis(&quot;threadIdx.x&quot;))
 pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=2)
 s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=392)
 s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis(&quot;threadIdx.x&quot;))
-s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, &quot;auto_unroll_max_step&quot;, 512)
+s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, &quot;auto_unroll_max_step&quot;, 64)
 s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, &quot;unroll_explicit&quot;, True)
 
 CUDA source code:
@@ -1119,430 +722,73 @@ CUDA source code:
   #define int64_t long long
   #define uint64_t unsigned long long
 #endif
-extern &quot;C&quot; __global__ void __launch_bounds__(64) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
-  float conv2d_nchw[14];
-  __shared__ float pad_temp_shared[72];
-  __shared__ float kernel_shared[3072];
+extern &quot;C&quot; __global__ void __launch_bounds__(392) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+  float conv2d_nchw[4];
+  __shared__ float pad_temp_shared[1008];
+  __shared__ float kernel_shared[1536];
   conv2d_nchw[0] = 0.000000e+00f;
-  conv2d_nchw[1] = 0.000000e+00f;
   conv2d_nchw[2] = 0.000000e+00f;
+  conv2d_nchw[1] = 0.000000e+00f;
   conv2d_nchw[3] = 0.000000e+00f;
-  conv2d_nchw[4] = 0.000000e+00f;
-  conv2d_nchw[5] = 0.000000e+00f;
-  conv2d_nchw[6] = 0.000000e+00f;
-  conv2d_nchw[7] = 0.000000e+00f;
-  conv2d_nchw[8] = 0.000000e+00f;
-  conv2d_nchw[9] = 0.000000e+00f;
-  conv2d_nchw[10] = 0.000000e+00f;
-  conv2d_nchw[11] = 0.000000e+00f;
-  conv2d_nchw[12] = 0.000000e+00f;
-  conv2d_nchw[13] = 0.000000e+00f;
-  for (int rc_outer_outer = 0; rc_outer_outer &lt; 64; ++rc_outer_outer) {
+  for (int rc_outer_outer = 0; rc_outer_outer &lt; 32; ++rc_outer_outer) {
     for (int ry_outer_outer = 0; ry_outer_outer &lt; 3; ++ry_outer_outer) {
       __syncthreads();
-      if (((int)threadIdx.x) &lt; 18) {
-        pad_temp_shared[(((int)threadIdx.x) * 4)] = (((((1 &lt;= (ry_outer_outer + (((int)blockIdx.x) % 7))) &amp;&amp; ((ry_outer_outer + (((int)blockIdx.x) % 7)) &lt; 8)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) * 4) % 9))) &amp;&amp; (((((int)threadIdx.x) * 4) % 9) &lt; 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) * 4) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) * 4) % 9)) - 8)] : 0.000000e+00f);
-      }
-      if (((int)threadIdx.x) &lt; 18) {
-        pad_temp_shared[((((int)threadIdx.x) * 4) + 1)] = (((((1 &lt;= (ry_outer_outer + (((int)blockIdx.x) % 7))) &amp;&amp; ((ry_outer_outer + (((int)blockIdx.x) % 7)) &lt; 8)) &amp;&amp; (1 &lt;= (((((int)threadIdx.x) * 4) + 1) % 9))) &amp;&amp; ((((((int)threadIdx.x) * 4) + 1) % 9) &lt; 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 1) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 1) % 9)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[(((int)threadIdx.x) * 2)] = (((((1 &lt;= ((((((int)threadIdx.x) * 2) % 63) / 9) + ry_outer_outer)) &amp;&amp; (((((((int)threadIdx.x) * 2) % 63) / 9) + ry_outer_outer) &lt; 8)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) * 2) % 9))) &amp;&amp; (((((int)threadIdx.x) * 2) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) * 2) / 9) * 7)) + (ry_outer_outer * 7)) + ((((int)threadIdx.x) * 2) % 9)) - 8)] : 0.000000e+00f);
+      pad_temp_shared[((((int)threadIdx.x) * 2) + 1)] = (((((1 &lt;= (((((((int)threadIdx.x) * 2) + 1) % 63) / 9) + ry_outer_outer)) &amp;&amp; ((((((((int)threadIdx.x) * 2) + 1) % 63) / 9) + ry_outer_outer) &lt; 8)) &amp;&amp; (1 &lt;= (((((int)threadIdx.x) * 2) + 1) % 9))) &amp;&amp; ((((((int)threadIdx.x) * 2) + 1) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 7)) + (ry_outer_outer * 7)) + (((((int)threadIdx.x) * 2) + 1) % 9)) - 8)] : 0.0000 [...]
+      if (((int)threadIdx.x) &lt; 112) {
+        pad_temp_shared[((((int)threadIdx.x) * 2) + 784)] = (((((1 &lt;= (((((((int)threadIdx.x) * 2) + 28) % 63) / 9) + ry_outer_outer)) &amp;&amp; ((((((((int)threadIdx.x) * 2) + 28) % 63) / 9) + ry_outer_outer) &lt; 8)) &amp;&amp; (1 &lt;= (((((int)threadIdx.x) * 2) + 1) % 9))) &amp;&amp; ((((((int)threadIdx.x) * 2) + 1) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 2) + 784) / 9) * 7)) + (ry_outer_outer * 7)) + (((((int)threadIdx.x) * 2) + 1) % 9)) - 8)]  [...]
       }
-      if (((int)threadIdx.x) &lt; 18) {
-        pad_temp_shared[((((int)threadIdx.x) * 4) + 2)] = (((((1 &lt;= (ry_outer_outer + (((int)blockIdx.x) % 7))) &amp;&amp; ((ry_outer_outer + (((int)blockIdx.x) % 7)) &lt; 8)) &amp;&amp; (1 &lt;= (((((int)threadIdx.x) * 4) + 2) % 9))) &amp;&amp; ((((((int)threadIdx.x) * 4) + 2) % 9) &lt; 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 2) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 2) % 9)) - 8)] : 0.000000e+00f);
+      if (((int)threadIdx.x) &lt; 112) {
+        pad_temp_shared[((((int)threadIdx.x) * 2) + 785)] = (((((1 &lt;= (((((((int)threadIdx.x) * 2) + 29) % 63) / 9) + ry_outer_outer)) &amp;&amp; ((((((((int)threadIdx.x) * 2) + 29) % 63) / 9) + ry_outer_outer) &lt; 8)) &amp;&amp; (1 &lt;= (((((int)threadIdx.x) * 2) + 2) % 9))) &amp;&amp; ((((((int)threadIdx.x) * 2) + 2) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 2) + 785) / 9) * 7)) + (ry_outer_outer * 7)) + (((((int)threadIdx.x) * 2) + 2) % 9)) - 8)]  [...]
       }
-      if (((int)threadIdx.x) &lt; 18) {
-        pad_temp_shared[((((int)threadIdx.x) * 4) + 3)] = (((((1 &lt;= (ry_outer_outer + (((int)blockIdx.x) % 7))) &amp;&amp; ((ry_outer_outer + (((int)blockIdx.x) % 7)) &lt; 8)) &amp;&amp; (1 &lt;= (((((int)threadIdx.x) * 4) + 3) % 9))) &amp;&amp; ((((((int)threadIdx.x) * 4) + 3) % 9) &lt; 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 3) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 3) % 9)) - 8)] : 0.000000e+00f);
+      kernel_shared[((int)threadIdx.x)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) % 48) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 392)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 392) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 8) % 48) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+      kernel_shared[(((int)threadIdx.x) + 784)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 784) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 16) % 48) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+      if (((int)threadIdx.x) &lt; 360) {
+        kernel_shared[(((int)threadIdx.x) + 1176)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1176) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 8) &amp; 15) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
       }
-      kernel_shared[((int)threadIdx.x)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 64)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 64) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 128)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 128) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 192)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 36864)];
-      kernel_shared[(((int)threadIdx.x) + 256)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 256) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 320)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 320) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 384)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 73728)];
-      kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 512)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 512) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 576)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 110592)];
-      kernel_shared[(((int)threadIdx.x) + 640)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 640) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 704)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 704) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 768)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 147456)];
-      kernel_shared[(((int)threadIdx.x) + 832)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 832) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 960)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 184320)];
-      kernel_shared[(((int)threadIdx.x) + 1024)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1024) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1088)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1088) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1152)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 221184)];
-      kernel_shared[(((int)threadIdx.x) + 1216)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1216) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1280)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1280) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 258048)];
-      kernel_shared[(((int)threadIdx.x) + 1408)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1408) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1472)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1472) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1536)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 294912)];
-      kernel_shared[(((int)threadIdx.x) + 1600)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1600) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1664)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1664) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1728)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 331776)];
-      kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1856)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1856) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1920)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 368640)];
-      kernel_shared[(((int)threadIdx.x) + 1984)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1984) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2048)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2048) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2112)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 405504)];
-      kernel_shared[(((int)threadIdx.x) + 2176)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2176) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2304)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 442368)];
-      kernel_shared[(((int)threadIdx.x) + 2368)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2368) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2432)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2432) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2496)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 479232)];
-      kernel_shared[(((int)threadIdx.x) + 2560)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2560) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2624)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2624) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 516096)];
-      kernel_shared[(((int)threadIdx.x) + 2752)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2752) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2816)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2816) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2880)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 552960)];
-      kernel_shared[(((int)threadIdx.x) + 2944)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2944) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 3008)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3008) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
       __syncthreads();
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[0] * kernel_shared[(((int)threadIdx.x) * 48)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[1] * kernel_shared[(((int)threadIdx.x) * 48)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[2] * kernel_shared[(((int)threadIdx.x) * 48)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[3] * kernel_shared[(((int)threadIdx.x) * 48)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[4] * kernel_shared[(((int)threadIdx.x) * 48)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[5] * kernel_shared[(((int)threadIdx.x) * 48)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[6] * kernel_shared[(((int)threadIdx.x) * 48)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[0] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+      for (int rc_outer_inner = 0; rc_outer_inner &lt; 2; ++rc_outer_inner) {
+        for (int rx_outer_inner = 0; rx_outer_inner &lt; 3; ++rx_outer_inner) {
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7))] * kernel_shared[((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7))] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 768)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7))] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 48)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7))] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 816)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 3)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 771)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 51)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 819)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 6)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 774)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 54)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 822)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 189)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 9)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 189)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 777)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 189)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 57)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 189)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 825)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 252)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 12)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 252)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 780)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 252)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 60)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 252)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 828)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 315)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 15)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 315)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 783)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 315)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 63)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 315)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 831)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 378)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 18)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 378)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 786)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 378)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 66)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 378)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 834)]));
+          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 441)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 21)]));
+          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 441)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 789)]));
+          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 441)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 69)]));
+          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((rc_outer_inner * 504) + (((((int)threadIdx.x) % 49) / 7) * 9)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 441)] * kernel_shared[(((((((int)threadIdx.x) / 49) * 96) + (rc_outer_inner * 24)) + rx_outer_inner) + 837)]));
+        }
+      }
     }
   }
   for (int i1_inner = 0; i1_inner &lt; 2; ++i1_inner) {
-    for (int i3_inner = 0; i3_inner &lt; 7; ++i3_inner) {
-      compute[((((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 98)) + (i1_inner * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) * 2)) + i1_inner)]), 0.000000e+00f);
-    }
+    compute[((((((int)blockIdx.x) * 1568) + ((((int)threadIdx.x) / 49) * 98)) + (i1_inner * 49)) + (((int)threadIdx.x) % 49))] = max((conv2d_nchw[i1_inner] + bias[(((((int)blockIdx.x) * 32) + ((((int)threadIdx.x) / 49) * 2)) + i1_inner)]), 0.000000e+00f);
+    compute[(((((((int)blockIdx.x) * 1568) + ((((int)threadIdx.x) / 49) * 98)) + (i1_inner * 49)) + (((int)threadIdx.x) % 49)) + 784)] = max((conv2d_nchw[(i1_inner + 2)] + bias[((((((int)blockIdx.x) * 32) + ((((int)threadIdx.x) / 49) * 2)) + i1_inner) + 16)]), 0.000000e+00f);
   }
 }
 </pre></div>
@@ -1579,7 +825,7 @@ In the example below we resume the status and do more 5 trials.</p>
 Get devices for measurement successfully!
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 5 minutes  36.615 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 5 minutes  33.874 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/e3e540f3b477c0c52d8eb73e674e8ffd/tune_conv2d_layer_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_conv2d_layer_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
index 45f15a2d9e..35d12c4639 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
@@ -915,7 +915,7 @@ so we can read the log file and load the best schedules.</p>
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-   7.8919       7.8903       7.8979       7.8877       0.0043
+   7.8687       7.8667       7.8751       7.8643       0.0046
 </pre></div>
 </div>
 </div>
@@ -937,7 +937,7 @@ to learn how to use the RPC Tracker and RPC Server.
 To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
 with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>.</p></li>
 </ol>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  3.601 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  0.605 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-cuda-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/eafe360d52540634c9eea0fa89e804bd/tune_network_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
index dd45bf8d90..db7b42951a 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
@@ -934,7 +934,7 @@ so we can read the log file and load the best schedules.</p>
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  754.9071     754.6821     755.7237     754.3153      0.5966
+  755.3878     755.7386     755.9539     754.4709      0.6543
 </pre></div>
 </div>
 </div>
@@ -956,7 +956,7 @@ to learn how to use the RPC Tracker and RPC Server.
 To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
 with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>.</p></li>
 </ol>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  32.239 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  31.040 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-x86-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/e416b94ca1090b0897c0f6e0df95b911/tune_network_x86.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_x86.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
index b7705dbc08..b7356c2f28 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
@@ -632,77 +632,75 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
              placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [128, 512], []),
              compute: Buffer(compute_2: Pointer(float32), float32, [128, 512], [])}
   buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute} {
-  for (i0.outer.i1.outer.fused: int32, 0, 16) &quot;parallel&quot; {
-    allocate(compute_3: Pointer(global float32), float32, [4096]), storage_scope = global {
-      for (i.outer.inner: int32, 0, 2) {
-        for (nb_j.inner: int32, 0, 2) {
-          for (i.inner.init: int32, 0, 64) {
-            let cse_var_1: int32 = (((i.outer.inner*2048) + (i.inner.init*32)) + (nb_j.inner*16))
-             {
-              compute_4: Buffer(compute_3, float32, [4096], [])[cse_var_1] = 0f32
-              compute_4[(cse_var_1 + 1)] = 0f32
-              compute_4[(cse_var_1 + 2)] = 0f32
-              compute_4[(cse_var_1 + 3)] = 0f32
-              compute_4[(cse_var_1 + 4)] = 0f32
-              compute_4[(cse_var_1 + 5)] = 0f32
-              compute_4[(cse_var_1 + 6)] = 0f32
-              compute_4[(cse_var_1 + 7)] = 0f32
-              compute_4[(cse_var_1 + 8)] = 0f32
-              compute_4[(cse_var_1 + 9)] = 0f32
-              compute_4[(cse_var_1 + 10)] = 0f32
-              compute_4[(cse_var_1 + 11)] = 0f32
-              compute_4[(cse_var_1 + 12)] = 0f32
-              compute_4[(cse_var_1 + 13)] = 0f32
-              compute_4[(cse_var_1 + 14)] = 0f32
-              compute_4[(cse_var_1 + 15)] = 0f32
-            }
+  for (i0.outer.i1.outer.fused: int32, 0, 64) &quot;parallel&quot; {
+    allocate(compute_3: Pointer(global float32), float32, [1024]), storage_scope = global {
+      for (nb_j.inner: int32, 0, 2) {
+        for (i.inner.init: int32, 0, 32) {
+          let cse_var_1: int32 = ((i.inner.init*32) + (nb_j.inner*16))
+           {
+            compute_4: Buffer(compute_3, float32, [1024], [])[cse_var_1] = 0f32
+            compute_4[(cse_var_1 + 1)] = 0f32
+            compute_4[(cse_var_1 + 2)] = 0f32
+            compute_4[(cse_var_1 + 3)] = 0f32
+            compute_4[(cse_var_1 + 4)] = 0f32
+            compute_4[(cse_var_1 + 5)] = 0f32
+            compute_4[(cse_var_1 + 6)] = 0f32
+            compute_4[(cse_var_1 + 7)] = 0f32
+            compute_4[(cse_var_1 + 8)] = 0f32
+            compute_4[(cse_var_1 + 9)] = 0f32
+            compute_4[(cse_var_1 + 10)] = 0f32
+            compute_4[(cse_var_1 + 11)] = 0f32
+            compute_4[(cse_var_1 + 12)] = 0f32
+            compute_4[(cse_var_1 + 13)] = 0f32
+            compute_4[(cse_var_1 + 14)] = 0f32
+            compute_4[(cse_var_1 + 15)] = 0f32
           }
-          for (elem_idx: int32, 0, let cse_var_2: int32 = ((i0.outer.i1.outer.fused*2) + nb_j.inner) in (placeholder_15: Buffer(placeholder_13, int32, [33], [])[(cse_var_2 + 1)] - placeholder_15[cse_var_2])) {
-            for (i.inner: int32, 0, 64) {
-              let cse_var_21: int32 = (elem_idx*16)
-              let cse_var_20: int32 = ((i0.outer.i1.outer.fused*2) + nb_j.inner)
-              let cse_var_19: int32 = ((i.outer.inner*16384) + (i.inner*256))
-              let cse_var_18: int32 = (((i.outer.inner*2048) + (i.inner*32)) + (nb_j.inner*16))
-              let cse_var_17: int32 = (cse_var_18 + 9)
-              let cse_var_16: int32 = (cse_var_18 + 8)
-              let cse_var_15: int32 = (cse_var_18 + 7)
-              let cse_var_14: int32 = (cse_var_18 + 6)
-              let cse_var_13: int32 = (cse_var_18 + 5)
-              let cse_var_12: int32 = (cse_var_18 + 4)
-              let cse_var_11: int32 = (cse_var_18 + 3)
-              let cse_var_10: int32 = (cse_var_18 + 2)
-              let cse_var_9: int32 = (cse_var_18 + 15)
-              let cse_var_8: int32 = (cse_var_18 + 14)
-              let cse_var_7: int32 = (cse_var_18 + 13)
-              let cse_var_6: int32 = (cse_var_18 + 12)
-              let cse_var_5: int32 = (cse_var_18 + 11)
-              let cse_var_4: int32 = (cse_var_18 + 10)
-              let cse_var_3: int32 = (cse_var_18 + 1)
-               {
-                compute_4[cse_var_18] = (compute_4[cse_var_18] + (placeholder_16: Buffer(placeholder_11, float32, [78656], [])[((placeholder_15[cse_var_20]*16) + cse_var_21)]*max(placeholder_17: Buffer(placeholder_10, float32, [32768], [])[(cse_var_19 + placeholder_18: Buffer(placeholder_12, int32, [4916], [])[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                compute_4[cse_var_3] = (compute_4[cse_var_3] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 1)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                compute_4[cse_var_10] = (compute_4[cse_var_10] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 2)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                compute_4[cse_var_11] = (compute_4[cse_var_11] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 3)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                compute_4[cse_var_12] = (compute_4[cse_var_12] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 4)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                compute_4[cse_var_13] = (compute_4[cse_var_13] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 5)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                compute_4[cse_var_14] = (compute_4[cse_var_14] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 6)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                compute_4[cse_var_15] = (compute_4[cse_var_15] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 7)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                compute_4[cse_var_16] = (compute_4[cse_var_16] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 8)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                compute_4[cse_var_17] = (compute_4[cse_var_17] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 9)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                compute_4[cse_var_4] = (compute_4[cse_var_4] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 10)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                compute_4[cse_var_5] = (compute_4[cse_var_5] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 11)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                compute_4[cse_var_6] = (compute_4[cse_var_6] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 12)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                compute_4[cse_var_7] = (compute_4[cse_var_7] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 13)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                compute_4[cse_var_8] = (compute_4[cse_var_8] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 14)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-                compute_4[cse_var_9] = (compute_4[cse_var_9] + (placeholder_16[(((placeholder_15[cse_var_20]*16) + cse_var_21) + 15)]*max(placeholder_17[(cse_var_19 + placeholder_18[(placeholder_15[cse_var_20] + elem_idx)])], 0f32)))
-              }
+        }
+        for (elem_idx: int32, 0, let cse_var_2: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) in (placeholder_15: Buffer(placeholder_13, int32, [33], [])[(cse_var_2 + 1)] - placeholder_15[cse_var_2])) {
+          for (i.inner: int32, 0, 32) {
+            let cse_var_21: int32 = (elem_idx*16)
+            let cse_var_20: int32 = ((i.inner*32) + (nb_j.inner*16))
+            let cse_var_19: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)
+            let cse_var_18: int32 = ((floordiv(i0.outer.i1.outer.fused, 16)*8192) + (i.inner*256))
+            let cse_var_17: int32 = (cse_var_20 + 9)
+            let cse_var_16: int32 = (cse_var_20 + 8)
+            let cse_var_15: int32 = (cse_var_20 + 7)
+            let cse_var_14: int32 = (cse_var_20 + 6)
+            let cse_var_13: int32 = (cse_var_20 + 5)
+            let cse_var_12: int32 = (cse_var_20 + 4)
+            let cse_var_11: int32 = (cse_var_20 + 3)
+            let cse_var_10: int32 = (cse_var_20 + 2)
+            let cse_var_9: int32 = (cse_var_20 + 15)
+            let cse_var_8: int32 = (cse_var_20 + 14)
+            let cse_var_7: int32 = (cse_var_20 + 13)
+            let cse_var_6: int32 = (cse_var_20 + 12)
+            let cse_var_5: int32 = (cse_var_20 + 11)
+            let cse_var_4: int32 = (cse_var_20 + 10)
+            let cse_var_3: int32 = (cse_var_20 + 1)
+             {
+              compute_4[cse_var_20] = (compute_4[cse_var_20] + (placeholder_16: Buffer(placeholder_11, float32, [78656], [])[((placeholder_15[cse_var_19]*16) + cse_var_21)]*max(placeholder_17: Buffer(placeholder_10, float32, [32768], [])[(cse_var_18 + placeholder_18: Buffer(placeholder_12, int32, [4916], [])[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+              compute_4[cse_var_3] = (compute_4[cse_var_3] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 1)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+              compute_4[cse_var_10] = (compute_4[cse_var_10] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 2)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+              compute_4[cse_var_11] = (compute_4[cse_var_11] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 3)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+              compute_4[cse_var_12] = (compute_4[cse_var_12] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 4)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+              compute_4[cse_var_13] = (compute_4[cse_var_13] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 5)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+              compute_4[cse_var_14] = (compute_4[cse_var_14] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 6)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+              compute_4[cse_var_15] = (compute_4[cse_var_15] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 7)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+              compute_4[cse_var_16] = (compute_4[cse_var_16] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 8)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+              compute_4[cse_var_17] = (compute_4[cse_var_17] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 9)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+              compute_4[cse_var_4] = (compute_4[cse_var_4] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 10)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+              compute_4[cse_var_5] = (compute_4[cse_var_5] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 11)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+              compute_4[cse_var_6] = (compute_4[cse_var_6] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 12)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+              compute_4[cse_var_7] = (compute_4[cse_var_7] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 13)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+              compute_4[cse_var_8] = (compute_4[cse_var_8] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 14)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
+              compute_4[cse_var_9] = (compute_4[cse_var_9] + (placeholder_16[(((placeholder_15[cse_var_19]*16) + cse_var_21) + 15)]*max(placeholder_17[(cse_var_18 + placeholder_18[(placeholder_15[cse_var_19] + elem_idx)])], 0f32)))
             }
           }
         }
       }
-      for (i0.inner: int32, 0, 128) {
+      for (i0.inner: int32, 0, 32) {
         for (i1.inner: int32, 0, 32) {
-          let cse_var_22: int32 = (((i0.inner*512) + (i0.outer.i1.outer.fused*32)) + i1.inner)
+          let cse_var_22: int32 = ((((floordiv(i0.outer.i1.outer.fused, 16)*16384) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32)) + i1.inner)
           compute_5: Buffer(compute_2, float32, [65536], [])[cse_var_22] = max((compute_4[((i0.inner*32) + i1.inner)] + placeholder_19: Buffer(placeholder_14, float32, [65536], [])[cse_var_22]), 0f32)
         }
       }
@@ -742,7 +740,7 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
 <span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.856 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.735 ms
 </pre></div>
 </div>
 <div class="admonition note">
diff --git a/docs/how_to/tune_with_autotvm/sg_execution_times.html b/docs/how_to/tune_with_autotvm/sg_execution_times.html
index d5be9e9d9d..f1bf6cb6bf 100644
--- a/docs/how_to/tune_with_autotvm/sg_execution_times.html
+++ b/docs/how_to/tune_with_autotvm/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-tune-with-autotvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:37.080</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
+<p><strong>00:37.316</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 84%" />
@@ -349,11 +349,11 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-conv2d-cuda-py"><span class="std std-ref">Tuning High Performance Convolution on NVIDIA GPUs</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_cuda.py</span></code>)</p></td>
-<td><p>00:37.044</p></td>
+<td><p>00:37.281</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_relay_x86.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-x86-py"><span class="std std-ref">Auto-tuning a Convolutional Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_x86.py</span></code>)</p></td>
-<td><p>00:00.021</p></td>
+<td><p>00:00.020</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_relay_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-cuda-py"><span class="std std-ref">Auto-tuning a Convolutional Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_cuda.py</span></code>)</p></td>
diff --git a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
index 5c4f99a035..45a051b150 100644
--- a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
+++ b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
@@ -689,7 +689,7 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 4, 2]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 2, 128]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9469753
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 256, 1, 2]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 256, 1]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,610123
 No: 2   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
@@ -812,161 +812,149 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 4, 8]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 1, 256]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,767289
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 32, 16, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 256, 2]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9939639
 No: 3   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
-  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 738, in __call__
-    yield remote, remote.load_module(os.path.split(build_result.filename)[1])
-  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 702, in run_through_rpc
-    costs = time_f(*args).results
-  File &quot;/workspace/python/tvm/runtime/module.py&quot;, line 357, in evaluator
-    blob = feval(*args)
+  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 142, in build
+    res = future.result()
+  File &quot;/usr/lib/python3.7/concurrent/futures/_base.py&quot;, line 435, in result
+    return self.__get_result()
+  File &quot;/usr/lib/python3.7/concurrent/futures/_base.py&quot;, line 384, in __get_result
+    raise self._exception
+  File &quot;/usr/lib/python3.7/concurrent/futures/thread.py&quot;, line 57, in run
+    result = self.fn(*self.args, **self.kwargs)
+  File &quot;/workspace/python/tvm/contrib/popen_pool.py&quot;, line 432, in &lt;lambda&gt;
+    worker = lambda *args: self._worker_run(*args)
+  File &quot;/workspace/python/tvm/contrib/popen_pool.py&quot;, line 401, in _worker_run
+    return proc.recv()
+  File &quot;/workspace/python/tvm/contrib/popen_pool.py&quot;, line 309, in recv
+    raise TimeoutError()
+TimeoutError
+
+        [(&#39;tile_f&#39;, [-1, 4, 1, 16]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 2, 16]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,4192266
+No: 4   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
+    func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
+    func = build(s, args, target_host=task.target_host, runtime=runtime)
+  File &quot;/workspace/python/tvm/driver/build_module.py&quot;, line 227, in build
+    input_mod = lower(inputs, args, name=name, binds=binds)
+  File &quot;/workspace/python/tvm/driver/build_module.py&quot;, line 134, in lower
+    return ffi.lower_schedule(inp, args, name, binds, simple_mode)
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
-  File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 262, in tvm._ffi._cy3.core.FuncCall
-  File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 251, in tvm._ffi._cy3.core.FuncCall3
+  File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 276, in tvm._ffi._cy3.core.FuncCall
   File &quot;tvm/_ffi/_cython/./base.pxi&quot;, line 181, in tvm._ffi._cy3.core.CHECK_CALL
 tvm._ffi.base.TVMError: Traceback (most recent call last):
-  4: TVMFuncCall
+  24: TVMFuncCall
         at ../src/runtime/c_runtime_api.cc:477
-  3: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
-        at ../include/tvm/runtime/packed_func.h:1217
-  2: tvm::runtime::RPCWrappedFunc::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
-        at ../src/runtime/rpc/rpc_module.cc:129
-  1: tvm::runtime::RPCClientSession::CallFunc(void*, TVMValue const*, int const*, int, std::function&lt;void (tvm::runtime::TVMArgs)&gt; const&amp;)
-        at ../src/runtime/rpc/rpc_endpoint.cc:1012
-  0: tvm::runtime::RPCEndpoint::CallFunc(void*, TVMValue const*, int const*, int, std::function&lt;void (tvm::runtime::TVMArgs)&gt;)
-        at ../src/runtime/rpc/rpc_endpoint.cc:804
-  File &quot;../src/runtime/rpc/rpc_endpoint.cc&quot;, line 804
-TVMError:
----------------------------------------------------------------
-An error occurred during the execution of TVM.
-For more information, please see: https://tvm.apache.org/docs/errors.html
----------------------------------------------------------------
-  Check failed: (code == RPCCode::kReturn) is false: code=kShutdown
-
-During handling of the above exception, another exception occurred:
-
-Traceback (most recent call last):
-  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 702, in run_through_rpc
-    costs = time_f(*args).results
-  File &quot;/usr/lib/python3.7/contextlib.py&quot;, line 130, in __exit__
-    self.gen.throw(type, value, traceback)
-  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 742, in __call__
-    remote.remove(build_result.filename)
-  File &quot;/workspace/python/tvm/rpc/client.py&quot;, line 144, in remove
-    self._remote_funcs[&quot;remove&quot;] = self.get_function(&quot;tvm.rpc.server.remove&quot;)
-  File &quot;/workspace/python/tvm/rpc/client.py&quot;, line 72, in get_function
-    return self._sess.get_function(name)
-  File &quot;/workspace/python/tvm/runtime/module.py&quot;, line 171, in get_function
-    self.handle, c_str(name), ctypes.c_int(query_imports), ctypes.byref(ret_handle)
-  File &quot;/workspace/python/tvm/_ffi/base.py&quot;, line 348, in check_call
-    raise get_last_ffi_error()
-tvm._ffi.base.TVMError: Traceback (most recent call last):
-  52: 0xffffffffffffffff
-  51: _start
-  50: __libc_start_main
-  49: _Py_UnixMain
-  48: 0x0000000000650da0
-  47: 0x0000000000650afa
-  46: _PyFunction_FastCallDict
-  45: _PyEval_EvalCodeWithName
-  44: _PyEval_EvalFrameDefault
-  43: _PyFunction_FastCallKeywords
-  42: _PyEval_EvalCodeWithName
-  41: _PyEval_EvalFrameDefault
-  40: _PyMethodDef_RawFastCallKeywords
-  39: 0x0000000000546369
-  38: _PyEval_EvalCodeWithName
-  37: _PyEval_EvalFrameDefault
-  36: _PyFunction_FastCallKeywords
-  35: _PyEval_EvalCodeWithName
-  34: _PyEval_EvalFrameDefault
-  33: _PyFunction_FastCallDict
-  32: _PyEval_EvalCodeWithName
-  31: _PyEval_EvalFrameDefault
-  30: _PyObject_FastCallDict
-  29: 0x00000000004c06e1
-  28: _PyFunction_FastCallDict
-  27: _PyEval_EvalFrameDefault
-  26: _PyMethodDescr_FastCallKeywords
-  25: 0x00000000005dcb58
-  24: 0x00000000005dc83f
-  23: 0x00000000004ba127
-  22: _PyEval_EvalFrameDefault
-  21: _PyFunction_FastCallKeywords
-  20: _PyEval_EvalFrameDefault
-  19: _PyFunction_FastCallKeywords
-  18: _PyEval_EvalFrameDefault
-  17: _PyFunction_FastCallKeywords
-  16: _PyEval_EvalCodeWithName
-  15: _PyEval_EvalFrameDefault
-  14: 0x0000000000537c30
-  13: _PyObject_FastCallKeywords
-  12: 0x00007fd33478ffa2
-  11: _ctypes_callproc
-  10: ffi_call
-  9: ffi_call_unix64
-  8: TVMModGetFunction
-        at ../src/runtime/c_runtime_api.cc:408
-  7: tvm::runtime::ModuleNode::GetFunction(std::__cxx11::basic_string&lt;char, std::char_traits&lt;char&gt;, std::allocator&lt;char&gt; &gt; const&amp;, bool)
-        at ../src/runtime/module.cc:66
-  6: tvm::runtime::RPCModuleNode::GetFunction(std::__cxx11::basic_string&lt;char, std::char_traits&lt;char&gt;, std::allocator&lt;char&gt; &gt; const&amp;, tvm::runtime::ObjectPtr&lt;tvm::runtime::Object&gt; const&amp;)
-        at ../src/runtime/rpc/rpc_module.cc:185
-  5: tvm::runtime::RPCClientSession::GetFunction(std::__cxx11::basic_string&lt;char, std::char_traits&lt;char&gt;, std::allocator&lt;char&gt; &gt; const&amp;)
-        at ../src/runtime/rpc/rpc_endpoint.cc:1007
-  4: tvm::runtime::TVMRetValue tvm::runtime::RPCEndpoint::SysCallRemote&lt;std::__cxx11::basic_string&lt;char, std::char_traits&lt;char&gt;, std::allocator&lt;char&gt; &gt; const&amp;&gt;(tvm::runtime::RPCCode, std::__cxx11::basic_string&lt;char, std::char_traits&lt;char&gt;, std::allocator&lt;char&gt; &gt; const&amp;)
-        at ../src/runtime/rpc/rpc_endpoint.h:223
-  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()&lt;int, std::__cxx11::basic_string&lt;char, std::char_traits&lt;char&gt;, std::allocator&lt;char&gt; &gt; const&amp;&gt;(int&amp;&amp;, std::__cxx11::basic_string&lt;char, std::char_traits&lt;char&gt;, std::allocator&lt;char&gt; &gt; const&amp;) const
+  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  22: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  21: operator()
+        at ../include/tvm/runtime/packed_func.h:1731
+  20: unpack_call&lt;tvm::IRModule, 5, tvm::&lt;lambda(tvm::te::Schedule, const tvm::runtime::Array&lt;tvm::runtime::ObjectRef&gt;&amp;, const tvm::runtime::String&amp;, const tvm::runtime::Map&lt;tvm::te::Tensor, tvm::tir::Buffer&gt;&amp;, bool)&gt; &gt;
+        at ../include/tvm/runtime/packed_func.h:1671
+  19: run&lt;&gt;
+        at ../include/tvm/runtime/packed_func.h:1631
+  18: run&lt;tvm::runtime::TVMMovableArgValueWithContext_&gt;
+        at ../include/tvm/runtime/packed_func.h:1631
+  17: run&lt;tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_&gt;
+        at ../include/tvm/runtime/packed_func.h:1631
+  16: run&lt;tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_&gt;
+        at ../include/tvm/runtime/packed_func.h:1631
+  15: run&lt;tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_&gt;
+        at ../include/tvm/runtime/packed_func.h:1631
+  14: run&lt;tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_&gt;
+        at ../include/tvm/runtime/packed_func.h:1646
+  13: operator()
+        at ../src/driver/driver_api.cc:388
+  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array&lt;tvm::runtime::ObjectRef, void&gt; const&amp;, std::__cxx11::basic_string&lt;char, std::char_traits&lt;char&gt;, std::allocator&lt;char&gt; &gt; const&amp;, std::unordered_map&lt;tvm::te::Tensor, tvm::tir::Buffer, std::hash&lt;tvm::te::Tensor&gt;, std::equal_to&lt;tvm::te::Tensor&gt;, std::allocator&lt;std::pair&lt;tvm::te::Tensor const, tvm::tir::Buffer&gt; &gt; &gt; const&amp;, tvm::GlobalVarSupply, bool)
+        at ../src/driver/driver_api.cc:374
+  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array&lt;tvm::transform::Pass, void&gt;)
+        at ../src/driver/driver_api.cc:269
+  10: tvm::transform::Pass::operator()(tvm::IRModule) const
+        at ../src/ir/transform.cc:258
+  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&amp;) const
+        at ../src/ir/transform.cc:274
+  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&amp;) const
+        at ../src/ir/transform.cc:453
+  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&amp;) const
+        at ../src/ir/transform.cc:274
+  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&amp;) const
+        at ../src/tir/ir/transform.cc:100
+  5: tvm::runtime::TypedPackedFunc&lt;tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)&gt;::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+        at ../include/tvm/runtime/packed_func.h:1750
+  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher&lt;tvm::tir::PrimFunc&gt;::run&lt;tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext&gt;(tvm::runtime::PackedFunc const&amp;, tvm::tir::PrimFunc&amp;&amp;, tvm::IRModule&amp;&amp;, tvm::transform::PassContext&amp;&amp;)
+        at ../include/tvm/runtime/packed_func.h:1694
+  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()&lt;tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext&gt;(tvm::tir::PrimFunc&amp;&amp;, tvm::IRModule&amp;&amp;, tvm::transform::PassContext&amp;&amp;) const
         at ../include/tvm/runtime/packed_func.h:1618
   2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
         at ../include/tvm/runtime/packed_func.h:1217
   1: Call
         at ../include/tvm/runtime/packed_func.h:1213
   0: operator()
-        at ../src/runtime/rpc/rpc_endpoint.cc:684
-  File &quot;../src/runtime/rpc/rpc_endpoint.cc&quot;, line 684
-TVMError:
----------------------------------------------------------------
-An error occurred during the execution of TVM.
-For more information, please see: https://tvm.apache.org/docs/errors.html
----------------------------------------------------------------
-  Check failed: (code == RPCCode::kReturn) is false: code=1
+        at ../src/runtime/c_runtime_api.cc:534
+  File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
+  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
+    raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
 
 Traceback (most recent call last):
-  52: 0xffffffffffffffff
-  51: _start
-  50: __libc_start_main
-  49: _Py_UnixMain
-  48: 0x0000000000650da0
-  47: 0x0000000000650afa
-  46: _PyFunction_FastCallDict
-  45: _PyEval_EvalCodeWithName
-  44: _PyEval_EvalFrameDefault
-  43: _PyFunction_FastCallKeywords
-  42: _PyEval_EvalCodeWithName
-  41: _PyEval_EvalFrameDefault
-  40: _PyMethodDef_RawFastCallKeywords
-  39: 0x0000000000546369
-  38: _PyEval_EvalCodeWithName
-  37: _PyEval_EvalFrameDefault
-  36: _PyFunction_FastCallKeywords
-  35: _PyEval_EvalCodeWithName
-  34: _PyEval_EvalFrameDefault
-  33: _PyFunction_FastCallDict
-  32: _PyEval_EvalCodeWithName
-  31: _PyEval_EvalFrameDefault
-  30: _PyObject_FastCallDict
-  29: 0x00000000004c06e1
-  28: _PyFunction_FastCallDict
-  27: _PyEval_EvalFrameDefault
-  26: _PyMethodDescr_FastCallKeywords
-  25: 0x00000000005dcb58
-  24: 0x00000000005dc83f
-  23: 0x00000000004ba127
-  22: _PyEval_EvalFrameDefault
-  21: _PyFunction_FastCallKeywords
-  20: _PyEval_EvalFrameDefault
-  19: _PyFunction_FastCall      [(&#39;tile_f&#39;, [-1, 128, 2, 1]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 2, 2]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,234317
-No: 4   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+  24: TVMFuncCall
+        at ../src/runtime/c_runtime_api.cc:477
+  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  22: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  21: operator()
+        at ../include/tvm/runtime/packed_func.h:1731
+  20: unpack_call&lt;tvm::IRModule, 5, tvm::&lt;lambda(tvm::te::Schedule, const tvm::runtime::Array&lt;tvm::runtime::ObjectRef&gt;&amp;, const tvm::runtime::String&amp;, const tvm::runtime::Map&lt;tvm::te::Tensor, tvm::tir::Buffer&gt;&amp;, bool)&gt; &gt;
+        at ../include/tvm/runtime/packed_func.h:1671
+  19: run&lt;&gt;
+        at ../include/tvm/runtime/packed_func.h:1631
+  18: run&lt;tvm::runtime::TVMMovableArgValueWithContext_&gt;
+        at ../include/tvm/runtime/packed_func.h:1631
+  17: run&lt;tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_&gt;
+        at ../include/tvm/runtime/packed_func.h:1631
+  16: run&lt;tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_&gt;
+        at ../include/tvm/runtime/packed_func.h:1631
+  15: run&lt;tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_&gt;
+        at ../include/tvm/runtime/packed_func.h:1631
+  14: run&lt;tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_&gt;
+        at ../include/tvm/runtime/packed_func.h:1646
+  13: operator()
+        at ../src/driver/driver_api.cc:388
+  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array&lt;tvm::runtime::ObjectRef, void&gt; const&amp;, std::__cxx11::basic_string&lt;char, std::char_traits&lt;char&gt;, std::allocator&lt;char&gt; &gt; const&amp;, std::unordered_map&lt;tvm::te::Tensor, tvm::tir::Buffer, std::hash&lt;tvm::te::Tensor&gt;, std::equal_to&lt;tvm::te::Tensor&gt;, std::allocator&lt;std::pair&lt;tvm::te::Tensor const, tvm::tir::Buffer&gt; &gt; &gt; const&amp;, tvm::GlobalVarSupply, bool)
+        at ../src/driver/driver_api.cc:374
+  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array&lt;tvm::transform::Pass, void&gt;)
+        at ../src/driver/driver_api.cc:269
+  10: tvm::transform::Pass::operator()(tvm::IRModule) const
+        at ../src/ir/transform.cc:258
+  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&amp;) const
+        at ../src/ir/transform.cc:274
+  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&amp;) const
+        at ../src/ir/transform.cc:453
+  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&amp;) const
+        at ../src/ir/transform.cc:274
+  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&amp;) const
+        at ../src/tir/ir/transform.cc:100
+  5: tvm::runtime::TypedPackedFunc&lt;tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)&gt;::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+        at ../include/tvm/runtime/packed_func.h:1750
+  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher&lt;tvm::tir::PrimFunc&gt;::run&lt;tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext&gt;(tvm::runtime::PackedFunc const&amp;, tvm::tir::PrimFunc&amp;&amp;, tvm::IRModule&amp;&amp;, tvm::transform::PassContext&amp;&amp;)
+        at ../include/tvm/runtime/packed_func.h:1694
+  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()&lt;tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext&gt;(tvm::tir::PrimFunc&amp;&amp;, tvm::IRModule&amp;&amp;, tvm::transform::PassContext&amp;&amp;) const
+        at ../include/tvm/runtime/packed_func.h:1618
+  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  1: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  0: operator()
+        at ../src/runtime/c_runtime_api.cc:534
+  File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
+  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
+    raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 32, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 1, 512]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9871662
+No: 5   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1088,8 +1076,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 8, 1, 8]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 1, 256]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 1)],None,8703999
-No: 5   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 2, 32]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 8, 64]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 1)],None,8690190
+No: 6   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1211,8 +1199,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 2, 8]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 1, 512]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,771685
-No: 6   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 1, 8]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 4, 64]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,4234258
+No: 7   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1334,9 +1322,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 64, 1, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 32, 8]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,5535966
-No: 7   GFLOPS: 7.23/7.23       result: MeasureResult(costs=(0.032004703499999995,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.247319459915161, timestamp=1668621631.5396304)        [(&#39;tile_f&#39;, [-1, 4, 4, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 16, 2]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,1212537
-No: 8   GFLOPS: 0.00/7.23       result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 32, 16]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 128, 2]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,6451904
+No: 8   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1458,9 +1445,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 16, 4, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 16, 32]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9641963
-No: 9   GFLOPS: 920.13/920.13   result: MeasureResult(costs=(0.0002515955407407407,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.322073221206665, timestamp=1668621633.0297952)       [(&#39;tile_f&#39;, [-1, 2, 4, 2]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 8, 1]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9305633
-No: 10  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 16, 16]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 1, 128]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,4628103
+No: 9   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1582,8 +1568,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 16, 32]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 8, 64]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2687499
-No: 11  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 16, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 4]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,5689328
+No: 10  GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1705,8 +1691,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 8, 8, 8]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 1, 128]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,10240277
-No: 12  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 16, 1, 4]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 128, 4]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2611284
+No: 11  GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1828,10 +1814,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 2, 32]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 2, 8]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,6876512
-No: 13  GFLOPS: 0.83/920.13     result: MeasureResult(costs=(0.27770135475,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.8979127407073975, timestamp=1668621642.613656)       [(&#39;tile_f&#39;, [-1, 8, 1, 32]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 16, 1]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,1757548
-No: 14  GFLOPS: 3.64/920.13     result: MeasureResult(costs=(0.0636859435,), error_no=MeasureErrorNo.NO_ERROR, all_cost=6.455172538757324, timestamp=1668621644.0596006)        [(&#39;tile_f&#39;, [-1, 2, 8, 8]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 1, 1]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,10262495
-No: 15  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 8, 16, 1]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 32, 1]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9699617
+No: 12  GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -1953,8 +1937,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 1, 32]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 32]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,6344106
-No: 16  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 8, 4, 1]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 8, 16]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2453682
+No: 13  GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -2076,8 +2060,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 2, 256]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 16, 16]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,329338
-No: 17  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 16, 2, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 128, 1]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9125614
+No: 14  GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -2199,8 +2183,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 4, 1]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 2, 64]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,6744780
-No: 18  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 8, 16]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 16, 4]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,3179839
+No: 15  GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -2322,8 +2306,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 8, 16, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 256, 1]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,4675477
-No: 19  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 16, 1, 32]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 4, 4]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2013189
+No: 16  GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -2445,8 +2429,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 16, 8]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 4, 4]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9950760
-No: 20  GFLOPS: 0.00/920.13     result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 16, 1, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 32]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,6925484
+No: 17  GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 588, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 540, in _build_func_common
@@ -2568,7 +2552,10 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 871, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 4, 8]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 128]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2310591
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 32, 4]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 8, 16]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,5745872
+No: 18  GFLOPS: 119.53/119.53   result: MeasureResult(costs=(0.0019367525892857143,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.878584861755371, timestamp=1668632874.2540498)       [(&#39;tile_f&#39;, [-1, 2, 64, 1]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 1, 16]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,3604746
+No: 19  GFLOPS: 133.82/133.82   result: MeasureResult(costs=(0.001729959672413793,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.642688274383545, timestamp=1668632874.896128) [(&#39;tile_f&#39;, [-1, 16, 4, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 2, 1]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9684423
+No: 20  GFLOPS: 63.65/133.82    result: MeasureResult(costs=(0.0036371566071428567,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.447140693664551, timestamp=1668632875.603516)        [(&#39;tile_f&#39;, [-1, 4, 1, 8]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 8, 1]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,3882918
 </pre></div>
 </div>
 <p>Finally we can inspect the best config from log file, check correctness,
@@ -2607,9 +2594,9 @@ and measure running time.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Finish loading 20 records
 
 Best config:
-[(&#39;tile_f&#39;, [-1, 2, 4, 2]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 8, 1]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9305633
+[(&#39;tile_f&#39;, [-1, 16, 4, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 2, 1]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9684423
 Finish loading 20 records
-Time cost of this operator: 0.000535
+Time cost of this operator: 0.002187
 </pre></div>
 </div>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autotvm-tune-conv2d-cuda-py">
diff --git a/docs/how_to/work_with_microtvm/micro_autotune.html b/docs/how_to/work_with_microtvm/micro_autotune.html
index c7224458e2..11d9648608 100644
--- a/docs/how_to/work_with_microtvm/micro_autotune.html
+++ b/docs/how_to/work_with_microtvm/micro_autotune.html
@@ -596,10 +596,10 @@ the tuned operator.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>########## Build without Autotuning ##########
 Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  Measurements(us)
 ---------                                     ---                                           --------  -------  -----              ------  -------  ----------------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  313.7     98.74    (1, 2, 10, 10, 3)  2       1        [313.7]
-tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.017     0.95     (1, 6, 10, 10)     1       1        [3.017]
-tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.987     0.311    (1, 1, 10, 10, 3)  1       1        [0.987]
-Total_time                                    -                                             317.704   -        -                  -       -        -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  310.4     98.711   (1, 2, 10, 10, 3)  2       1        [310.4]
+tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.08      0.98     (1, 6, 10, 10)     1       1        [3.08]
+tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.973     0.309    (1, 1, 10, 10, 3)  1       1        [0.973]
+Total_time                                    -                                             314.453   -        -                  -       -        -
 </pre></div>
 </div>
 </div>
@@ -650,10 +650,10 @@ Total_time                                    -
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>########## Build with Autotuning ##########
 Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  Measurements(us)
 ---------                                     ---                                           --------  -------  -----              ------  -------  ----------------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  100.4     97.289   (1, 6, 10, 10, 1)  2       1        [100.4]
-tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.811     1.755    (1, 6, 10, 10)     1       1        [1.811]
-tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.986     0.956    (1, 1, 10, 10, 3)  1       1        [0.986]
-Total_time                                    -                                             103.197   -        -                  -       -        -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  121.3     97.76    (1, 6, 10, 10, 1)  2       1        [121.3]
+tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.82      1.467    (1, 6, 10, 10)     1       1        [1.82]
+tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.959     0.773    (1, 1, 10, 10, 3)  1       1        [0.959]
+Total_time                                    -                                             124.079   -        -                  -       -        -
 </pre></div>
 </div>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-autotune-py">
diff --git a/docs/how_to/work_with_microtvm/micro_pytorch.html b/docs/how_to/work_with_microtvm/micro_pytorch.html
index 86db81c8e0..b864496078 100644
--- a/docs/how_to/work_with_microtvm/micro_pytorch.html
+++ b/docs/how_to/work_with_microtvm/micro_pytorch.html
@@ -440,7 +440,8 @@ download a cat image and preprocess it to use as the model input.</p>
 Downloading: &quot;https://download.pytorch.org/models/quantized/mobilenet_v2_qnnpack_37f702c5.pth&quot; to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2_qnnpack_37f702c5.pth
 
   0%|          | 0.00/3.42M [00:00&lt;?, ?B/s]
-100%|##########| 3.42M/3.42M [00:00&lt;00:00, 85.4MB/s]
+ 72%|#######1  | 2.46M/3.42M [00:00&lt;00:00, 23.5MB/s]
+100%|##########| 3.42M/3.42M [00:00&lt;00:00, 30.7MB/s]
 /workspace/python/tvm/relay/frontend/pytorch_utils.py:47: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
   return LooseVersion(torch_ver) &gt; ver
 /venv/apache-tvm-py3.7/lib/python3.7/site-packages/setuptools/_distutils/version.py:346: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
@@ -564,7 +565,7 @@ via the host <cite>main.cc`</cite> or if a Zephyr emulated board is selected as
 Torch top-1 id: 282, class name: tiger cat
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  1.947 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  2.708 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-pytorch-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/12b9ecc04c41abaa12022061771821d1/micro_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">micro_pytorch.py</span></code></a></p>
diff --git a/docs/how_to/work_with_microtvm/micro_train.html b/docs/how_to/work_with_microtvm/micro_train.html
index 1809824c06..53edd4ddaa 100644
--- a/docs/how_to/work_with_microtvm/micro_train.html
+++ b/docs/how_to/work_with_microtvm/micro_train.html
@@ -530,7 +530,7 @@ take about <strong>2 minutes</strong> to download the Stanford Cars, while COCO
 <a href="https://docs.python.org/3/library/shutil.html#shutil.move" title="shutil.move" class="sphx-glr-backref-module-shutil sphx-glr-backref-type-py-function"><span class="n">shutil</span><span class="o">.</span><span class="n">move</span></a><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><a href="https://docs.python.org/3/library/stdtypes.html#str" title="builtins.str" class="sphx-glr-backref-module-builtins sphx-glr-backref-typ [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>&#39;/tmp/tmpl8_9f7vj/images/random&#39;
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>&#39;/tmp/tmp8f79bv0e/images/random&#39;
 </pre></div>
 </div>
 </div>
@@ -590,8 +590,8 @@ objects to other stuff? We can display some examples from our datasets using <co
     <span class="n">plt</span><span class="o">.</span><span class="n">axis</span><span class="p">(</span><span class="s2">&quot;off&quot;</span><span class="p">)</span>
 </pre></div>
 </div>
-<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmpl8_9f7vj/images/target contains 8144 images
-/tmp/tmpl8_9f7vj/images/random contains 5000 images
+<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmp8f79bv0e/images/target contains 8144 images
+/tmp/tmp8f79bv0e/images/random contains 5000 images
 </pre></div>
 </div>
 </div>
@@ -703,13 +703,13 @@ the time on our validation set).</p>
 </pre></div>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Epoch 1/3
-328/328 - 47s - loss: 0.2049 - accuracy: 0.9255 - val_loss: 0.1223 - val_accuracy: 0.9622 - 47s/epoch - 143ms/step
+328/328 - 46s - loss: 0.2469 - accuracy: 0.9177 - val_loss: 0.1360 - val_accuracy: 0.9517 - 46s/epoch - 141ms/step
 Epoch 2/3
-328/328 - 43s - loss: 0.0911 - accuracy: 0.9663 - val_loss: 0.0991 - val_accuracy: 0.9690 - 43s/epoch - 131ms/step
+328/328 - 43s - loss: 0.1019 - accuracy: 0.9630 - val_loss: 0.1356 - val_accuracy: 0.9547 - 43s/epoch - 131ms/step
 Epoch 3/3
-328/328 - 43s - loss: 0.0602 - accuracy: 0.9778 - val_loss: 0.1140 - val_accuracy: 0.9630 - 43s/epoch - 131ms/step
+328/328 - 43s - loss: 0.0745 - accuracy: 0.9730 - val_loss: 0.1347 - val_accuracy: 0.9569 - 43s/epoch - 131ms/step
 
-&lt;keras.callbacks.History object at 0x7f5326724510&gt;
+&lt;keras.callbacks.History object at 0x7f69b7684c90&gt;
 </pre></div>
 </div>
 </div>
@@ -971,7 +971,7 @@ as intended.</p>
 <p>From here, we could modify the model to read live images from the camera - we have another
 Arduino tutorial for how to do that <a class="reference external" href="https://github.com/guberti/tvm-arduino-demos/tree/master/examples/person_detection">on GitHub</a>. Alternatively, we could also
 <a class="reference external" href="https://tvm.apache.org/docs/how_to/work_with_microtvm/micro_autotune.html">use TVM’s autotuning capabilities</a> to dramatically improve the model’s performance.</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 4 minutes  38.559 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 4 minutes  39.935 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-train-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/b52cec46baf4f78d6bcd94cbe269c8a6/micro_train.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">micro_train.py</span></code></a></p>
diff --git a/docs/how_to/work_with_microtvm/sg_execution_times.html b/docs/how_to/work_with_microtvm/sg_execution_times.html
index 427f614597..44fe66c46e 100644
--- a/docs/how_to/work_with_microtvm/sg_execution_times.html
+++ b/docs/how_to/work_with_microtvm/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-work-with-microtvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>06:41.491</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
+<p><strong>06:44.796</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 83%" />
@@ -349,27 +349,27 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="micro_train.html#sphx-glr-how-to-work-with-microtvm-micro-train-py"><span class="std std-ref">Training Vision Models for microTVM on Arduino</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_train.py</span></code>)</p></td>
-<td><p>04:38.559</p></td>
+<td><p>04:39.935</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="micro_pytorch.html#sphx-glr-how-to-work-with-microtvm-micro-pytorch-py"><span class="std std-ref">microTVM PyTorch Tutorial</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_pytorch.py</span></code>)</p></td>
-<td><p>01:01.947</p></td>
+<td><p>01:02.708</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="micro_autotune.html#sphx-glr-how-to-work-with-microtvm-micro-autotune-py"><span class="std std-ref">Autotuning with microTVM</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_autotune.py</span></code>)</p></td>
-<td><p>00:49.192</p></td>
+<td><p>00:49.942</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="micro_aot.html#sphx-glr-how-to-work-with-microtvm-micro-aot-py"><span class="std std-ref">microTVM Host-Driven AoT</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_aot.py</span></code>)</p></td>
-<td><p>00:08.118</p></td>
+<td><p>00:08.359</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="micro_tflite.html#sphx-glr-how-to-work-with-microtvm-micro-tflite-py"><span class="std std-ref">microTVM with TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_tflite.py</span></code>)</p></td>
-<td><p>00:03.672</p></td>
+<td><p>00:03.850</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="micro_reference_vm.html#sphx-glr-how-to-work-with-microtvm-micro-reference-vm-py"><span class="std std-ref">microTVM Reference Virtual Machines</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_reference_vm.py</span></code>)</p></td>
-<td><p>00:00.001</p></td>
+<td><p>00:00.002</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="micro_ethosu.html#sphx-glr-how-to-work-with-microtvm-micro-ethosu-py"><span class="std std-ref">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_ethosu.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_relay/sg_execution_times.html b/docs/how_to/work_with_relay/sg_execution_times.html
index 1e222cd385..13133e4e10 100644
--- a/docs/how_to/work_with_relay/sg_execution_times.html
+++ b/docs/how_to/work_with_relay/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-work-with-relay-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:42.944</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
+<p><strong>00:43.710</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 84%" />
@@ -349,15 +349,15 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="using_pipeline_executor.html#sphx-glr-how-to-work-with-relay-using-pipeline-executor-py"><span class="std std-ref">Using Pipeline Executor in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_pipeline_executor.py</span></code>)</p></td>
-<td><p>00:31.089</p></td>
+<td><p>00:32.016</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="using_external_lib.html#sphx-glr-how-to-work-with-relay-using-external-lib-py"><span class="std std-ref">Using External Libraries in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_external_lib.py</span></code>)</p></td>
-<td><p>00:10.213</p></td>
+<td><p>00:10.162</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="build_gcn.html#sphx-glr-how-to-work-with-relay-build-gcn-py"><span class="std std-ref">Building a Graph Convolutional Network</span></a> (<code class="docutils literal notranslate"><span class="pre">build_gcn.py</span></code>)</p></td>
-<td><p>00:01.636</p></td>
+<td><p>00:01.526</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="using_relay_viz.html#sphx-glr-how-to-work-with-relay-using-relay-viz-py"><span class="std std-ref">Use Relay Visualizer to Visualize Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_relay_viz.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_schedules/intrin_math.html b/docs/how_to/work_with_schedules/intrin_math.html
index 192d20df4d..2875599715 100644
--- a/docs/how_to/work_with_schedules/intrin_math.html
+++ b/docs/how_to/work_with_schedules/intrin_math.html
@@ -535,7 +535,7 @@ The following example customizes CUDA lowering rule for <code class="code docuti
 <a href="../../reference/api/python/ir.html#tvm.ir.register_intrin_lowering" title="tvm.ir.register_intrin_lowering" class="sphx-glr-backref-module-tvm-ir sphx-glr-backref-type-py-function"><span class="n">register_intrin_lowering</span></a><span class="p">(</span><span class="s2">&quot;tir.exp&quot;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&quot;cuda&quot;</span><span class="p">,</span> <span class="n">f</span><span class="o">= [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>&lt;function my_cuda_math_rule at 0x7f533893d200&gt;
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>&lt;function my_cuda_math_rule at 0x7f6a15288cb0&gt;
 </pre></div>
 </div>
 <p>Register the rule to TVM with override option to override existing rule.
diff --git a/docs/how_to/work_with_schedules/sg_execution_times.html b/docs/how_to/work_with_schedules/sg_execution_times.html
index 6bb7c66e09..c4e524bb94 100644
--- a/docs/how_to/work_with_schedules/sg_execution_times.html
+++ b/docs/how_to/work_with_schedules/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-work-with-schedules-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:06.268</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
+<p><strong>00:07.284</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 83%" />
@@ -349,23 +349,23 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="intrin_math.html#sphx-glr-how-to-work-with-schedules-intrin-math-py"><span class="std std-ref">Intrinsics and Math Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">intrin_math.py</span></code>)</p></td>
-<td><p>00:04.007</p></td>
+<td><p>00:04.902</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tensorize.html#sphx-glr-how-to-work-with-schedules-tensorize-py"><span class="std std-ref">Use Tensorize to Leverage Hardware Intrinsics</span></a> (<code class="docutils literal notranslate"><span class="pre">tensorize.py</span></code>)</p></td>
-<td><p>00:00.978</p></td>
+<td><p>00:01.041</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="reduction.html#sphx-glr-how-to-work-with-schedules-reduction-py"><span class="std std-ref">Reduction</span></a> (<code class="docutils literal notranslate"><span class="pre">reduction.py</span></code>)</p></td>
-<td><p>00:00.546</p></td>
+<td><p>00:00.576</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="scan.html#sphx-glr-how-to-work-with-schedules-scan-py"><span class="std std-ref">Scan and Recurrent Kernel</span></a> (<code class="docutils literal notranslate"><span class="pre">scan.py</span></code>)</p></td>
-<td><p>00:00.529</p></td>
+<td><p>00:00.556</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="extern_op.html#sphx-glr-how-to-work-with-schedules-extern-op-py"><span class="std std-ref">External Tensor Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">extern_op.py</span></code>)</p></td>
-<td><p>00:00.113</p></td>
+<td><p>00:00.114</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="schedule_primitives.html#sphx-glr-how-to-work-with-schedules-schedule-primitives-py"><span class="std std-ref">Schedule Primitives in TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">schedule_primitives.py</span></code>)</p></td>
@@ -377,7 +377,7 @@
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tuple_inputs.html#sphx-glr-how-to-work-with-schedules-tuple-inputs-py"><span class="std std-ref">Compute and Reduce with Tuple Inputs</span></a> (<code class="docutils literal notranslate"><span class="pre">tuple_inputs.py</span></code>)</p></td>
-<td><p>00:00.019</p></td>
+<td><p>00:00.018</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/work_with_schedules/tensorize.html b/docs/how_to/work_with_schedules/tensorize.html
index f4ca22b604..4c5a36d830 100644
--- a/docs/how_to/work_with_schedules/tensorize.html
+++ b/docs/how_to/work_with_schedules/tensorize.html
@@ -586,7 +586,7 @@ The importing needs to happen before the tensorized GEMV being executed.</p>
              B: Buffer(B_2: Pointer(float32), float32, [512, 64], []),
              C: Buffer(C_2: Pointer(float32), float32, [1024, 512], [])}
   buffer_map = {A_1: A, B_1: B, C_1: C} {
-  attr [IterVar(i: int32, (nullptr), &quot;DataPar&quot;, &quot;&quot;)] &quot;pragma_import_llvm&quot; = &quot;; ModuleID = &#39;/tmp/tmp8kh4i1du/input0.cc&#39;\nsource_filename = \&quot;/tmp/tmp8kh4i1du/input0.cc\&quot;\ntarget datalayout = \&quot;e-m:e-i64:64-f80:128-n8:16:32:64-S128\&quot;\ntarget triple = \&quot;x86_64-pc-linux-gnu\&quot;\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = allo [...]
+  attr [IterVar(i: int32, (nullptr), &quot;DataPar&quot;, &quot;&quot;)] &quot;pragma_import_llvm&quot; = &quot;; ModuleID = &#39;/tmp/tmp_0ju_8iw/input0.cc&#39;\nsource_filename = \&quot;/tmp/tmp_0ju_8iw/input0.cc\&quot;\ntarget datalayout = \&quot;e-m:e-i64:64-f80:128-n8:16:32:64-S128\&quot;\ntarget triple = \&quot;x86_64-pc-linux-gnu\&quot;\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = allo [...]
   for (i, 0, 1024) {
     for (j.outer: int32, 0, 32) {
       @tir.call_extern(&quot;gemv_update&quot;, @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/install/nnpack.html b/docs/install/nnpack.html
index 23d2181e9d..1ef28de467 100644
--- a/docs/install/nnpack.html
+++ b/docs/install/nnpack.html
@@ -229,17 +229,7 @@
               <p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
 <ul class="current">
 <li class="toctree-l1 current"><a class="reference internal" href="index.html">Installing TVM</a><ul class="current">
-<li class="toctree-l2 current"><a class="reference internal" href="from_source.html">Install from Source</a><ul class="current">
-<li class="toctree-l3"><a class="reference internal" href="from_source.html#developers-get-source-from-github">Developers: Get Source from Github</a></li>
-<li class="toctree-l3"><a class="reference internal" href="from_source.html#build-the-shared-library">Build the Shared Library</a></li>
-<li class="toctree-l3"><a class="reference internal" href="from_source.html#python-package-installation">Python Package Installation</a></li>
-<li class="toctree-l3 current"><a class="reference internal" href="from_source.html#install-contrib-libraries">Install Contrib Libraries</a><ul class="current">
-<li class="toctree-l4 current"><a class="current reference internal" href="#">NNPACK Contrib Installation</a></li>
-</ul>
-</li>
-<li class="toctree-l3"><a class="reference internal" href="from_source.html#enable-c-tests">Enable C++ Tests</a></li>
-</ul>
-</li>
+<li class="toctree-l2"><a class="reference internal" href="from_source.html">Install from Source</a></li>
 <li class="toctree-l2"><a class="reference internal" href="docker.html">Docker Images</a></li>
 <li class="toctree-l2 current"><a class="current reference internal" href="#">NNPACK Contrib Installation</a><ul>
 <li class="toctree-l3"><a class="reference internal" href="#conditions">Conditions</a></li>
diff --git a/docs/reference/api/doxygen/bias__add_8h__incl.svg b/docs/reference/api/doxygen/bias__add_8h__incl.svg
index 78f0eec9a5..640a135645 100644
--- a/docs/reference/api/doxygen/bias__add_8h__incl.svg
+++ b/docs/reference/api/doxygen/bias__add_8h__incl.svg
@@ -58,33 +58,33 @@
 <path fill="none" stroke="#191970" d="M1391.2008,-1063.1389C1391.2008,-1041.9692 1391.2008,-1003.8174 1391.2008,-980.6112"/>
 <polygon fill="#191970" stroke="#191970" points="1394.7009,-980.5594 1391.2008,-970.5595 1387.7009,-980.5595 1394.7009,-980.5594"/>
 </g>
-<!-- Node91 -->
+<!-- Node92 -->
 <g id="node41" class="node">
-<title>Node91</title>
+<title>Node92</title>
 <g id="a_node41"><a xlink:href="tags_8h.html" target="_top" xlink:title="External function interface to rocBLAS libraries. ">
 <polygon fill="#ffffff" stroke="#000000" points="3034.7008,-890 3034.7008,-909 3125.7008,-909 3125.7008,-890 3034.7008,-890"/>
 <text text-anchor="middle" x="3080.2008" y="-897" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/tags.h</text>
 </a>
 </g>
 </g>
-<!-- Node0&#45;&gt;Node91 -->
+<!-- Node0&#45;&gt;Node92 -->
 <g id="edge159" class="edge">
-<title>Node0&#45;&gt;Node91</title>
+<title>Node0&#45;&gt;Node92</title>
 <path fill="none" stroke="#191970" d="M1446.7447,-1078.2247C1719.8698,-1076.4317 2912.7041,-1063.3317 3047.2008,-971 3064.4906,-959.1306 3072.9087,-935.8133 3076.8792,-919.0293"/>
 <polygon fill="#191970" stroke="#191970" points="3080.3293,-919.6265 3078.8946,-909.1293 3073.47,-918.2301 3080.3293,-919.6265"/>
 </g>
-<!-- Node92 -->
+<!-- Node93 -->
 <g id="node42" class="node">
-<title>Node92</title>
+<title>Node93</title>
 <g id="a_node42"><a xlink:href="topi_2transform_8h.html" target="_top" xlink:title="Transform op constructors. ">
 <polygon fill="#ffffff" stroke="#000000" points="631.2008,-1007.5 631.2008,-1026.5 749.2008,-1026.5 749.2008,-1007.5 631.2008,-1007.5"/>
 <text text-anchor="middle" x="690.2008" y="-1014.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/transform.h</text>
 </a>
 </g>
 </g>
-<!-- Node0&#45;&gt;Node92 -->
+<!-- Node0&#45;&gt;Node93 -->
 <g id="edge160" class="edge">
-<title>Node0&#45;&gt;Node92</title>
+<title>Node0&#45;&gt;Node93</title>
 <path fill="none" stroke="#191970" d="M1335.6085,-1073.6228C1208.7647,-1062.4946 898.2577,-1035.2532 759.3439,-1023.066"/>
 <polygon fill="#191970" stroke="#191970" points="759.5105,-1019.5673 749.2429,-1022.1799 758.8987,-1026.5405 759.5105,-1019.5673"/>
 </g>
@@ -1215,9 +1215,9 @@
 <path fill="none" stroke="#191970" d="M1389.6224,-951.3416C1386.4083,-931.6745 1379.0129,-886.4221 1374.3319,-857.7784"/>
 <polygon fill="#191970" stroke="#191970" points="1377.7522,-857.0062 1372.6851,-847.7016 1370.8438,-858.1352 1377.7522,-857.0062"/>
 </g>
-<!-- Node81&#45;&gt;Node91 -->
+<!-- Node81&#45;&gt;Node92 -->
 <g id="edge155" class="edge">
-<title>Node81&#45;&gt;Node91</title>
+<title>Node81&#45;&gt;Node92</title>
 <path fill="none" stroke="#191970" d="M1450.5016,-960.3026C1701.3368,-957.1957 2676.28,-943.4972 2982.2008,-915 2995.8681,-913.7269 3010.5224,-911.7445 3024.2249,-909.6177"/>
 <polygon fill="#191970" stroke="#191970" points="3025.1659,-913.0116 3034.4857,-907.9726 3024.0577,-906.0999 3025.1659,-913.0116"/>
 </g>
@@ -1245,15 +1245,15 @@
 <path fill="none" stroke="#191970" d="M1226.2711,-889.9005C1251.4315,-879.8663 1291.5599,-863.8627 1323.0611,-851.2997"/>
 <polygon fill="#191970" stroke="#191970" points="1324.5422,-854.4772 1332.5343,-847.5218 1321.9491,-847.9752 1324.5422,-854.4772"/>
 </g>
-<!-- Node90 -->
+<!-- Node91 -->
 <g id="node40" class="node">
-<title>Node90</title>
+<title>Node91</title>
 <polygon fill="#ffffff" stroke="#bfbfbf" points="1230.7008,-823 1230.7008,-842 1277.7008,-842 1277.7008,-823 1230.7008,-823"/>
 <text text-anchor="middle" x="1254.2008" y="-830" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">deque</text>
 </g>
-<!-- Node82&#45;&gt;Node90 -->
+<!-- Node82&#45;&gt;Node91 -->
 <g id="edge152" class="edge">
-<title>Node82&#45;&gt;Node90</title>
+<title>Node82&#45;&gt;Node91</title>
 <path fill="none" stroke="#191970" d="M1209.6511,-889.9005C1217.6828,-879.552 1230.6423,-862.8542 1240.5182,-850.1295"/>
 <polygon fill="#191970" stroke="#191970" points="1243.3979,-852.1275 1246.7643,-842.0817 1237.868,-847.8356 1243.3979,-852.1275"/>
 </g>
@@ -1320,90 +1320,90 @@
 <path fill="none" stroke="#191970" d="M2177.0555,-542.5563C2154.9648,-540.7109 2128.8191,-538.6256 2105.2008,-537 1816.8035,-517.1508 1470.6263,-499.7608 1341.2906,-493.4873"/>
 <polygon fill="#191970" stroke="#191970" points="1341.423,-489.9897 1331.2655,-493.0022 1341.0846,-496.9815 1341.423,-489.9897"/>
 </g>
-<!-- Node91&#45;&gt;Node17 -->
+<!-- Node92&#45;&gt;Node17 -->
 <g id="edge156" class="edge">
-<title>Node91&#45;&gt;Node17</title>
+<title>Node92&#45;&gt;Node17</title>
 <path fill="none" stroke="#191970" d="M3079.0609,-889.8443C3076.6391,-868.4659 3071.2008,-815.5037 3071.2008,-771 3071.2008,-771 3071.2008,-771 3071.2008,-138.5 3071.2008,-101.6453 3059.8642,-87.4458 3029.2008,-67 2973.2118,-29.6676 2766.7962,-19.1069 2686.4767,-16.376"/>
 <polygon fill="#191970" stroke="#191970" points="2686.553,-12.8768 2676.4458,-16.0538 2686.3282,-19.8732 2686.553,-12.8768"/>
 </g>
-<!-- Node92&#45;&gt;Node1 -->
+<!-- Node93&#45;&gt;Node1 -->
 <g id="edge161" class="edge">
-<title>Node92&#45;&gt;Node1</title>
+<title>Node93&#45;&gt;Node1</title>
 <path fill="none" stroke="#191970" d="M732.5566,-1007.4235C753.9991,-1000.5726 778.8761,-989.241 795.2008,-971 843.0109,-917.5777 787.745,-862.4358 843.2008,-817 871.6755,-793.6702 1104.2647,-779.485 1219.2835,-773.8319"/>
 <polygon fill="#191970" stroke="#191970" points="1219.5487,-777.3232 1229.3676,-773.3433 1219.2099,-770.3314 1219.5487,-777.3232"/>
 </g>
-<!-- Node92&#45;&gt;Node17 -->
+<!-- Node93&#45;&gt;Node17 -->
 <g id="edge192" class="edge">
-<title>Node92&#45;&gt;Node17</title>
+<title>Node93&#45;&gt;Node17</title>
 <path fill="none" stroke="#191970" d="M749.5788,-1015.5706C1039.8962,-1008.3408 2311.7392,-973.8169 2703.2008,-915 2854.229,-892.3081 3015.2008,-923.7234 3015.2008,-771 3015.2008,-771 3015.2008,-771 3015.2008,-138.5 3015.2008,-98.9438 2995.938,-87.6517 2962.2008,-67 2916.2248,-38.8565 2755.8235,-23.3607 2686.5114,-17.8454"/>
 <polygon fill="#191970" stroke="#191970" points="2686.4416,-14.3296 2676.2008,-17.0455 2685.9001,-21.3086 2686.4416,-14.3296"/>
 </g>
-<!-- Node92&#45;&gt;Node21 -->
+<!-- Node93&#45;&gt;Node21 -->
 <g id="edge194" class="edge">
-<title>Node92&#45;&gt;Node21</title>
+<title>Node93&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M631.134,-1014.9918C489.8177,-1009.8268 140.4319,-994.7942 93.2008,-971 55.6372,-952.0761 31.2008,-941.5611 31.2008,-899.5 31.2008,-899.5 31.2008,-899.5 31.2008,-267 31.2008,-187.5302 69.5043,-164.6251 137.2008,-123 176.9989,-98.529 313.4583,-84.753 377.5031,-79.4982"/>
 <polygon fill="#191970" stroke="#191970" points="377.8074,-82.9851 387.4967,-78.6994 377.2496,-76.0074 377.8074,-82.9851"/>
 </g>
-<!-- Node92&#45;&gt;Node27 -->
+<!-- Node93&#45;&gt;Node27 -->
 <g id="edge189" class="edge">
-<title>Node92&#45;&gt;Node27</title>
+<title>Node93&#45;&gt;Node27</title>
 <path fill="none" stroke="#191970" d="M630.9369,-1014.1644C554.5289,-1009.5497 417.9539,-998.0411 304.2008,-971 213.582,-949.4583 107.2008,-992.644 107.2008,-899.5 107.2008,-899.5 107.2008,-899.5 107.2008,-832.5 107.2008,-796.007 481.7651,-337.063 509.2008,-313 590.8078,-241.4248 617.6516,-225.5863 720.2008,-190 828.3323,-152.4765 871.6339,-202.7273 975.2008,-154 993.7044,-145.2942 991.23,-132.7583 1009.2008,-123 1032.1169,-110.5563 1098.7035,-95.0256 1143.0843,-85.5684"/>
 <polygon fill="#191970" stroke="#191970" points="1143.9574,-88.9614 1153.02,-83.4733 1142.513,-82.112 1143.9574,-88.9614"/>
 </g>
-<!-- Node92&#45;&gt;Node48 -->
+<!-- Node93&#45;&gt;Node48 -->
 <g id="edge191" class="edge">
-<title>Node92&#45;&gt;Node48</title>
+<title>Node93&#45;&gt;Node48</title>
 <path fill="none" stroke="#191970" d="M675.174,-1007.3726C649.3274,-989.5016 599.2008,-948.5232 599.2008,-899.5 599.2008,-899.5 599.2008,-899.5 599.2008,-435 599.2008,-346.1971 891.2408,-246.3196 989.4315,-215.2482"/>
 <polygon fill="#191970" stroke="#191970" points="990.6135,-218.5456 999.1064,-212.2115 988.5172,-211.8669 990.6135,-218.5456"/>
 </g>
-<!-- Node92&#45;&gt;Node80 -->
+<!-- Node93&#45;&gt;Node80 -->
 <g id="edge170" class="edge">
-<title>Node92&#45;&gt;Node80</title>
+<title>Node93&#45;&gt;Node80</title>
 <path fill="none" stroke="#191970" d="M705.2276,-1007.3726C731.0741,-989.5016 781.2008,-948.5232 781.2008,-899.5 781.2008,-899.5 781.2008,-899.5 781.2008,-715 781.2008,-508.0549 1027.7819,-635.6454 1219.2008,-557 1274.0129,-534.4802 1290.3078,-532.9718 1340.2008,-501 1390.0351,-469.0657 1441.0342,-421.1438 1466.5336,-395.9174"/>
 <polygon fill="#191970" stroke="#191970" points="1469.2831,-398.1178 1473.8828,-388.5735 1464.3352,-393.1662 1469.2831,-398.1178"/>
 </g>
-<!-- Node92&#45;&gt;Node81 -->
+<!-- Node93&#45;&gt;Node81 -->
 <g id="edge171" class="edge">
-<title>Node92&#45;&gt;Node81</title>
+<title>Node93&#45;&gt;Node81</title>
 <path fill="none" stroke="#191970" d="M749.6217,-1013.8988C861.2257,-1007.7966 1109.7262,-992.8923 1318.2008,-971 1319.314,-970.8831 1320.4363,-970.7625 1321.5659,-970.6386"/>
 <polygon fill="#191970" stroke="#191970" points="1322.3453,-974.0719 1331.8764,-969.4449 1321.5402,-967.1183 1322.3453,-974.0719"/>
 </g>
-<!-- Node92&#45;&gt;Node82 -->
+<!-- Node93&#45;&gt;Node82 -->
 <g id="edge172" class="edge">
-<title>Node92&#45;&gt;Node82</title>
+<title>Node93&#45;&gt;Node82</title>
 <path fill="none" stroke="#191970" d="M731.6662,-1007.484C824.5362,-986.1711 1049.0424,-934.6487 1150.6796,-911.3237"/>
 <polygon fill="#191970" stroke="#191970" points="1151.7425,-914.6709 1160.7062,-909.0227 1150.1767,-907.8482 1151.7425,-914.6709"/>
 </g>
-<!-- Node92&#45;&gt;Node83 -->
+<!-- Node93&#45;&gt;Node83 -->
 <g id="edge173" class="edge">
-<title>Node92&#45;&gt;Node83</title>
+<title>Node93&#45;&gt;Node83</title>
 <path fill="none" stroke="#191970" d="M749.2544,-1011.358C880.3868,-998.1158 1191.6487,-962.5361 1285.2008,-915 1312.8297,-900.9611 1337.3852,-874.8619 1352.9114,-855.7664"/>
 <polygon fill="#191970" stroke="#191970" points="1355.7426,-857.8286 1359.1917,-847.8109 1350.2483,-853.4912 1355.7426,-857.8286"/>
 </g>
-<!-- Node92&#45;&gt;Node91 -->
+<!-- Node93&#45;&gt;Node92 -->
 <g id="edge188" class="edge">
-<title>Node92&#45;&gt;Node91</title>
+<title>Node93&#45;&gt;Node92</title>
 <path fill="none" stroke="#191970" d="M749.2651,-1016.5058C1106.7398,-1013.4332 2967.4208,-996.2477 3020.2008,-971 3043.1744,-960.0104 3060.5532,-935.5679 3070.5484,-918.3314"/>
 <polygon fill="#191970" stroke="#191970" points="3073.8314,-919.6234 3075.5668,-909.1716 3067.6924,-916.26 3073.8314,-919.6234"/>
 </g>
-<!-- Node93 -->
+<!-- Node94 -->
 <g id="node43" class="node">
-<title>Node93</title>
+<title>Node94</title>
 <g id="a_node43"><a xlink:href="data__layout_8h.html" target="_top" xlink:title="Layout expression to describe the data organization of a tensor. And BijectiveLayout to mapping two d...">
 <polygon fill="#ffffff" stroke="#000000" points="410.7008,-593.5 410.7008,-612.5 529.7008,-612.5 529.7008,-593.5 410.7008,-593.5"/>
 <text text-anchor="middle" x="470.2008" y="-600.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/tir/data_layout.h</text>
 </a>
 </g>
 </g>
-<!-- Node92&#45;&gt;Node93 -->
+<!-- Node93&#45;&gt;Node94 -->
 <g id="edge162" class="edge">
-<title>Node92&#45;&gt;Node93</title>
+<title>Node93&#45;&gt;Node94</title>
 <path fill="none" stroke="#191970" d="M630.8646,-1014.0053C545.2503,-1008.8693 391.9886,-996.3853 343.2008,-971 277.3476,-936.7353 257.1505,-917.1695 230.2008,-848 186.7574,-736.4977 359.4441,-649.0684 435.684,-616.6363"/>
 <polygon fill="#191970" stroke="#191970" points="437.4626,-619.6865 445.3415,-612.6032 434.7651,-613.2271 437.4626,-619.6865"/>
 </g>
-<!-- Node94 -->
+<!-- Node95 -->
 <g id="node45" class="node">
-<title>Node94</title>
+<title>Node95</title>
 <g id="a_node45"><a xlink:href="ravel__unravel_8h.html" target="_top" xlink:title="Index ravel and unraval operations. ">
 <polygon fill="#ffffff" stroke="#000000" points="239.2008,-817.5 239.2008,-847.5 355.2008,-847.5 355.2008,-817.5 239.2008,-817.5"/>
 <text text-anchor="start" x="247.2008" y="-835.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/detail/ravel</text>
@@ -1411,15 +1411,15 @@
 </a>
 </g>
 </g>
-<!-- Node92&#45;&gt;Node94 -->
+<!-- Node93&#45;&gt;Node95 -->
 <g id="edge174" class="edge">
-<title>Node92&#45;&gt;Node94</title>
+<title>Node93&#45;&gt;Node95</title>
 <path fill="none" stroke="#191970" d="M631.091,-1015.0914C549.4162,-1011.3824 407.4427,-1000.7437 366.2008,-971 328.224,-943.6111 309.9748,-889.3154 302.1945,-857.448"/>
 <polygon fill="#191970" stroke="#191970" points="305.5784,-856.5443 299.944,-847.5719 298.7534,-858.0996 305.5784,-856.5443"/>
 </g>
-<!-- Node95 -->
+<!-- Node96 -->
 <g id="node46" class="node">
-<title>Node95</title>
+<title>Node96</title>
 <g id="a_node46"><a xlink:href="strided__slice_8h.html" target="_top" xlink:title="Utility functions for strided_slice op. ">
 <polygon fill="#ffffff" stroke="#000000" points="627.7008,-884.5 627.7008,-914.5 752.7008,-914.5 752.7008,-884.5 627.7008,-884.5"/>
 <text text-anchor="start" x="635.7008" y="-902.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/detail/strided</text>
@@ -1427,15 +1427,15 @@
 </a>
 </g>
 </g>
-<!-- Node92&#45;&gt;Node95 -->
+<!-- Node93&#45;&gt;Node96 -->
 <g id="edge177" class="edge">
-<title>Node92&#45;&gt;Node95</title>
+<title>Node93&#45;&gt;Node96</title>
 <path fill="none" stroke="#191970" d="M690.2008,-1007.3845C690.2008,-989.544 690.2008,-950.7839 690.2008,-924.9138"/>
 <polygon fill="#191970" stroke="#191970" points="693.7009,-924.7143 690.2008,-914.7143 686.7009,-924.7143 693.7009,-924.7143"/>
 </g>
-<!-- Node96 -->
+<!-- Node97 -->
 <g id="node48" class="node">
-<title>Node96</title>
+<title>Node97</title>
 <g id="a_node48"><a xlink:href="tensor__utils_8h.html" target="_top" xlink:title="Utility functions for handling tensor. ">
 <polygon fill="#ffffff" stroke="#000000" points="411.2008,-817.5 411.2008,-847.5 533.2008,-847.5 533.2008,-817.5 411.2008,-817.5"/>
 <text text-anchor="start" x="419.2008" y="-835.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/detail/tensor</text>
@@ -1443,21 +1443,21 @@
 </a>
 </g>
 </g>
-<!-- Node92&#45;&gt;Node96 -->
+<!-- Node93&#45;&gt;Node97 -->
 <g id="edge185" class="edge">
-<title>Node92&#45;&gt;Node96</title>
+<title>Node93&#45;&gt;Node97</title>
 <path fill="none" stroke="#191970" d="M658.2551,-1007.4439C636.2912,-999.8578 607.2286,-987.7344 585.2008,-971 542.4279,-938.5057 505.8358,-886.6345 486.6209,-856.4452"/>
 <polygon fill="#191970" stroke="#191970" points="489.4964,-854.4422 481.2279,-847.8177 483.5606,-858.1526 489.4964,-854.4422"/>
 </g>
-<!-- Node97 -->
+<!-- Node98 -->
 <g id="node49" class="node">
-<title>Node97</title>
+<title>Node98</title>
 <polygon fill="#ffffff" stroke="#bfbfbf" points="375.7008,-951.5 375.7008,-970.5 428.7008,-970.5 428.7008,-951.5 375.7008,-951.5"/>
 <text text-anchor="middle" x="402.2008" y="-958.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">iterator</text>
 </g>
-<!-- Node92&#45;&gt;Node97 -->
+<!-- Node93&#45;&gt;Node98 -->
 <g id="edge190" class="edge">
-<title>Node92&#45;&gt;Node97</title>
+<title>Node93&#45;&gt;Node98</title>
 <path fill="none" stroke="#191970" d="M631.0823,-1007.7603C582.1195,-999.7553 510.5426,-987.2434 438.966,-971.0118"/>
 <polygon fill="#191970" stroke="#191970" points="439.3979,-967.5201 428.8686,-968.6969 437.8336,-974.3431 439.3979,-967.5201"/>
 </g>
@@ -1467,45 +1467,45 @@
 <polygon fill="#ffffff" stroke="#bfbfbf" points="447.2008,-951.5 447.2008,-970.5 533.2008,-970.5 533.2008,-951.5 447.2008,-951.5"/>
 <text text-anchor="middle" x="490.2008" y="-958.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">unordered_set</text>
 </g>
-<!-- Node92&#45;&gt;Node75 -->
+<!-- Node93&#45;&gt;Node75 -->
 <g id="edge193" class="edge">
-<title>Node92&#45;&gt;Node75</title>
+<title>Node93&#45;&gt;Node75</title>
 <path fill="none" stroke="#191970" d="M655.8195,-1007.3733C622.1307,-997.9404 570.6525,-983.5265 534.039,-973.2747"/>
 <polygon fill="#191970" stroke="#191970" points="534.7067,-969.8271 524.1334,-970.5011 532.8193,-976.5679 534.7067,-969.8271"/>
 </g>
-<!-- Node93&#45;&gt;Node17 -->
+<!-- Node94&#45;&gt;Node17 -->
 <g id="edge167" class="edge">
-<title>Node93&#45;&gt;Node17</title>
+<title>Node94&#45;&gt;Node17</title>
 <path fill="none" stroke="#191970" d="M529.7429,-601.9916C833.5209,-596.7839 2199.8028,-572.5768 2288.2008,-557 2499.1731,-519.8241 2673.2008,-481.2227 2673.2008,-267 2673.2008,-267 2673.2008,-267 2673.2008,-138.5 2673.2008,-101.753 2664.8156,-59.5469 2659.1862,-35.3522"/>
 <polygon fill="#191970" stroke="#191970" points="2662.5102,-34.2061 2656.7632,-25.3054 2655.7053,-35.8473 2662.5102,-34.2061"/>
 </g>
-<!-- Node93&#45;&gt;Node19 -->
+<!-- Node94&#45;&gt;Node19 -->
 <g id="edge168" class="edge">
-<title>Node93&#45;&gt;Node19</title>
+<title>Node94&#45;&gt;Node19</title>
 <path fill="none" stroke="#191970" d="M449.3074,-593.4014C431.7607,-584.9167 406.3919,-571.6402 386.2008,-557 173.4356,-402.7281 -123.2589,-259.9274 55.2008,-67 76.7402,-43.7144 552.0722,-22.2864 682.4072,-16.833"/>
 <polygon fill="#191970" stroke="#191970" points="682.7174,-20.3232 692.5636,-16.4113 682.4269,-13.3293 682.7174,-20.3232"/>
 </g>
-<!-- Node93&#45;&gt;Node21 -->
+<!-- Node94&#45;&gt;Node21 -->
 <g id="edge169" class="edge">
-<title>Node93&#45;&gt;Node21</title>
+<title>Node94&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M452.8591,-593.4601C424.9585,-576.6916 373.2008,-539.1023 373.2008,-491 373.2008,-491 373.2008,-491 373.2008,-205.5 373.2008,-165.195 390.6048,-120.5903 401.8563,-95.9654"/>
 <polygon fill="#191970" stroke="#191970" points="405.1239,-97.2425 406.2329,-86.7058 398.7952,-94.2511 405.1239,-97.2425"/>
 </g>
-<!-- Node93&#45;&gt;Node27 -->
+<!-- Node94&#45;&gt;Node27 -->
 <g id="edge165" class="edge">
-<title>Node93&#45;&gt;Node27</title>
+<title>Node94&#45;&gt;Node27</title>
 <path fill="none" stroke="#191970" d="M479.8487,-593.3898C487.9038,-584.6988 498.8953,-571.1317 504.2008,-557 542.4441,-455.1341 460.2669,-401.0435 524.2008,-313 548.5479,-279.4716 572.1668,-293.4425 610.2008,-277 697.3103,-239.3417 714.459,-217.7877 805.2008,-190 893.2332,-163.0419 923.4994,-187.641 1009.2008,-154 1032.8404,-144.7206 1034.4362,-134.2566 1057.2008,-123 1084.9505,-109.2783 1117.8465,-97.547 1143.3407,-89.3661"/>
 <polygon fill="#191970" stroke="#191970" points="1144.5869,-92.6431 1153.0727,-86.2995 1142.483,-85.9667 1144.5869,-92.6431"/>
 </g>
-<!-- Node93&#45;&gt;Node53 -->
+<!-- Node94&#45;&gt;Node53 -->
 <g id="edge163" class="edge">
-<title>Node93&#45;&gt;Node53</title>
+<title>Node94&#45;&gt;Node53</title>
 <path fill="none" stroke="#191970" d="M529.8751,-600.8631C682.9528,-595.1364 1086.8151,-578.416 1219.2008,-557 1314.98,-541.5058 1336.8156,-527.3237 1430.2008,-501 1460.1113,-492.5687 1467.1715,-488.9982 1497.2008,-481 1545.8257,-468.0489 1601.7477,-455.1163 1641.3811,-446.2774"/>
 <polygon fill="#191970" stroke="#191970" points="1642.3418,-449.6495 1651.3453,-444.065 1640.8244,-442.8159 1642.3418,-449.6495"/>
 </g>
-<!-- Node93&#45;&gt;Node62 -->
+<!-- Node94&#45;&gt;Node62 -->
 <g id="edge164" class="edge">
-<title>Node93&#45;&gt;Node62</title>
+<title>Node94&#45;&gt;Node62</title>
 <path fill="none" stroke="#191970" d="M529.8349,-598.2496C666.7461,-587.3435 1001.3454,-560.6898 1125.6659,-550.7866"/>
 <polygon fill="#191970" stroke="#191970" points="1126.2997,-554.2473 1135.9902,-549.9641 1125.7438,-547.2694 1126.2997,-554.2473"/>
 </g>
@@ -1515,57 +1515,57 @@
 <polygon fill="#ffffff" stroke="#bfbfbf" points="439.2008,-537.5 439.2008,-556.5 495.2008,-556.5 495.2008,-537.5 439.2008,-537.5"/>
 <text text-anchor="middle" x="467.2008" y="-544.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">sstream</text>
 </g>
-<!-- Node93&#45;&gt;Node76 -->
+<!-- Node94&#45;&gt;Node76 -->
 <g id="edge166" class="edge">
-<title>Node93&#45;&gt;Node76</title>
+<title>Node94&#45;&gt;Node76</title>
 <path fill="none" stroke="#191970" d="M469.6782,-593.2455C469.2868,-585.9382 468.738,-575.6944 468.2564,-566.7046"/>
 <polygon fill="#191970" stroke="#191970" points="471.7474,-566.4411 467.7173,-556.6427 464.7574,-566.8156 471.7474,-566.4411"/>
 </g>
-<!-- Node94&#45;&gt;Node1 -->
+<!-- Node95&#45;&gt;Node1 -->
 <g id="edge175" class="edge">
-<title>Node94&#45;&gt;Node1</title>
+<title>Node95&#45;&gt;Node1</title>
 <path fill="none" stroke="#191970" d="M355.3954,-822.9101C370.5943,-820.6799 386.982,-818.5172 402.2008,-817 707.493,-786.5649 1074.4369,-775.5948 1219.6213,-772.2624"/>
 <polygon fill="#191970" stroke="#191970" points="1219.7689,-775.7601 1229.6875,-772.0355 1219.6111,-768.7619 1219.7689,-775.7601"/>
 </g>
-<!-- Node94&#45;&gt;Node21 -->
+<!-- Node95&#45;&gt;Node21 -->
 <g id="edge176" class="edge">
-<title>Node94&#45;&gt;Node21</title>
+<title>Node95&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M239.0726,-818.8277C215.2356,-810.9175 188.719,-798.8781 169.2008,-781 124.3498,-739.9178 107.2008,-719.8223 107.2008,-659 107.2008,-659 107.2008,-659 107.2008,-323 107.2008,-229.938 99.7923,-184.992 169.2008,-123 199.4533,-95.98 317.7461,-83.7461 376.9947,-79.2213"/>
 <polygon fill="#191970" stroke="#191970" points="377.6183,-82.6851 387.3361,-78.4643 377.1072,-75.7038 377.6183,-82.6851"/>
 </g>
-<!-- Node95&#45;&gt;Node17 -->
+<!-- Node96&#45;&gt;Node17 -->
 <g id="edge181" class="edge">
-<title>Node95&#45;&gt;Node17</title>
+<title>Node96&#45;&gt;Node17</title>
 <path fill="none" stroke="#191970" d="M752.8102,-896.701C1112.6219,-880.5152 2901.2008,-798.6661 2901.2008,-771 2901.2008,-771 2901.2008,-771 2901.2008,-138.5 2901.2008,-43.6908 2753.5751,-21.9605 2686.5809,-16.9804"/>
 <polygon fill="#191970" stroke="#191970" points="2686.7655,-13.4851 2676.556,-16.3163 2686.3028,-20.4697 2686.7655,-13.4851"/>
 </g>
-<!-- Node95&#45;&gt;Node21 -->
+<!-- Node96&#45;&gt;Node21 -->
 <g id="edge183" class="edge">
-<title>Node95&#45;&gt;Node21</title>
+<title>Node96&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M627.3816,-892.2726C555.0006,-883.2503 441.0403,-866.7089 402.2008,-848 392.8733,-843.507 269.6795,-732.1594 262.2008,-725 207.9319,-673.0487 145.2008,-678.1269 145.2008,-603 145.2008,-603 145.2008,-603 145.2008,-323 145.2008,-217.7042 189.1675,-187.751 272.2008,-123 303.1311,-98.8799 347.2758,-87.1243 377.467,-81.5909"/>
 <polygon fill="#191970" stroke="#191970" points="378.2471,-85.0087 387.5196,-79.8831 377.0746,-78.1075 378.2471,-85.0087"/>
 </g>
-<!-- Node95&#45;&gt;Node27 -->
+<!-- Node96&#45;&gt;Node27 -->
 <g id="edge179" class="edge">
-<title>Node95&#45;&gt;Node27</title>
+<title>Node96&#45;&gt;Node27</title>
 <path fill="none" stroke="#191970" d="M657.3149,-884.4614C619.0954,-864.4594 561.2008,-825.2132 561.2008,-771 561.2008,-771 561.2008,-771 561.2008,-603 561.2008,-523.173 558.5916,-500.2615 585.2008,-425 594.9065,-397.5482 600.783,-391.5519 619.2008,-369 659.2864,-319.9165 782.3342,-213.4685 841.2008,-190 927.5357,-155.5806 962.6665,-192.6314 1047.2008,-154 1067.0247,-144.9406 1066.7077,-134.5355 1085.2008,-123 1105.7293,-110.1949 1130.412,-98.8043 1150.1187,-90.5611"/>
 <polygon fill="#191970" stroke="#191970" points="1151.7107,-93.6913 1159.6369,-86.661 1149.0565,-87.214 1151.7107,-93.6913"/>
 </g>
-<!-- Node95&#45;&gt;Node48 -->
+<!-- Node96&#45;&gt;Node48 -->
 <g id="edge180" class="edge">
-<title>Node95&#45;&gt;Node48</title>
+<title>Node96&#45;&gt;Node48</title>
 <path fill="none" stroke="#191970" d="M689.782,-884.2166C689.0349,-838.3796 690.4881,-698.7079 731.2008,-593 764.0825,-507.6246 950.8538,-287.1177 1005.9211,-223.1426"/>
 <polygon fill="#191970" stroke="#191970" points="1008.9047,-225.0424 1012.7875,-215.1847 1003.6048,-220.4695 1008.9047,-225.0424"/>
 </g>
-<!-- Node95&#45;&gt;Node53 -->
+<!-- Node96&#45;&gt;Node53 -->
 <g id="edge178" class="edge">
-<title>Node95&#45;&gt;Node53</title>
+<title>Node96&#45;&gt;Node53</title>
 <path fill="none" stroke="#191970" d="M711.7494,-884.4098C736.6391,-867.2391 779.0646,-838.7615 817.2008,-817 1011.5863,-706.0786 1061.6835,-679.2548 1268.2008,-593 1313.7231,-573.987 1327.1333,-574.6513 1373.2008,-557 1453.9084,-526.0759 1471.2643,-511.3202 1552.2008,-481 1585.3098,-468.5968 1623.4661,-456.2652 1651.73,-447.4966"/>
 <polygon fill="#191970" stroke="#191970" points="1652.7652,-450.8401 1661.2903,-444.5494 1650.7029,-444.1508 1652.7652,-450.8401"/>
 </g>
-<!-- Node95&#45;&gt;Node83 -->
+<!-- Node96&#45;&gt;Node83 -->
 <g id="edge184" class="edge">
-<title>Node95&#45;&gt;Node83</title>
+<title>Node96&#45;&gt;Node83</title>
 <path fill="none" stroke="#191970" d="M753.0593,-896.0646C861.9805,-889.6665 1092.7072,-874.1501 1286.2008,-848 1288.4098,-847.7015 1290.6489,-847.3845 1292.9071,-847.0524"/>
 <polygon fill="#191970" stroke="#191970" points="1293.5474,-850.4951 1302.8956,-845.5089 1292.4783,-843.5772 1293.5474,-850.4951"/>
 </g>
@@ -1575,21 +1575,21 @@
 <polygon fill="#ffffff" stroke="#bfbfbf" points="627.7008,-823 627.7008,-842 668.7008,-842 668.7008,-823 627.7008,-823"/>
 <text text-anchor="middle" x="648.2008" y="-830" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tuple</text>
 </g>
-<!-- Node95&#45;&gt;Node49 -->
+<!-- Node96&#45;&gt;Node49 -->
 <g id="edge182" class="edge">
-<title>Node95&#45;&gt;Node49</title>
+<title>Node96&#45;&gt;Node49</title>
 <path fill="none" stroke="#191970" d="M680.6703,-884.2967C674.4442,-874.3645 666.2777,-861.3371 659.6925,-850.8321"/>
 <polygon fill="#191970" stroke="#191970" points="662.4674,-848.6689 654.1905,-842.055 656.5363,-852.3868 662.4674,-848.6689"/>
 </g>
-<!-- Node96&#45;&gt;Node1 -->
+<!-- Node97&#45;&gt;Node1 -->
 <g id="edge186" class="edge">
-<title>Node96&#45;&gt;Node1</title>
+<title>Node97&#45;&gt;Node1</title>
 <path fill="none" stroke="#191970" d="M533.2429,-825.5146C559.3851,-822.6539 590.2692,-819.4479 618.2008,-817 838.601,-797.6844 1100.6827,-781.5642 1219.2755,-774.6391"/>
 <polygon fill="#191970" stroke="#191970" points="1219.4949,-778.1323 1229.2746,-774.0571 1219.0881,-771.1441 1219.4949,-778.1323"/>
 </g>
-<!-- Node96&#45;&gt;Node21 -->
+<!-- Node97&#45;&gt;Node21 -->
 <g id="edge187" class="edge">
-<title>Node96&#45;&gt;Node21</title>
+<title>Node97&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M431.8733,-817.4387C360.9214,-788.0225 221.2008,-715.7778 221.2008,-603 221.2008,-603 221.2008,-603 221.2008,-491 221.2008,-323.8109 190.5066,-250.0345 299.2008,-123 319.0192,-99.8375 352.3251,-88.1558 377.4779,-82.3805"/>
 <polygon fill="#191970" stroke="#191970" points="378.2818,-85.7881 387.3535,-80.3151 376.8487,-78.9364 378.2818,-85.7881"/>
 </g>
diff --git a/docs/reference/api/doxygen/broadcast_8h__dep__incl.svg b/docs/reference/api/doxygen/broadcast_8h__dep__incl.svg
index a2711fad4f..255f03a159 100644
--- a/docs/reference/api/doxygen/broadcast_8h__dep__incl.svg
+++ b/docs/reference/api/doxygen/broadcast_8h__dep__incl.svg
@@ -9,45 +9,45 @@
 <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 281)">
 <title>include/tvm/topi/broadcast.h</title>
 <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-281 425.8482,-281 425.8482,4 -4,4"/>
-<!-- Node91 -->
+<!-- Node92 -->
 <g id="node1" class="node">
-<title>Node91</title>
+<title>Node92</title>
 <polygon fill="#bfbfbf" stroke="#000000" points="101.3482,-257.5 101.3482,-276.5 257.3482,-276.5 257.3482,-257.5 101.3482,-257.5"/>
 <text text-anchor="middle" x="179.3482" y="-264.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/broadcast.h</text>
 </g>
-<!-- Node92 -->
+<!-- Node93 -->
 <g id="node2" class="node">
-<title>Node92</title>
+<title>Node93</title>
 <g id="a_node2"><a xlink:href="elemwise_8h.html" target="_top" xlink:title="Elementwise op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="14.8482,-201.5 14.8482,-220.5 169.8482,-220.5 169.8482,-201.5 14.8482,-201.5"/>
 <text text-anchor="middle" x="92.3482" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/elemwise.h</text>
 </a>
 </g>
 </g>
-<!-- Node91&#45;&gt;Node92 -->
+<!-- Node92&#45;&gt;Node93 -->
 <g id="edge1" class="edge">
-<title>Node91&#45;&gt;Node92</title>
+<title>Node92&#45;&gt;Node93</title>
 <path fill="none" stroke="#191970" d="M155.6857,-251.769C140.2691,-241.8456 120.6222,-229.1994 107.3288,-220.6427"/>
 <polygon fill="#191970" stroke="#191970" points="153.8909,-254.7761 164.1939,-257.2455 157.6797,-248.89 153.8909,-254.7761"/>
 </g>
-<!-- Node93 -->
+<!-- Node94 -->
 <g id="node3" class="node">
-<title>Node93</title>
+<title>Node94</title>
 <g id="a_node3"><a xlink:href="reduction_8h.html" target="_top" xlink:title="Reduction op constructors. ">
 <polygon fill="#ffffff" stroke="#000000" points="15.3482,-140 15.3482,-159 169.3482,-159 169.3482,-140 15.3482,-140"/>
 <text text-anchor="middle" x="92.3482" y="-147" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/reduction.h</text>
 </a>
 </g>
 </g>
-<!-- Node91&#45;&gt;Node93 -->
+<!-- Node92&#45;&gt;Node94 -->
 <g id="edge9" class="edge">
-<title>Node91&#45;&gt;Node93</title>
+<title>Node92&#45;&gt;Node94</title>
 <path fill="none" stroke="#191970" d="M100.5748,-255.7108C60.3885,-248.1695 17.4556,-236.7242 5.3482,-221 -15.1075,-194.4337 30.8894,-171.3732 63.4833,-159.0441"/>
 <polygon fill="#191970" stroke="#191970" points="99.9768,-259.1594 110.4403,-257.4968 101.2238,-252.2713 99.9768,-259.1594"/>
 </g>
-<!-- Node98 -->
+<!-- Node99 -->
 <g id="node8" class="node">
-<title>Node98</title>
+<title>Node99</title>
 <g id="a_node8"><a xlink:href="bias__add_8h.html" target="_top" xlink:title="bias_add op constructions ">
 <polygon fill="#ffffff" stroke="#000000" points="310.8482,-134.5 310.8482,-164.5 421.8482,-164.5 421.8482,-134.5 310.8482,-134.5"/>
 <text text-anchor="start" x="318.8482" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
@@ -55,36 +55,36 @@
 </a>
 </g>
 </g>
-<!-- Node91&#45;&gt;Node98 -->
+<!-- Node92&#45;&gt;Node99 -->
 <g id="edge8" class="edge">
-<title>Node91&#45;&gt;Node98</title>
+<title>Node92&#45;&gt;Node99</title>
 <path fill="none" stroke="#191970" d="M253.182,-255.5911C294.0919,-247.7551 339.4367,-236.0503 353.3482,-221 367.393,-205.8054 368.7781,-180.7353 367.9598,-164.6785"/>
 <polygon fill="#191970" stroke="#191970" points="252.3658,-252.1828 243.1732,-257.4502 253.6442,-259.0651 252.3658,-252.1828"/>
 </g>
-<!-- Node99 -->
+<!-- Node100 -->
 <g id="node9" class="node">
-<title>Node99</title>
+<title>Node100</title>
 <g id="a_node9"><a xlink:href="topi_2transform_8h.html" target="_top" xlink:title="Transform op constructors. ">
 <polygon fill="#ffffff" stroke="#000000" points="188.3482,-201.5 188.3482,-220.5 344.3482,-220.5 344.3482,-201.5 188.3482,-201.5"/>
 <text text-anchor="middle" x="266.3482" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/transform.h</text>
 </a>
 </g>
 </g>
-<!-- Node91&#45;&gt;Node99 -->
+<!-- Node92&#45;&gt;Node100 -->
 <g id="edge10" class="edge">
-<title>Node91&#45;&gt;Node99</title>
+<title>Node92&#45;&gt;Node100</title>
 <path fill="none" stroke="#191970" d="M203.0107,-251.769C218.4273,-241.8456 238.0741,-229.1994 251.3676,-220.6427"/>
 <polygon fill="#191970" stroke="#191970" points="201.0167,-248.89 194.5024,-257.2455 204.8055,-254.7761 201.0167,-248.89"/>
 </g>
-<!-- Node92&#45;&gt;Node93 -->
+<!-- Node93&#45;&gt;Node94 -->
 <g id="edge2" class="edge">
-<title>Node92&#45;&gt;Node93</title>
+<title>Node93&#45;&gt;Node94</title>
 <path fill="none" stroke="#191970" d="M92.3482,-191.2462C92.3482,-180.5519 92.3482,-167.814 92.3482,-159.2449"/>
 <polygon fill="#191970" stroke="#191970" points="88.8483,-191.3906 92.3482,-201.3906 95.8483,-191.3907 88.8483,-191.3906"/>
 </g>
-<!-- Node94 -->
+<!-- Node95 -->
 <g id="node4" class="node">
-<title>Node94</title>
+<title>Node95</title>
 <g id="a_node4"><a xlink:href="nn_2pooling_8h.html" target="_top" xlink:title="Pooling op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="47.8482,-.5 47.8482,-30.5 158.8482,-30.5 158.8482,-.5 47.8482,-.5"/>
 <text text-anchor="start" x="55.8482" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
@@ -92,15 +92,15 @@
 </a>
 </g>
 </g>
-<!-- Node93&#45;&gt;Node94 -->
+<!-- Node94&#45;&gt;Node95 -->
 <g id="edge3" class="edge">
-<title>Node93&#45;&gt;Node94</title>
+<title>Node94&#45;&gt;Node95</title>
 <path fill="none" stroke="#191970" d="M56.1973,-135.5185C40.0159,-127.2274 22.5413,-114.967 13.3482,-98 6.7846,-85.8861 6.4363,-78.9186 13.3482,-67 23.0115,-50.3368 40.3197,-38.6061 57.194,-30.5956"/>
 <polygon fill="#191970" stroke="#191970" points="54.9691,-138.8101 65.5028,-139.947 57.9772,-132.4894 54.9691,-138.8101"/>
 </g>
-<!-- Node95 -->
+<!-- Node96 -->
 <g id="node5" class="node">
-<title>Node95</title>
+<title>Node96</title>
 <g id="a_node5"><a xlink:href="nn_2softmax_8h.html" target="_top" xlink:title="Softmax op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="22.8482,-67.5 22.8482,-97.5 133.8482,-97.5 133.8482,-67.5 22.8482,-67.5"/>
 <text text-anchor="start" x="30.8482" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
@@ -108,30 +108,30 @@
 </a>
 </g>
 </g>
-<!-- Node93&#45;&gt;Node95 -->
+<!-- Node94&#45;&gt;Node96 -->
 <g id="edge4" class="edge">
-<title>Node93&#45;&gt;Node95</title>
+<title>Node94&#45;&gt;Node96</title>
 <path fill="none" stroke="#191970" d="M88.2953,-130.1042C86.1273,-119.7287 83.4979,-107.145 81.4871,-97.5218"/>
 <polygon fill="#191970" stroke="#191970" points="84.8709,-130.8279 90.3423,-139.9005 91.7229,-129.396 84.8709,-130.8279"/>
 </g>
-<!-- Node96 -->
+<!-- Node97 -->
 <g id="node6" class="node">
-<title>Node96</title>
+<title>Node97</title>
 <g id="a_node6"><a xlink:href="topi_2nn_8h.html" target="_top" xlink:title="NN op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="152.3482,-73 152.3482,-92 272.3482,-92 272.3482,-73 152.3482,-73"/>
 <text text-anchor="middle" x="212.3482" y="-80" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn.h</text>
 </a>
 </g>
 </g>
-<!-- Node93&#45;&gt;Node96 -->
+<!-- Node94&#45;&gt;Node97 -->
 <g id="edge5" class="edge">
-<title>Node93&#45;&gt;Node96</title>
+<title>Node94&#45;&gt;Node97</title>
 <path fill="none" stroke="#191970" d="M118.525,-134.8846C141.7971,-121.891 175.1515,-103.2681 195.187,-92.0817"/>
 <polygon fill="#191970" stroke="#191970" points="116.5663,-131.9696 109.5413,-139.9005 119.9788,-138.0815 116.5663,-131.9696"/>
 </g>
-<!-- Node97 -->
+<!-- Node98 -->
 <g id="node7" class="node">
-<title>Node97</title>
+<title>Node98</title>
 <g id="a_node7"><a xlink:href="reorg_8h.html" target="_top" xlink:title="Reorg op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="290.3482,-67.5 290.3482,-97.5 418.3482,-97.5 418.3482,-67.5 290.3482,-67.5"/>
 <text text-anchor="start" x="298.3482" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/vision</text>
@@ -139,39 +139,39 @@
 </a>
 </g>
 </g>
-<!-- Node93&#45;&gt;Node97 -->
+<!-- Node94&#45;&gt;Node98 -->
 <g id="edge7" class="edge">
-<title>Node93&#45;&gt;Node97</title>
+<title>Node94&#45;&gt;Node98</title>
 <path fill="none" stroke="#191970" d="M139.4098,-137.4652C183.1109,-126.2897 248.3422,-109.6084 295.475,-97.5553"/>
 <polygon fill="#191970" stroke="#191970" points="138.4292,-134.1032 129.6081,-139.9717 140.1635,-140.885 138.4292,-134.1032"/>
 </g>
-<!-- Node96&#45;&gt;Node94 -->
+<!-- Node97&#45;&gt;Node95 -->
 <g id="edge6" class="edge">
-<title>Node96&#45;&gt;Node94</title>
+<title>Node97&#45;&gt;Node95</title>
 <path fill="none" stroke="#191970" d="M187.9954,-67.5308C170.0527,-56.5019 145.7494,-41.5631 127.7866,-30.5218"/>
 <polygon fill="#191970" stroke="#191970" points="186.379,-70.6456 196.7311,-72.9005 190.0447,-64.6821 186.379,-70.6456"/>
 </g>
-<!-- Node99&#45;&gt;Node93 -->
+<!-- Node100&#45;&gt;Node94 -->
 <g id="edge13" class="edge">
-<title>Node99&#45;&gt;Node93</title>
+<title>Node100&#45;&gt;Node94</title>
 <path fill="none" stroke="#191970" d="M229.4273,-197.9504C196.4414,-186.2916 148.8916,-169.4852 119.3448,-159.0419"/>
 <polygon fill="#191970" stroke="#191970" points="228.5659,-201.3581 239.1607,-201.3906 230.8987,-194.7582 228.5659,-201.3581"/>
 </g>
-<!-- Node99&#45;&gt;Node96 -->
+<!-- Node100&#45;&gt;Node97 -->
 <g id="edge12" class="edge">
-<title>Node99&#45;&gt;Node96</title>
+<title>Node100&#45;&gt;Node97</title>
 <path fill="none" stroke="#191970" d="M258.3273,-191.9134C246.6448,-164.1134 225.3715,-113.4907 216.4541,-92.2705"/>
 <polygon fill="#191970" stroke="#191970" points="255.1885,-193.4785 262.2894,-201.3416 261.6419,-190.7666 255.1885,-193.4785"/>
 </g>
-<!-- Node99&#45;&gt;Node97 -->
+<!-- Node100&#45;&gt;Node98 -->
 <g id="edge14" class="edge">
-<title>Node99&#45;&gt;Node97</title>
+<title>Node100&#45;&gt;Node98</title>
 <path fill="none" stroke="#191970" d="M273.0793,-191.9383C279.3036,-175.7127 289.5193,-152.2082 302.3482,-134 311.8618,-120.4971 325.1226,-107.4276 335.8715,-97.8334"/>
 <polygon fill="#191970" stroke="#191970" points="269.7585,-190.8277 269.5781,-201.421 276.3252,-193.2523 269.7585,-190.8277"/>
 </g>
-<!-- Node99&#45;&gt;Node98 -->
+<!-- Node100&#45;&gt;Node99 -->
 <g id="edge11" class="edge">
-<title>Node99&#45;&gt;Node98</title>
+<title>Node100&#45;&gt;Node99</title>
 <path fill="none" stroke="#191970" d="M290.5375,-196.1236C306.1029,-186.5508 326.2665,-174.1502 341.8854,-164.5446"/>
 <polygon fill="#191970" stroke="#191970" points="288.6577,-193.1707 281.9732,-201.3906 292.3248,-199.1333 288.6577,-193.1707"/>
 </g>
diff --git a/docs/reference/api/doxygen/broadcast_8h__incl.svg b/docs/reference/api/doxygen/broadcast_8h__incl.svg
index c2480e7fd3..06cefc4e96 100644
--- a/docs/reference/api/doxygen/broadcast_8h__incl.svg
+++ b/docs/reference/api/doxygen/broadcast_8h__incl.svg
@@ -70,18 +70,18 @@
 <path fill="none" stroke="#191970" d="M3238.9683,-1009.6942C3013.6316,-988.597 2363.725,-927.7492 2139.3046,-906.7377"/>
 <polygon fill="#191970" stroke="#191970" points="2139.5443,-903.2449 2129.2616,-905.7974 2138.8917,-910.2144 2139.5443,-903.2449"/>
 </g>
-<!-- Node90 -->
+<!-- Node91 -->
 <g id="node50" class="node">
-<title>Node90</title>
+<title>Node91</title>
 <g id="a_node50"><a xlink:href="tags_8h.html" target="_top" xlink:title="External function interface to rocBLAS libraries. ">
 <polygon fill="#ffffff" stroke="#000000" points="3461.5,-828.5 3461.5,-847.5 3552.5,-847.5 3552.5,-828.5 3461.5,-828.5"/>
 <text text-anchor="middle" x="3507" y="-835.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/tags.h</text>
 </a>
 </g>
 </g>
-<!-- Node0&#45;&gt;Node90 -->
+<!-- Node0&#45;&gt;Node91 -->
 <g id="edge206" class="edge">
-<title>Node0&#45;&gt;Node90</title>
+<title>Node0&#45;&gt;Node91</title>
 <path fill="none" stroke="#191970" d="M3373.4252,-1007.4688C3398.4428,-1000.8119 3426.7347,-989.6365 3447,-971 3480.744,-939.9683 3496.7639,-886.3431 3503.2554,-857.5774"/>
 <polygon fill="#191970" stroke="#191970" points="3506.7318,-858.0523 3505.356,-847.5472 3499.8804,-856.6174 3506.7318,-858.0523"/>
 </g>
@@ -118,15 +118,15 @@
 <path fill="none" stroke="#191970" d="M3242.2524,-957.3371C3019.9489,-946.4433 2364.4325,-914.3204 2139.1018,-903.2783"/>
 <polygon fill="#191970" stroke="#191970" points="2139.1795,-899.778 2129.0202,-902.7843 2138.8369,-906.7696 2139.1795,-899.778"/>
 </g>
-<!-- Node89 -->
+<!-- Node90 -->
 <g id="node49" class="node">
-<title>Node89</title>
+<title>Node90</title>
 <polygon fill="#ffffff" stroke="#bfbfbf" points="3281.5,-890 3281.5,-909 3328.5,-909 3328.5,-890 3281.5,-890"/>
 <text text-anchor="middle" x="3305" y="-897" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">deque</text>
 </g>
-<!-- Node1&#45;&gt;Node89 -->
+<!-- Node1&#45;&gt;Node90 -->
 <g id="edge203" class="edge">
-<title>Node1&#45;&gt;Node89</title>
+<title>Node1&#45;&gt;Node90</title>
 <path fill="none" stroke="#191970" d="M3315.125,-951.3906C3313.4441,-942.776 3310.926,-929.8708 3308.8216,-919.0858"/>
 <polygon fill="#191970" stroke="#191970" points="3312.2518,-918.3895 3306.9014,-909.2449 3305.3814,-919.7301 3312.2518,-918.3895"/>
 </g>
@@ -1673,9 +1673,9 @@
 <path fill="none" stroke="#191970" d="M1631.6635,-380.4792C1588.006,-365.402 1499.7416,-335.4421 1424,-313 1383.0548,-300.868 1336.8541,-288.5496 1299.8125,-279.0034"/>
 <polygon fill="#191970" stroke="#191970" points="1300.6682,-275.6097 1290.1119,-276.5128 1298.9273,-282.3898 1300.6682,-275.6097"/>
 </g>
-<!-- Node90&#45;&gt;Node18 -->
+<!-- Node91&#45;&gt;Node18 -->
 <g id="edge207" class="edge">
-<title>Node90&#45;&gt;Node18</title>
+<title>Node91&#45;&gt;Node18</title>
 <path fill="none" stroke="#191970" d="M3504.421,-828.4945C3499.4439,-809.2857 3489,-764.4032 3489,-726 3489,-726 3489,-726 3489,-133 3489,-98.2309 3476.7137,-85.055 3447,-67 3399.7062,-38.2628 3011.2774,-21.1837 2895.4684,-16.695"/>
 <polygon fill="#191970" stroke="#191970" points="2895.2592,-13.1845 2885.1328,-16.2999 2894.9918,-20.1794 2895.2592,-13.1845"/>
 </g>
diff --git a/docs/reference/api/doxygen/constant__utils_8h__dep__incl.svg b/docs/reference/api/doxygen/constant__utils_8h__dep__incl.svg
index 8fc2c8be98..0c5c9cdf8f 100644
--- a/docs/reference/api/doxygen/constant__utils_8h__dep__incl.svg
+++ b/docs/reference/api/doxygen/constant__utils_8h__dep__incl.svg
@@ -9,61 +9,61 @@
 <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 426)">
 <title>include/tvm/topi/detail/constant_utils.h</title>
 <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-426 902.9229,-426 902.9229,4 -4,4"/>
-<!-- Node87 -->
+<!-- Node88 -->
 <g id="node1" class="node">
-<title>Node87</title>
+<title>Node88</title>
 <polygon fill="#bfbfbf" stroke="#000000" points="343.9229,-391.5 343.9229,-421.5 470.9229,-421.5 470.9229,-391.5 343.9229,-391.5"/>
 <text text-anchor="start" x="351.9229" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
 <text text-anchor="middle" x="407.4229" y="-398.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/constant_utils.h</text>
 </g>
-<!-- Node88 -->
+<!-- Node89 -->
 <g id="node2" class="node">
-<title>Node88</title>
+<title>Node89</title>
 <g id="a_node2"><a xlink:href="broadcast_8h.html" target="_top" xlink:title="Broadcast op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="24.4229,-263 24.4229,-282 180.4229,-282 180.4229,-263 24.4229,-263"/>
 <text text-anchor="middle" x="102.4229" y="-270" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/broadcast.h</text>
 </a>
 </g>
 </g>
-<!-- Node87&#45;&gt;Node88 -->
+<!-- Node88&#45;&gt;Node89 -->
 <g id="edge1" class="edge">
-<title>Node87&#45;&gt;Node88</title>
+<title>Node88&#45;&gt;Node89</title>
 <path fill="none" stroke="#191970" d="M333.8707,-399.0896C248.1705,-389.6996 116.3143,-372.6071 101.4229,-355 83.9404,-334.3294 92.817,-299.2128 98.6608,-282.2474"/>
 <polygon fill="#191970" stroke="#191970" points="333.5041,-402.5703 343.8231,-400.1683 334.2585,-395.611 333.5041,-402.5703"/>
 </g>
-<!-- Node90 -->
+<!-- Node91 -->
 <g id="node4" class="node">
-<title>Node90</title>
+<title>Node91</title>
 <g id="a_node4"><a xlink:href="reduction_8h.html" target="_top" xlink:title="Reduction op constructors. ">
 <polygon fill="#ffffff" stroke="#000000" points="16.4229,-140 16.4229,-159 170.4229,-159 170.4229,-140 16.4229,-140"/>
 <text text-anchor="middle" x="93.4229" y="-147" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/reduction.h</text>
 </a>
 </g>
 </g>
-<!-- Node87&#45;&gt;Node90 -->
+<!-- Node88&#45;&gt;Node91 -->
 <g id="edge25" class="edge">
-<title>Node87&#45;&gt;Node90</title>
+<title>Node88&#45;&gt;Node91</title>
 <path fill="none" stroke="#191970" d="M333.7414,-399.6493C253.1049,-391.2779 129.6646,-375.6404 87.4229,-355 19.1958,-321.6626 -22.9439,-267.6617 13.4229,-201 24.4977,-180.6993 47.3852,-167.2003 65.8531,-159.135"/>
 <polygon fill="#191970" stroke="#191970" points="333.5388,-403.1468 343.8432,-400.6834 334.2517,-396.1832 333.5388,-403.1468"/>
 </g>
-<!-- Node93 -->
+<!-- Node94 -->
 <g id="node7" class="node">
-<title>Node93</title>
+<title>Node94</title>
 <g id="a_node7"><a xlink:href="topi_2nn_8h.html" target="_top" xlink:title="NN op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="367.4229,-73 367.4229,-92 487.4229,-92 487.4229,-73 367.4229,-73"/>
 <text text-anchor="middle" x="427.4229" y="-80" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn.h</text>
 </a>
 </g>
 </g>
-<!-- Node87&#45;&gt;Node93 -->
+<!-- Node88&#45;&gt;Node94 -->
 <g id="edge24" class="edge">
-<title>Node87&#45;&gt;Node93</title>
+<title>Node88&#45;&gt;Node94</title>
 <path fill="none" stroke="#191970" d="M426.7195,-383.2061C433.038,-374.7218 439.6522,-364.7996 444.4229,-355 461.1375,-320.6657 468.4229,-310.6867 468.4229,-272.5 468.4229,-272.5 468.4229,-272.5 468.4229,-211 468.4229,-164.6342 443.8964,-113.1211 432.7829,-92.1592"/>
 <polygon fill="#191970" stroke="#191970" points="423.9235,-381.1003 420.5788,-391.1534 429.4626,-385.3803 423.9235,-381.1003"/>
 </g>
-<!-- Node94 -->
+<!-- Node95 -->
 <g id="node8" class="node">
-<title>Node94</title>
+<title>Node95</title>
 <g id="a_node8"><a xlink:href="reorg_8h.html" target="_top" xlink:title="Reorg op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="221.4229,-67.5 221.4229,-97.5 349.4229,-97.5 349.4229,-67.5 221.4229,-67.5"/>
 <text text-anchor="start" x="229.4229" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/vision</text>
@@ -71,30 +71,30 @@
 </a>
 </g>
 </g>
-<!-- Node87&#45;&gt;Node94 -->
+<!-- Node88&#45;&gt;Node95 -->
 <g id="edge27" class="edge">
-<title>Node87&#45;&gt;Node94</title>
+<title>Node88&#45;&gt;Node95</title>
 <path fill="none" stroke="#191970" d="M414.2837,-381.686C421.0617,-354.9515 430.4229,-311.0503 430.4229,-272.5 430.4229,-272.5 430.4229,-272.5 430.4229,-211 430.4229,-154.4909 367.9766,-116.8097 325.0447,-97.5812"/>
 <polygon fill="#191970" stroke="#191970" points="410.8946,-380.812 411.749,-391.3723 417.6666,-382.5841 410.8946,-380.812"/>
 </g>
-<!-- Node96 -->
+<!-- Node97 -->
 <g id="node10" class="node">
-<title>Node96</title>
+<title>Node97</title>
 <g id="a_node10"><a xlink:href="topi_2transform_8h.html" target="_top" xlink:title="Transform op constructors. ">
 <polygon fill="#ffffff" stroke="#000000" points="241.4229,-201.5 241.4229,-220.5 397.4229,-220.5 397.4229,-201.5 241.4229,-201.5"/>
 <text text-anchor="middle" x="319.4229" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/transform.h</text>
 </a>
 </g>
 </g>
-<!-- Node87&#45;&gt;Node96 -->
+<!-- Node88&#45;&gt;Node97 -->
 <g id="edge26" class="edge">
-<title>Node87&#45;&gt;Node96</title>
+<title>Node88&#45;&gt;Node97</title>
 <path fill="none" stroke="#191970" d="M368.1559,-386.398C334.1506,-366.5492 287.2462,-332.7011 266.4229,-288 260.605,-275.5108 260.7044,-269.535 266.4229,-257 273.7047,-241.0379 289.564,-228.4785 302.0471,-220.5442"/>
 <polygon fill="#191970" stroke="#191970" points="366.5109,-389.4888 376.9353,-391.3816 369.9665,-383.4012 366.5109,-389.4888"/>
 </g>
-<!-- Node97 -->
+<!-- Node98 -->
 <g id="node11" class="node">
-<title>Node97</title>
+<title>Node98</title>
 <g id="a_node11"><a xlink:href="detail_2broadcast_8h.html" target="_top" xlink:title="Detail broadcast. ">
 <polygon fill="#ffffff" stroke="#000000" points="110.9229,-324.5 110.9229,-354.5 237.9229,-354.5 237.9229,-324.5 110.9229,-324.5"/>
 <text text-anchor="start" x="118.9229" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
@@ -102,15 +102,15 @@
 </a>
 </g>
 </g>
-<!-- Node87&#45;&gt;Node97 -->
+<!-- Node88&#45;&gt;Node98 -->
 <g id="edge16" class="edge">
-<title>Node87&#45;&gt;Node97</title>
+<title>Node88&#45;&gt;Node98</title>
 <path fill="none" stroke="#191970" d="M345.4666,-388.6842C308.5084,-378.0568 262.0918,-364.7095 226.9407,-354.6017"/>
 <polygon fill="#191970" stroke="#191970" points="344.5553,-392.064 355.1331,-391.4639 346.4898,-385.3366 344.5553,-392.064"/>
 </g>
-<!-- Node98 -->
+<!-- Node99 -->
 <g id="node12" class="node">
-<title>Node98</title>
+<title>Node99</title>
 <g id="a_node12"><a xlink:href="strided__slice_8h.html" target="_top" xlink:title="Utility functions for strided_slice op. ">
 <polygon fill="#ffffff" stroke="#000000" points="274.9229,-257.5 274.9229,-287.5 401.9229,-287.5 401.9229,-257.5 274.9229,-257.5"/>
 <text text-anchor="start" x="282.9229" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
@@ -118,30 +118,30 @@
 </a>
 </g>
 </g>
-<!-- Node87&#45;&gt;Node98 -->
+<!-- Node88&#45;&gt;Node99 -->
 <g id="edge19" class="edge">
-<title>Node87&#45;&gt;Node98</title>
+<title>Node88&#45;&gt;Node99</title>
 <path fill="none" stroke="#191970" d="M395.0283,-382.4294C380.886,-354.9647 358.1827,-310.8742 346.2813,-287.7614"/>
 <polygon fill="#191970" stroke="#191970" points="391.9521,-384.1007 399.6418,-391.389 398.1755,-380.8961 391.9521,-384.1007"/>
 </g>
-<!-- Node99 -->
+<!-- Node100 -->
 <g id="node13" class="node">
-<title>Node99</title>
+<title>Node100</title>
 <g id="a_node13"><a xlink:href="einsum_8h.html" target="_top" xlink:title="Einstein summation op. ">
 <polygon fill="#ffffff" stroke="#000000" points="496.4229,-330 496.4229,-349 640.4229,-349 640.4229,-330 496.4229,-330"/>
 <text text-anchor="middle" x="568.4229" y="-337" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/einsum.h</text>
 </a>
 </g>
 </g>
-<!-- Node87&#45;&gt;Node99 -->
+<!-- Node88&#45;&gt;Node100 -->
 <g id="edge21" class="edge">
-<title>Node87&#45;&gt;Node99</title>
+<title>Node88&#45;&gt;Node100</title>
 <path fill="none" stroke="#191970" d="M453.0212,-387.5243C483.1409,-374.99 521.3445,-359.0916 545.3448,-349.1039"/>
 <polygon fill="#191970" stroke="#191970" points="451.4421,-384.3904 443.5544,-391.4639 454.1316,-390.8531 451.4421,-384.3904"/>
 </g>
-<!-- Node100 -->
+<!-- Node101 -->
 <g id="node14" class="node">
-<title>Node100</title>
+<title>Node101</title>
 <g id="a_node14"><a xlink:href="nn_2bnn_8h.html" target="_top" xlink:title="Binary op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="658.9229,-324.5 658.9229,-354.5 769.9229,-354.5 769.9229,-324.5 658.9229,-324.5"/>
 <text text-anchor="start" x="666.9229" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
@@ -149,15 +149,15 @@
 </a>
 </g>
 </g>
-<!-- Node87&#45;&gt;Node100 -->
+<!-- Node88&#45;&gt;Node101 -->
 <g id="edge22" class="edge">
-<title>Node87&#45;&gt;Node100</title>
+<title>Node88&#45;&gt;Node101</title>
 <path fill="none" stroke="#191970" d="M480.9467,-391.3398C528.964,-381.319 592.9935,-367.7305 649.4229,-355 652.4784,-354.3107 655.6124,-353.5943 658.7744,-352.8644"/>
 <polygon fill="#191970" stroke="#191970" points="480.0276,-387.956 470.9519,-393.4223 481.4556,-394.8088 480.0276,-387.956"/>
 </g>
-<!-- Node101 -->
+<!-- Node102 -->
 <g id="node15" class="node">
-<title>Node101</title>
+<title>Node102</title>
 <g id="a_node15"><a xlink:href="flatten_8h.html" target="_top" xlink:title="Softmax op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="787.9229,-324.5 787.9229,-354.5 898.9229,-354.5 898.9229,-324.5 787.9229,-324.5"/>
 <text text-anchor="start" x="795.9229" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
@@ -165,36 +165,36 @@
 </a>
 </g>
 </g>
-<!-- Node87&#45;&gt;Node101 -->
+<!-- Node88&#45;&gt;Node102 -->
 <g id="edge23" class="edge">
-<title>Node87&#45;&gt;Node101</title>
+<title>Node88&#45;&gt;Node102</title>
 <path fill="none" stroke="#191970" d="M481.2494,-398.4863C556.5657,-389.7945 676.6438,-374.5096 779.4229,-355 782.1185,-354.4883 784.8712,-353.9363 787.6466,-353.3561"/>
 <polygon fill="#191970" stroke="#191970" points="480.4622,-395.0534 470.9255,-399.6687 481.2588,-402.008 480.4622,-395.0534"/>
 </g>
-<!-- Node89 -->
+<!-- Node90 -->
 <g id="node3" class="node">
-<title>Node89</title>
+<title>Node90</title>
 <g id="a_node3"><a xlink:href="elemwise_8h.html" target="_top" xlink:title="Elementwise op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="29.9229,-201.5 29.9229,-220.5 184.9229,-220.5 184.9229,-201.5 29.9229,-201.5"/>
 <text text-anchor="middle" x="107.4229" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/elemwise.h</text>
 </a>
 </g>
 </g>
-<!-- Node88&#45;&gt;Node89 -->
+<!-- Node89&#45;&gt;Node90 -->
 <g id="edge2" class="edge">
-<title>Node88&#45;&gt;Node89</title>
+<title>Node89&#45;&gt;Node90</title>
 <path fill="none" stroke="#191970" d="M104.0289,-252.7462C104.8983,-242.0519 105.9339,-229.314 106.6306,-220.7449"/>
 <polygon fill="#191970" stroke="#191970" points="100.526,-252.6399 103.2041,-262.8906 107.503,-253.2072 100.526,-252.6399"/>
 </g>
-<!-- Node88&#45;&gt;Node90 -->
+<!-- Node89&#45;&gt;Node91 -->
 <g id="edge10" class="edge">
-<title>Node88&#45;&gt;Node90</title>
+<title>Node89&#45;&gt;Node91</title>
 <path fill="none" stroke="#191970" d="M64.2227,-258.4933C47.6381,-250.2815 29.8734,-238.0952 20.4229,-221 5.7463,-194.4513 43.248,-171.468 69.7695,-159.1325"/>
 <polygon fill="#191970" stroke="#191970" points="63.2305,-261.8887 73.7793,-262.876 66.1486,-255.526 63.2305,-261.8887"/>
 </g>
-<!-- Node95 -->
+<!-- Node96 -->
 <g id="node9" class="node">
-<title>Node95</title>
+<title>Node96</title>
 <g id="a_node9"><a xlink:href="bias__add_8h.html" target="_top" xlink:title="bias_add op constructions ">
 <polygon fill="#ffffff" stroke="#000000" points="188.9229,-134.5 188.9229,-164.5 299.9229,-164.5 299.9229,-134.5 188.9229,-134.5"/>
 <text text-anchor="start" x="196.9229" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
@@ -202,27 +202,27 @@
 </a>
 </g>
 </g>
-<!-- Node88&#45;&gt;Node95 -->
+<!-- Node89&#45;&gt;Node96 -->
 <g id="edge9" class="edge">
-<title>Node88&#45;&gt;Node95</title>
+<title>Node89&#45;&gt;Node96</title>
 <path fill="none" stroke="#191970" d="M134.7199,-258.535C153.1977,-249.5419 176.0771,-236.6202 193.4229,-221 211.7781,-204.4708 227.3255,-180.2291 236.2684,-164.6551"/>
 <polygon fill="#191970" stroke="#191970" points="133.0659,-255.4452 125.5111,-262.8732 136.0491,-261.7777 133.0659,-255.4452"/>
 </g>
-<!-- Node88&#45;&gt;Node96 -->
+<!-- Node89&#45;&gt;Node97 -->
 <g id="edge11" class="edge">
-<title>Node88&#45;&gt;Node96</title>
+<title>Node89&#45;&gt;Node97</title>
 <path fill="none" stroke="#191970" d="M146.0949,-260.1229C187.2965,-248.4459 248.1913,-231.1877 285.7547,-220.5419"/>
 <polygon fill="#191970" stroke="#191970" points="144.9958,-256.7965 136.3291,-262.8906 146.9046,-263.5312 144.9958,-256.7965"/>
 </g>
-<!-- Node89&#45;&gt;Node90 -->
+<!-- Node90&#45;&gt;Node91 -->
 <g id="edge3" class="edge">
-<title>Node89&#45;&gt;Node90</title>
+<title>Node90&#45;&gt;Node91</title>
 <path fill="none" stroke="#191970" d="M103.0028,-191.5832C100.5527,-180.8205 97.6124,-167.9042 95.6412,-159.2449"/>
 <polygon fill="#191970" stroke="#191970" points="99.603,-192.417 105.2354,-201.3906 106.4283,-190.8632 99.603,-192.417"/>
 </g>
-<!-- Node91 -->
+<!-- Node92 -->
 <g id="node5" class="node">
-<title>Node91</title>
+<title>Node92</title>
 <g id="a_node5"><a xlink:href="nn_2pooling_8h.html" target="_top" xlink:title="Pooling op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="189.9229,-.5 189.9229,-30.5 300.9229,-30.5 300.9229,-.5 189.9229,-.5"/>
 <text text-anchor="start" x="197.9229" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
@@ -230,15 +230,15 @@
 </a>
 </g>
 </g>
-<!-- Node90&#45;&gt;Node91 -->
+<!-- Node91&#45;&gt;Node92 -->
 <g id="edge4" class="edge">
-<title>Node90&#45;&gt;Node91</title>
+<title>Node91&#45;&gt;Node92</title>
 <path fill="none" stroke="#191970" d="M83.8726,-130.545C76.2711,-112.4878 68.9378,-85.4028 82.4229,-67 95.4036,-49.2856 147.8546,-34.8407 189.7998,-25.8056"/>
 <polygon fill="#191970" stroke="#191970" points="80.754,-132.1409 88.1081,-139.7677 87.1153,-129.2195 80.754,-132.1409"/>
 </g>
-<!-- Node92 -->
+<!-- Node93 -->
 <g id="node6" class="node">
-<title>Node92</title>
+<title>Node93</title>
 <g id="a_node6"><a xlink:href="nn_2softmax_8h.html" target="_top" xlink:title="Softmax op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="91.9229,-67.5 91.9229,-97.5 202.9229,-97.5 202.9229,-67.5 91.9229,-67.5"/>
 <text text-anchor="start" x="99.9229" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
@@ -246,69 +246,69 @@
 </a>
 </g>
 </g>
-<!-- Node90&#45;&gt;Node92 -->
+<!-- Node91&#45;&gt;Node93 -->
 <g id="edge5" class="edge">
-<title>Node90&#45;&gt;Node92</title>
+<title>Node91&#45;&gt;Node93</title>
 <path fill="none" stroke="#191970" d="M107.4932,-132.0423C116.127,-121.33 127.0699,-107.7528 135.3158,-97.5218"/>
 <polygon fill="#191970" stroke="#191970" points="104.7099,-129.9182 101.1597,-139.9005 110.1601,-134.3109 104.7099,-129.9182"/>
 </g>
-<!-- Node90&#45;&gt;Node93 -->
+<!-- Node91&#45;&gt;Node94 -->
 <g id="edge6" class="edge">
-<title>Node90&#45;&gt;Node93</title>
+<title>Node91&#45;&gt;Node94</title>
 <path fill="none" stroke="#191970" d="M154.8671,-138.0822C208.6419,-127.9647 288.8245,-112.5769 358.4229,-98 367.3154,-96.1375 376.8324,-94.0567 385.8698,-92.0393"/>
 <polygon fill="#191970" stroke="#191970" points="153.991,-134.6854 144.8088,-139.9711 155.2831,-141.5652 153.991,-134.6854"/>
 </g>
-<!-- Node90&#45;&gt;Node94 -->
+<!-- Node91&#45;&gt;Node95 -->
 <g id="edge8" class="edge">
-<title>Node90&#45;&gt;Node94</title>
+<title>Node91&#45;&gt;Node95</title>
 <path fill="none" stroke="#191970" d="M130.375,-136.6052C162.3804,-125.4367 208.7025,-109.2722 242.3754,-97.5218"/>
 <polygon fill="#191970" stroke="#191970" points="129.2202,-133.3011 120.9318,-139.9005 131.5266,-139.9103 129.2202,-133.3011"/>
 </g>
-<!-- Node93&#45;&gt;Node91 -->
+<!-- Node94&#45;&gt;Node92 -->
 <g id="edge7" class="edge">
-<title>Node93&#45;&gt;Node91</title>
+<title>Node94&#45;&gt;Node92</title>
 <path fill="none" stroke="#191970" d="M391.6077,-69.3153C361.3007,-58.1583 317.8722,-42.1709 286.2283,-30.5218"/>
 <polygon fill="#191970" stroke="#191970" points="390.7532,-72.7303 401.3467,-72.9005 393.1715,-66.1613 390.7532,-72.7303"/>
 </g>
-<!-- Node96&#45;&gt;Node90 -->
+<!-- Node97&#45;&gt;Node91 -->
 <g id="edge14" class="edge">
-<title>Node96&#45;&gt;Node90</title>
+<title>Node97&#45;&gt;Node91</title>
 <path fill="none" stroke="#191970" d="M274.2114,-198.6969C231.2973,-187.0189 167.6914,-169.7103 128.4874,-159.0419"/>
 <polygon fill="#191970" stroke="#191970" points="273.5422,-202.142 284.1104,-201.3906 275.3803,-195.3876 273.5422,-202.142"/>
 </g>
-<!-- Node96&#45;&gt;Node93 -->
+<!-- Node97&#45;&gt;Node94 -->
 <g id="edge13" class="edge">
-<title>Node96&#45;&gt;Node93</title>
+<title>Node97&#45;&gt;Node94</title>
 <path fill="none" stroke="#191970" d="M334.2475,-193.3614C357.3224,-165.9066 401.062,-113.8646 419.2111,-92.2705"/>
 <polygon fill="#191970" stroke="#191970" points="331.2951,-191.4344 327.5404,-201.3416 336.6538,-195.9382 331.2951,-191.4344"/>
 </g>
-<!-- Node96&#45;&gt;Node94 -->
+<!-- Node97&#45;&gt;Node95 -->
 <g id="edge15" class="edge">
-<title>Node96&#45;&gt;Node94</title>
+<title>Node97&#45;&gt;Node95</title>
 <path fill="none" stroke="#191970" d="M318.1164,-191.206C316.716,-175.4624 313.9065,-153.0012 308.4229,-134 304.7846,-121.3931 298.6534,-107.8737 293.636,-97.8512"/>
 <polygon fill="#191970" stroke="#191970" points="314.6394,-191.6389 318.9051,-201.3371 321.6183,-191.0955 314.6394,-191.6389"/>
 </g>
-<!-- Node96&#45;&gt;Node95 -->
+<!-- Node97&#45;&gt;Node96 -->
 <g id="edge12" class="edge">
-<title>Node96&#45;&gt;Node95</title>
+<title>Node97&#45;&gt;Node96</title>
 <path fill="none" stroke="#191970" d="M299.9576,-195.0385C288.4809,-185.6276 274.0462,-173.7911 262.7699,-164.5446"/>
 <polygon fill="#191970" stroke="#191970" points="297.7521,-197.7562 307.7041,-201.3906 302.1907,-192.3433 297.7521,-197.7562"/>
 </g>
-<!-- Node97&#45;&gt;Node88 -->
+<!-- Node98&#45;&gt;Node89 -->
 <g id="edge17" class="edge">
-<title>Node97&#45;&gt;Node88</title>
+<title>Node98&#45;&gt;Node89</title>
 <path fill="none" stroke="#191970" d="M150.4622,-317.2032C137.6312,-305.2634 122.4944,-291.1776 112.6909,-282.055"/>
 <polygon fill="#191970" stroke="#191970" points="148.3799,-320.0466 158.0849,-324.2967 153.1486,-314.9221 148.3799,-320.0466"/>
 </g>
-<!-- Node97&#45;&gt;Node96 -->
+<!-- Node98&#45;&gt;Node97 -->
 <g id="edge18" class="edge">
-<title>Node97&#45;&gt;Node96</title>
+<title>Node98&#45;&gt;Node97</title>
 <path fill="none" stroke="#191970" d="M187.2725,-315.3049C197.6432,-297.4988 213.579,-273.601 232.4229,-257 250.7556,-240.8493 275.7536,-228.3887 294.2502,-220.5351"/>
 <polygon fill="#191970" stroke="#191970" points="183.9595,-314.0549 182.1004,-324.4854 190.0583,-317.4909 183.9595,-314.0549"/>
 </g>
-<!-- Node98&#45;&gt;Node96 -->
+<!-- Node99&#45;&gt;Node97 -->
 <g id="edge20" class="edge">
-<title>Node98&#45;&gt;Node96</title>
+<title>Node99&#45;&gt;Node97</title>
 <path fill="none" stroke="#191970" d="M330.7254,-247.5846C327.8265,-238.2011 324.695,-228.065 322.465,-220.8469"/>
 <polygon fill="#191970" stroke="#191970" points="327.4303,-248.7765 333.7262,-257.2977 334.1184,-246.7102 327.4303,-248.7765"/>
 </g>
diff --git a/docs/reference/api/doxygen/detail_2broadcast_8h__dep__incl.svg b/docs/reference/api/doxygen/detail_2broadcast_8h__dep__incl.svg
index 394e2c8b5d..fe24c9d6ea 100644
--- a/docs/reference/api/doxygen/detail_2broadcast_8h__dep__incl.svg
+++ b/docs/reference/api/doxygen/detail_2broadcast_8h__dep__incl.svg
@@ -9,76 +9,76 @@
 <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 348)">
 <title>include/tvm/topi/detail/broadcast.h</title>
 <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-348 435.5,-348 435.5,4 -4,4"/>
-<!-- Node89 -->
+<!-- Node90 -->
 <g id="node1" class="node">
-<title>Node89</title>
+<title>Node90</title>
 <polygon fill="#bfbfbf" stroke="#000000" points="211,-313.5 211,-343.5 338,-343.5 338,-313.5 211,-313.5"/>
 <text text-anchor="start" x="219" y="-331.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/detail</text>
 <text text-anchor="middle" x="274.5" y="-320.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/broadcast.h</text>
 </g>
-<!-- Node90 -->
+<!-- Node91 -->
 <g id="node2" class="node">
-<title>Node90</title>
+<title>Node91</title>
 <g id="a_node2"><a xlink:href="broadcast_8h.html" target="_top" xlink:title="Broadcast op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="143.5,-257.5 143.5,-276.5 299.5,-276.5 299.5,-257.5 143.5,-257.5"/>
 <text text-anchor="middle" x="221.5" y="-264.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/broadcast.h</text>
 </a>
 </g>
 </g>
-<!-- Node89&#45;&gt;Node90 -->
+<!-- Node90&#45;&gt;Node91 -->
 <g id="edge1" class="edge">
-<title>Node89&#45;&gt;Node90</title>
+<title>Node90&#45;&gt;Node91</title>
 <path fill="none" stroke="#191970" d="M254.8227,-305.6669C246.2565,-295.7269 236.6697,-284.6026 229.986,-276.8469"/>
 <polygon fill="#191970" stroke="#191970" points="252.2194,-308.0074 261.3989,-313.2977 257.522,-303.4377 252.2194,-308.0074"/>
 </g>
-<!-- Node98 -->
+<!-- Node99 -->
 <g id="node10" class="node">
-<title>Node98</title>
+<title>Node99</title>
 <g id="a_node10"><a xlink:href="topi_2transform_8h.html" target="_top" xlink:title="Transform op constructors. ">
 <polygon fill="#ffffff" stroke="#000000" points="249.5,-201.5 249.5,-220.5 405.5,-220.5 405.5,-201.5 249.5,-201.5"/>
 <text text-anchor="middle" x="327.5" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/transform.h</text>
 </a>
 </g>
 </g>
-<!-- Node89&#45;&gt;Node98 -->
+<!-- Node90&#45;&gt;Node99 -->
 <g id="edge16" class="edge">
-<title>Node89&#45;&gt;Node98</title>
+<title>Node90&#45;&gt;Node99</title>
 <path fill="none" stroke="#191970" d="M292.0891,-305.1342C297.9136,-296.6073 304.0398,-286.6767 308.5,-277 317.2846,-257.9414 322.9709,-233.8931 325.6669,-220.724"/>
 <polygon fill="#191970" stroke="#191970" points="289.1295,-303.2575 286.211,-313.4425 294.8439,-307.3005 289.1295,-303.2575"/>
 </g>
-<!-- Node91 -->
+<!-- Node92 -->
 <g id="node3" class="node">
-<title>Node91</title>
+<title>Node92</title>
 <g id="a_node3"><a xlink:href="elemwise_8h.html" target="_top" xlink:title="Elementwise op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="0,-201.5 0,-220.5 155,-220.5 155,-201.5 0,-201.5"/>
 <text text-anchor="middle" x="77.5" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/elemwise.h</text>
 </a>
 </g>
 </g>
-<!-- Node90&#45;&gt;Node91 -->
+<!-- Node91&#45;&gt;Node92 -->
 <g id="edge2" class="edge">
-<title>Node90&#45;&gt;Node91</title>
+<title>Node91&#45;&gt;Node92</title>
 <path fill="none" stroke="#191970" d="M187.1818,-253.6541C160.922,-243.4419 125.3508,-229.6086 101.9315,-220.5011"/>
 <polygon fill="#191970" stroke="#191970" points="186.1569,-257.0108 196.7455,-257.3733 188.694,-250.4867 186.1569,-257.0108"/>
 </g>
-<!-- Node92 -->
+<!-- Node93 -->
 <g id="node4" class="node">
-<title>Node92</title>
+<title>Node93</title>
 <g id="a_node4"><a xlink:href="reduction_8h.html" target="_top" xlink:title="Reduction op constructors. ">
 <polygon fill="#ffffff" stroke="#000000" points="15.5,-140 15.5,-159 169.5,-159 169.5,-140 15.5,-140"/>
 <text text-anchor="middle" x="92.5" y="-147" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/reduction.h</text>
 </a>
 </g>
 </g>
-<!-- Node90&#45;&gt;Node92 -->
+<!-- Node91&#45;&gt;Node93 -->
 <g id="edge10" class="edge">
-<title>Node90&#45;&gt;Node92</title>
+<title>Node91&#45;&gt;Node93</title>
 <path fill="none" stroke="#191970" d="M207.701,-249.2253C196.519,-235.3577 179.9679,-215.997 163.5,-201 145.5987,-184.6978 122.5454,-168.7795 107.6454,-159.0569"/>
 <polygon fill="#191970" stroke="#191970" points="205.2628,-251.7837 214.2189,-257.4441 210.7475,-247.4341 205.2628,-251.7837"/>
 </g>
-<!-- Node97 -->
+<!-- Node98 -->
 <g id="node9" class="node">
-<title>Node97</title>
+<title>Node98</title>
 <g id="a_node9"><a xlink:href="bias__add_8h.html" target="_top" xlink:title="bias_add op constructions ">
 <polygon fill="#ffffff" stroke="#000000" points="188,-134.5 188,-164.5 299,-164.5 299,-134.5 188,-134.5"/>
 <text text-anchor="start" x="196" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
@@ -86,27 +86,27 @@
 </a>
 </g>
 </g>
-<!-- Node90&#45;&gt;Node97 -->
+<!-- Node91&#45;&gt;Node98 -->
 <g id="edge9" class="edge">
-<title>Node90&#45;&gt;Node97</title>
+<title>Node91&#45;&gt;Node98</title>
 <path fill="none" stroke="#191970" d="M225.1881,-247.3022C229.5447,-224.0339 236.6982,-185.8278 240.6514,-164.7143"/>
 <polygon fill="#191970" stroke="#191970" points="221.7006,-246.9111 223.3004,-257.3845 228.581,-248.1994 221.7006,-246.9111"/>
 </g>
-<!-- Node90&#45;&gt;Node98 -->
+<!-- Node91&#45;&gt;Node99 -->
 <g id="edge11" class="edge">
-<title>Node90&#45;&gt;Node98</title>
+<title>Node91&#45;&gt;Node99</title>
 <path fill="none" stroke="#191970" d="M249.0222,-252.46C267.9908,-242.4388 292.6727,-229.3993 309.2478,-220.6427"/>
 <polygon fill="#191970" stroke="#191970" points="247.1708,-249.4796 239.9638,-257.2455 250.4407,-255.669 247.1708,-249.4796"/>
 </g>
-<!-- Node91&#45;&gt;Node92 -->
+<!-- Node92&#45;&gt;Node93 -->
 <g id="edge3" class="edge">
-<title>Node91&#45;&gt;Node92</title>
+<title>Node92&#45;&gt;Node93</title>
 <path fill="none" stroke="#191970" d="M82.2358,-191.5832C84.8608,-180.8205 88.0112,-167.9042 90.1232,-159.2449"/>
 <polygon fill="#191970" stroke="#191970" points="78.8131,-190.846 79.8438,-201.3906 85.6137,-192.5048 78.8131,-190.846"/>
 </g>
-<!-- Node93 -->
+<!-- Node94 -->
 <g id="node5" class="node">
-<title>Node93</title>
+<title>Node94</title>
 <g id="a_node5"><a xlink:href="nn_2pooling_8h.html" target="_top" xlink:title="Pooling op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="61,-.5 61,-30.5 172,-30.5 172,-.5 61,-.5"/>
 <text text-anchor="start" x="69" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
@@ -114,15 +114,15 @@
 </a>
 </g>
 </g>
-<!-- Node92&#45;&gt;Node93 -->
+<!-- Node93&#45;&gt;Node94 -->
 <g id="edge4" class="edge">
-<title>Node92&#45;&gt;Node93</title>
+<title>Node93&#45;&gt;Node94</title>
 <path fill="none" stroke="#191970" d="M63.089,-134.7356C49.2768,-126.0887 34.1876,-113.7498 26.5,-98 20.4564,-85.6185 19.5881,-78.9186 26.5,-67 36.1633,-50.3368 53.4715,-38.6061 70.3458,-30.5956"/>
 <polygon fill="#191970" stroke="#191970" points="61.4916,-137.8559 71.8906,-139.8834 65.0256,-131.8135 61.4916,-137.8559"/>
 </g>
-<!-- Node94 -->
+<!-- Node95 -->
 <g id="node6" class="node">
-<title>Node94</title>
+<title>Node95</title>
 <g id="a_node6"><a xlink:href="nn_2softmax_8h.html" target="_top" xlink:title="Softmax op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="36,-67.5 36,-97.5 147,-97.5 147,-67.5 36,-67.5"/>
 <text text-anchor="start" x="44" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
@@ -130,30 +130,30 @@
 </a>
 </g>
 </g>
-<!-- Node92&#45;&gt;Node94 -->
+<!-- Node93&#45;&gt;Node95 -->
 <g id="edge5" class="edge">
-<title>Node92&#45;&gt;Node94</title>
+<title>Node93&#45;&gt;Node95</title>
 <path fill="none" stroke="#191970" d="M92.2056,-129.7758C92.0517,-119.4641 91.8663,-107.0437 91.7242,-97.5218"/>
 <polygon fill="#191970" stroke="#191970" points="88.7078,-129.9539 92.3567,-139.9005 95.707,-129.8494 88.7078,-129.9539"/>
 </g>
-<!-- Node95 -->
+<!-- Node96 -->
 <g id="node7" class="node">
-<title>Node95</title>
+<title>Node96</title>
 <g id="a_node7"><a xlink:href="topi_2nn_8h.html" target="_top" xlink:title="NN op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="165.5,-73 165.5,-92 285.5,-92 285.5,-73 165.5,-73"/>
 <text text-anchor="middle" x="225.5" y="-80" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn.h</text>
 </a>
 </g>
 </g>
-<!-- Node92&#45;&gt;Node95 -->
+<!-- Node93&#45;&gt;Node96 -->
 <g id="edge6" class="edge">
-<title>Node92&#45;&gt;Node95</title>
+<title>Node93&#45;&gt;Node96</title>
 <path fill="none" stroke="#191970" d="M120.4869,-135.4013C146.3174,-122.389 183.9776,-103.4173 206.4797,-92.0817"/>
 <polygon fill="#191970" stroke="#191970" points="118.9118,-132.2757 111.5557,-139.9005 122.0611,-138.5273 118.9118,-132.2757"/>
 </g>
-<!-- Node96 -->
+<!-- Node97 -->
 <g id="node8" class="node">
-<title>Node96</title>
+<title>Node97</title>
 <g id="a_node8"><a xlink:href="reorg_8h.html" target="_top" xlink:title="Reorg op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="303.5,-67.5 303.5,-97.5 431.5,-97.5 431.5,-67.5 303.5,-67.5"/>
 <text text-anchor="start" x="311.5" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/vision</text>
@@ -161,39 +161,39 @@
 </a>
 </g>
 </g>
-<!-- Node92&#45;&gt;Node96 -->
+<!-- Node93&#45;&gt;Node97 -->
 <g id="edge8" class="edge">
-<title>Node92&#45;&gt;Node96</title>
+<title>Node93&#45;&gt;Node97</title>
 <path fill="none" stroke="#191970" d="M141.6073,-137.5357C187.4779,-126.3599 256.1301,-109.6338 305.7057,-97.5553"/>
 <polygon fill="#191970" stroke="#191970" points="140.496,-134.204 131.6087,-139.9717 142.1531,-141.005 140.496,-134.204"/>
 </g>
-<!-- Node95&#45;&gt;Node93 -->
+<!-- Node96&#45;&gt;Node94 -->
 <g id="edge7" class="edge">
-<title>Node95&#45;&gt;Node93</title>
+<title>Node96&#45;&gt;Node94</title>
 <path fill="none" stroke="#191970" d="M201.1472,-67.5308C183.2045,-56.5019 158.9012,-41.5631 140.9384,-30.5218"/>
 <polygon fill="#191970" stroke="#191970" points="199.5308,-70.6456 209.883,-72.9005 203.1965,-64.6821 199.5308,-70.6456"/>
 </g>
-<!-- Node98&#45;&gt;Node92 -->
+<!-- Node99&#45;&gt;Node93 -->
 <g id="edge14" class="edge">
-<title>Node98&#45;&gt;Node92</title>
+<title>Node99&#45;&gt;Node93</title>
 <path fill="none" stroke="#191970" d="M281.3191,-198.9144C236.7206,-187.2428 170.0727,-169.8009 129.0464,-159.0643"/>
 <polygon fill="#191970" stroke="#191970" points="280.4787,-202.3122 291.0391,-201.4581 282.251,-195.5403 280.4787,-202.3122"/>
 </g>
-<!-- Node98&#45;&gt;Node95 -->
+<!-- Node99&#45;&gt;Node96 -->
 <g id="edge13" class="edge">
-<title>Node98&#45;&gt;Node95</title>
+<title>Node99&#45;&gt;Node96</title>
 <path fill="none" stroke="#191970" d="M326.6563,-190.7532C325.0519,-174.1123 320.6055,-150.6489 308.5,-134 294.1391,-114.2492 269.6725,-100.4232 251.013,-92.1058"/>
 <polygon fill="#191970" stroke="#191970" points="323.1847,-191.2841 327.3722,-201.0162 330.1678,-190.7969 323.1847,-191.2841"/>
 </g>
-<!-- Node98&#45;&gt;Node96 -->
+<!-- Node99&#45;&gt;Node97 -->
 <g id="edge15" class="edge">
-<title>Node98&#45;&gt;Node96</title>
+<title>Node99&#45;&gt;Node97</title>
 <path fill="none" stroke="#191970" d="M333.5336,-191.6171C341.5329,-165.9192 355.4845,-121.0997 362.768,-97.7016"/>
 <polygon fill="#191970" stroke="#191970" points="330.1369,-190.7532 330.5065,-201.3416 336.8206,-192.8338 330.1369,-190.7532"/>
 </g>
-<!-- Node98&#45;&gt;Node97 -->
+<!-- Node99&#45;&gt;Node98 -->
 <g id="edge12" class="edge">
-<title>Node98&#45;&gt;Node97</title>
+<title>Node99&#45;&gt;Node98</title>
 <path fill="none" stroke="#191970" d="M306.0723,-195.3119C293.1592,-185.8577 276.8008,-173.8809 264.0487,-164.5446"/>
 <polygon fill="#191970" stroke="#191970" points="304.2388,-198.3072 314.375,-201.3906 308.374,-192.6592 304.2388,-198.3072"/>
 </g>
diff --git a/docs/reference/api/doxygen/detail_2broadcast_8h__incl.svg b/docs/reference/api/doxygen/detail_2broadcast_8h__incl.svg
index c817d7b733..1660f59238 100644
--- a/docs/reference/api/doxygen/detail_2broadcast_8h__incl.svg
+++ b/docs/reference/api/doxygen/detail_2broadcast_8h__incl.svg
@@ -71,15 +71,15 @@
 <path fill="none" stroke="#191970" d="M1408.5649,-1063.4639C1379.2433,-1053.5419 1340.6347,-1040.4774 1309.466,-1029.9304"/>
 <polygon fill="#191970" stroke="#191970" points="1310.2231,-1026.4917 1299.6289,-1026.6017 1307.9794,-1033.1224 1310.2231,-1026.4917"/>
 </g>
-<!-- Node88 -->
+<!-- Node89 -->
 <g id="node50" class="node">
-<title>Node88</title>
+<title>Node89</title>
 <polygon fill="#ffffff" stroke="#bfbfbf" points="3098.5,-1002 3098.5,-1021 3145.5,-1021 3145.5,-1002 3098.5,-1002"/>
 <text text-anchor="middle" x="3122" y="-1009" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">deque</text>
 </g>
-<!-- Node0&#45;&gt;Node88 -->
+<!-- Node0&#45;&gt;Node89 -->
 <g id="edge204" class="edge">
-<title>Node0&#45;&gt;Node88</title>
+<title>Node0&#45;&gt;Node89</title>
 <path fill="none" stroke="#191970" d="M1516.7508,-1077.8226C1798.9965,-1074.6427 2928.9912,-1059.9465 3084,-1027 3086.8748,-1026.389 3089.8098,-1025.5724 3092.7109,-1024.6324"/>
 <polygon fill="#191970" stroke="#191970" points="3094.1441,-1027.8341 3102.3074,-1021.0804 3091.7142,-1021.2693 3094.1441,-1027.8341"/>
 </g>
diff --git a/docs/reference/api/doxygen/einsum_8h__incl.svg b/docs/reference/api/doxygen/einsum_8h__incl.svg
index 3d666c1267..23ad23e4d1 100644
--- a/docs/reference/api/doxygen/einsum_8h__incl.svg
+++ b/docs/reference/api/doxygen/einsum_8h__incl.svg
@@ -109,9 +109,9 @@
 <path fill="none" stroke="#191970" d="M3017.9108,-884.3264C3015.9386,-869.297 3010.9231,-839.7066 3000,-817 2986.3968,-788.722 2979.8129,-782.5467 2957,-761 2892.6987,-700.2677 2381.7862,-344.137 2299,-313 2241.196,-291.2591 2065.8298,-276.6595 1976.5122,-270.4247"/>
 <polygon fill="#191970" stroke="#191970" points="1976.5369,-266.9182 1966.3202,-269.7235 1976.0564,-273.9017 1976.5369,-266.9182"/>
 </g>
-<!-- Node89 -->
+<!-- Node90 -->
 <g id="node45" class="node">
-<title>Node89</title>
+<title>Node90</title>
 <g id="a_node45"><a xlink:href="ravel__unravel_8h.html" target="_top" xlink:title="Index ravel and unraval operations. ">
 <polygon fill="#ffffff" stroke="#000000" points="2735,-817.5 2735,-847.5 2851,-847.5 2851,-817.5 2735,-817.5"/>
 <text text-anchor="start" x="2743" y="-835.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/detail/ravel</text>
@@ -119,15 +119,15 @@
 </a>
 </g>
 </g>
-<!-- Node0&#45;&gt;Node89 -->
+<!-- Node0&#45;&gt;Node90 -->
 <g id="edge176" class="edge">
-<title>Node0&#45;&gt;Node89</title>
+<title>Node0&#45;&gt;Node90</title>
 <path fill="none" stroke="#191970" d="M2983.6875,-884.3906C2950.1983,-875.2774 2899.0646,-861.3627 2858.1755,-850.2358"/>
 <polygon fill="#191970" stroke="#191970" points="2858.854,-846.7932 2848.2858,-847.5446 2857.0159,-853.5476 2858.854,-846.7932"/>
 </g>
-<!-- Node90 -->
+<!-- Node91 -->
 <g id="node46" class="node">
-<title>Node90</title>
+<title>Node91</title>
 <g id="a_node46"><a xlink:href="tensor__utils_8h.html" target="_top" xlink:title="Utility functions for handling tensor. ">
 <polygon fill="#ffffff" stroke="#000000" points="2869,-817.5 2869,-847.5 2991,-847.5 2991,-817.5 2869,-817.5"/>
 <text text-anchor="start" x="2877" y="-835.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/detail/tensor</text>
@@ -135,48 +135,48 @@
 </a>
 </g>
 </g>
-<!-- Node0&#45;&gt;Node90 -->
+<!-- Node0&#45;&gt;Node91 -->
 <g id="edge179" class="edge">
-<title>Node0&#45;&gt;Node90</title>
+<title>Node0&#45;&gt;Node91</title>
 <path fill="none" stroke="#191970" d="M3005.0938,-884.3906C2993.1602,-876.1444 2975.5374,-863.9669 2960.3623,-853.4807"/>
 <polygon fill="#191970" stroke="#191970" points="2961.9885,-850.3501 2951.7718,-847.5446 2958.0091,-856.1089 2961.9885,-850.3501"/>
 </g>
-<!-- Node91 -->
+<!-- Node92 -->
 <g id="node47" class="node">
-<title>Node91</title>
+<title>Node92</title>
 <g id="a_node47"><a xlink:href="tags_8h.html" target="_top" xlink:title="External function interface to rocBLAS libraries. ">
 <polygon fill="#ffffff" stroke="#000000" points="28.5,-761.5 28.5,-780.5 119.5,-780.5 119.5,-761.5 28.5,-761.5"/>
 <text text-anchor="middle" x="74" y="-768.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/tags.h</text>
 </a>
 </g>
 </g>
-<!-- Node0&#45;&gt;Node91 -->
+<!-- Node0&#45;&gt;Node92 -->
 <g id="edge182" class="edge">
-<title>Node0&#45;&gt;Node91</title>
+<title>Node0&#45;&gt;Node92</title>
 <path fill="none" stroke="#191970" d="M2946.9247,-893.4909C2556.9116,-890.6313 707.2636,-875.6972 454,-848 331.8662,-834.6433 190.7203,-801.1903 119.7588,-783.0874"/>
 <polygon fill="#191970" stroke="#191970" points="120.3653,-779.6297 109.8089,-780.5321 118.624,-786.4097 120.3653,-779.6297"/>
 </g>
-<!-- Node92 -->
+<!-- Node93 -->
 <g id="node48" class="node">
-<title>Node92</title>
+<title>Node93</title>
 <polygon fill="#ffffff" stroke="#bfbfbf" points="3180,-823 3180,-842 3224,-842 3224,-823 3180,-823"/>
 <text text-anchor="middle" x="3202" y="-830" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">bitset</text>
 </g>
-<!-- Node0&#45;&gt;Node92 -->
+<!-- Node0&#45;&gt;Node93 -->
 <g id="edge185" class="edge">
-<title>Node0&#45;&gt;Node92</title>
+<title>Node0&#45;&gt;Node93</title>
 <path fill="none" stroke="#191970" d="M3052.9042,-884.4723C3082.9157,-875.7829 3127.7613,-862.1718 3166,-848 3167.8185,-847.3261 3169.6782,-846.6101 3171.5475,-845.8697"/>
 <polygon fill="#191970" stroke="#191970" points="3172.9137,-849.0924 3180.8269,-842.0473 3170.2475,-842.62 3172.9137,-849.0924"/>
 </g>
-<!-- Node93 -->
+<!-- Node94 -->
 <g id="node49" class="node">
-<title>Node93</title>
+<title>Node94</title>
 <polygon fill="#ffffff" stroke="#bfbfbf" points="3242.5,-823 3242.5,-842 3295.5,-842 3295.5,-823 3242.5,-823"/>
 <text text-anchor="middle" x="3269" y="-830" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">iterator</text>
 </g>
-<!-- Node0&#45;&gt;Node93 -->
+<!-- Node0&#45;&gt;Node94 -->
 <g id="edge186" class="edge">
-<title>Node0&#45;&gt;Node93</title>
+<title>Node0&#45;&gt;Node94</title>
 <path fill="none" stroke="#191970" d="M3078.3942,-884.4435C3121.7993,-876.6883 3181.7772,-864.36 3233,-848 3235.1067,-847.3271 3237.2587,-846.5753 3239.4105,-845.7754"/>
 <polygon fill="#191970" stroke="#191970" points="3240.7192,-849.0217 3248.7027,-842.0565 3238.1182,-842.5228 3240.7192,-849.0217"/>
 </g>
@@ -1531,33 +1531,33 @@
 <path fill="none" stroke="#191970" d="M1318.8517,-313.1545C1317.3364,-300.2513 1314.5609,-276.9195 1312,-257 1310.8998,-248.442 1309.6448,-239.0917 1308.5042,-230.7351"/>
 <polygon fill="#191970" stroke="#191970" points="1311.97,-230.2462 1307.1408,-220.816 1305.0352,-231.1994 1311.97,-230.2462"/>
 </g>
-<!-- Node89&#45;&gt;Node1 -->
+<!-- Node90&#45;&gt;Node1 -->
 <g id="edge177" class="edge">
-<title>Node89&#45;&gt;Node1</title>
+<title>Node90&#45;&gt;Node1</title>
 <path fill="none" stroke="#191970" d="M2734.8544,-828.441C2576.9379,-817.4173 2140.922,-786.9804 1976.1691,-775.4795"/>
 <polygon fill="#191970" stroke="#191970" points="1975.9347,-771.9547 1965.7152,-774.7497 1975.4472,-778.9377 1975.9347,-771.9547"/>
 </g>
-<!-- Node89&#45;&gt;Node21 -->
+<!-- Node90&#45;&gt;Node21 -->
 <g id="edge178" class="edge">
-<title>Node89&#45;&gt;Node21</title>
+<title>Node90&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M2851.0282,-819.1893C2891.2926,-809.0518 2940.5356,-794.573 2957,-781 3070.0904,-687.7699 3114,-637.5649 3114,-491 3114,-491 3114,-491 3114,-138.5 3114,-104.9798 3113.9736,-90.428 3090,-67 3071.1098,-48.5397 3000.4258,-31.2436 2957.6203,-22.1523"/>
 <polygon fill="#191970" stroke="#191970" points="2958.1489,-18.6874 2947.6453,-20.0748 2956.7216,-25.5403 2958.1489,-18.6874"/>
 </g>
-<!-- Node90&#45;&gt;Node1 -->
+<!-- Node91&#45;&gt;Node1 -->
 <g id="edge180" class="edge">
-<title>Node90&#45;&gt;Node1</title>
+<title>Node91&#45;&gt;Node1</title>
 <path fill="none" stroke="#191970" d="M2868.6452,-818.2977C2865.7301,-817.8172 2862.8375,-817.3802 2860,-817 2688.3926,-794.0059 2159.6875,-777.765 1975.8161,-772.6852"/>
 <polygon fill="#191970" stroke="#191970" points="1975.8545,-769.1851 1965.7621,-772.4091 1975.6622,-776.1824 1975.8545,-769.1851"/>
 </g>
-<!-- Node90&#45;&gt;Node21 -->
+<!-- Node91&#45;&gt;Node21 -->
 <g id="edge181" class="edge">
-<title>Node90&#45;&gt;Node21</title>
+<title>Node91&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M2991.2501,-826.2695C3056.9608,-815.8041 3152,-788.0421 3152,-715 3152,-715 3152,-715 3152,-138.5 3152,-104.9798 3152.5921,-89.7779 3128,-67 3103.3673,-44.1846 3008.9146,-27.5796 2957.6206,-20.031"/>
 <polygon fill="#191970" stroke="#191970" points="2958.0669,-16.5592 2947.6706,-18.6007 2957.0708,-23.488 2958.0669,-16.5592"/>
 </g>
-<!-- Node91&#45;&gt;Node17 -->
+<!-- Node92&#45;&gt;Node17 -->
 <g id="edge183" class="edge">
-<title>Node91&#45;&gt;Node17</title>
+<title>Node92&#45;&gt;Node17</title>
 <path fill="none" stroke="#191970" d="M74,-761.4116C74,-742.0538 74,-696.9167 74,-659 74,-659 74,-659 74,-138.5 74,-104.9798 71.8944,-88.0263 98,-67 132.0401,-39.583 434.7751,-22.1387 535.5397,-17.0509"/>
 <polygon fill="#191970" stroke="#191970" points="535.9135,-20.5368 545.7272,-16.5441 535.5656,-13.5454 535.9135,-20.5368"/>
 </g>
diff --git a/docs/reference/api/doxygen/elemwise_8h__dep__incl.svg b/docs/reference/api/doxygen/elemwise_8h__dep__incl.svg
index 596b01742d..962873d540 100644
--- a/docs/reference/api/doxygen/elemwise_8h__dep__incl.svg
+++ b/docs/reference/api/doxygen/elemwise_8h__dep__incl.svg
@@ -9,30 +9,30 @@
 <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 214)">
 <title>include/tvm/topi/elemwise.h</title>
 <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-214 414.718,-214 414.718,4 -4,4"/>
-<!-- Node93 -->
+<!-- Node94 -->
 <g id="node1" class="node">
-<title>Node93</title>
+<title>Node94</title>
 <polygon fill="#bfbfbf" stroke="#000000" points="60.218,-190.5 60.218,-209.5 215.218,-209.5 215.218,-190.5 60.218,-190.5"/>
 <text text-anchor="middle" x="137.718" y="-197.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/elemwise.h</text>
 </g>
-<!-- Node94 -->
+<!-- Node95 -->
 <g id="node2" class="node">
-<title>Node94</title>
+<title>Node95</title>
 <g id="a_node2"><a xlink:href="reduction_8h.html" target="_top" xlink:title="Reduction op constructors. ">
 <polygon fill="#ffffff" stroke="#000000" points="60.718,-134.5 60.718,-153.5 214.718,-153.5 214.718,-134.5 60.718,-134.5"/>
 <text text-anchor="middle" x="137.718" y="-141.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/reduction.h</text>
 </a>
 </g>
 </g>
-<!-- Node93&#45;&gt;Node94 -->
+<!-- Node94&#45;&gt;Node95 -->
 <g id="edge1" class="edge">
-<title>Node93&#45;&gt;Node94</title>
+<title>Node94&#45;&gt;Node95</title>
 <path fill="none" stroke="#191970" d="M137.718,-180.1575C137.718,-171.155 137.718,-160.9199 137.718,-153.6427"/>
 <polygon fill="#191970" stroke="#191970" points="134.218,-180.2455 137.718,-190.2455 141.218,-180.2456 134.218,-180.2455"/>
 </g>
-<!-- Node95 -->
+<!-- Node96 -->
 <g id="node3" class="node">
-<title>Node95</title>
+<title>Node96</title>
 <g id="a_node3"><a xlink:href="nn_2pooling_8h.html" target="_top" xlink:title="Pooling op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="40.218,-.5 40.218,-30.5 151.218,-30.5 151.218,-.5 40.218,-.5"/>
 <text text-anchor="start" x="48.218" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
@@ -40,15 +40,15 @@
 </a>
 </g>
 </g>
-<!-- Node94&#45;&gt;Node95 -->
+<!-- Node95&#45;&gt;Node96 -->
 <g id="edge2" class="edge">
-<title>Node94&#45;&gt;Node95</title>
+<title>Node95&#45;&gt;Node96</title>
 <path fill="none" stroke="#191970" d="M83.8307,-131.9252C51.655,-123.5022 15.2398,-111.4017 5.718,-98 -2.262,-86.7685 -1.1939,-78.9186 5.718,-67 15.3813,-50.3368 32.6895,-38.6061 49.5638,-30.5956"/>
 <polygon fill="#191970" stroke="#191970" points="83.0918,-135.3485 93.6457,-134.4185 84.8153,-128.564 83.0918,-135.3485"/>
 </g>
-<!-- Node96 -->
+<!-- Node97 -->
 <g id="node4" class="node">
-<title>Node96</title>
+<title>Node97</title>
 <g id="a_node4"><a xlink:href="nn_2softmax_8h.html" target="_top" xlink:title="Softmax op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="15.218,-67.5 15.218,-97.5 126.218,-97.5 126.218,-67.5 15.218,-67.5"/>
 <text text-anchor="start" x="23.218" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
@@ -56,30 +56,30 @@
 </a>
 </g>
 </g>
-<!-- Node94&#45;&gt;Node96 -->
+<!-- Node95&#45;&gt;Node97 -->
 <g id="edge3" class="edge">
-<title>Node94&#45;&gt;Node96</title>
+<title>Node95&#45;&gt;Node97</title>
 <path fill="none" stroke="#191970" d="M119.7288,-127.4875C109.5767,-118.1689 96.9859,-106.6116 87.108,-97.5446"/>
 <polygon fill="#191970" stroke="#191970" points="117.5154,-130.2068 127.2492,-134.3906 122.249,-125.0499 117.5154,-130.2068"/>
 </g>
-<!-- Node97 -->
+<!-- Node98 -->
 <g id="node5" class="node">
-<title>Node97</title>
+<title>Node98</title>
 <g id="a_node5"><a xlink:href="topi_2nn_8h.html" target="_top" xlink:title="NN op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="144.718,-73 144.718,-92 264.718,-92 264.718,-73 144.718,-73"/>
 <text text-anchor="middle" x="204.718" y="-80" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn.h</text>
 </a>
 </g>
 </g>
-<!-- Node94&#45;&gt;Node97 -->
+<!-- Node95&#45;&gt;Node98 -->
 <g id="edge4" class="edge">
-<title>Node94&#45;&gt;Node97</title>
+<title>Node95&#45;&gt;Node98</title>
 <path fill="none" stroke="#191970" d="M155.6411,-127.5482C167.9172,-116.2798 183.7834,-101.716 194.1016,-92.2449"/>
 <polygon fill="#191970" stroke="#191970" points="153.1869,-125.0499 148.1867,-134.3906 157.9205,-130.2068 153.1869,-125.0499"/>
 </g>
-<!-- Node98 -->
+<!-- Node99 -->
 <g id="node6" class="node">
-<title>Node98</title>
+<title>Node99</title>
 <g id="a_node6"><a xlink:href="reorg_8h.html" target="_top" xlink:title="Reorg op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="282.718,-67.5 282.718,-97.5 410.718,-97.5 410.718,-67.5 282.718,-67.5"/>
 <text text-anchor="start" x="290.718" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/vision</text>
@@ -87,15 +87,15 @@
 </a>
 </g>
 </g>
-<!-- Node94&#45;&gt;Node98 -->
+<!-- Node95&#45;&gt;Node99 -->
 <g id="edge6" class="edge">
-<title>Node94&#45;&gt;Node98</title>
+<title>Node95&#45;&gt;Node99</title>
 <path fill="none" stroke="#191970" d="M180.2799,-131.4758C213.6489,-121.6567 260.2015,-107.9582 295.5908,-97.5446"/>
 <polygon fill="#191970" stroke="#191970" points="178.9795,-128.21 170.3742,-134.3906 180.9555,-134.9253 178.9795,-128.21"/>
 </g>
-<!-- Node97&#45;&gt;Node95 -->
+<!-- Node98&#45;&gt;Node96 -->
 <g id="edge5" class="edge">
-<title>Node97&#45;&gt;Node95</title>
+<title>Node98&#45;&gt;Node96</title>
 <path fill="none" stroke="#191970" d="M180.3651,-67.5308C162.4225,-56.5019 138.1192,-41.5631 120.1563,-30.5218"/>
 <polygon fill="#191970" stroke="#191970" points="178.7488,-70.6456 189.1009,-72.9005 182.4145,-64.6821 178.7488,-70.6456"/>
 </g>
diff --git a/docs/reference/api/doxygen/flatten_8h__incl.svg b/docs/reference/api/doxygen/flatten_8h__incl.svg
index d029e98d4a..e13c7853da 100644
--- a/docs/reference/api/doxygen/flatten_8h__incl.svg
+++ b/docs/reference/api/doxygen/flatten_8h__incl.svg
@@ -71,18 +71,18 @@
 <path fill="none" stroke="#191970" d="M1190.092,-1063.4639C1210.555,-1053.8763 1237.2806,-1041.3545 1259.3694,-1031.0052"/>
 <polygon fill="#191970" stroke="#191970" points="1260.95,-1034.1299 1268.5204,-1026.7177 1257.98,-1027.7911 1260.95,-1034.1299"/>
 </g>
-<!-- Node88 -->
+<!-- Node89 -->
 <g id="node50" class="node">
-<title>Node88</title>
+<title>Node89</title>
 <g id="a_node50"><a xlink:href="tags_8h.html" target="_top" xlink:title="External function interface to rocBLAS libraries. ">
 <polygon fill="#ffffff" stroke="#000000" points="28.5,-940.5 28.5,-959.5 119.5,-959.5 119.5,-940.5 28.5,-940.5"/>
 <text text-anchor="middle" x="74" y="-947.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/tags.h</text>
 </a>
 </g>
 </g>
-<!-- Node0&#45;&gt;Node88 -->
+<!-- Node0&#45;&gt;Node89 -->
 <g id="edge203" class="edge">
-<title>Node0&#45;&gt;Node88</title>
+<title>Node0&#45;&gt;Node89</title>
 <path fill="none" stroke="#191970" d="M1102.3752,-1073.0497C948.2228,-1057.7324 502.5941,-1012.0104 134,-960 132.5295,-959.7925 131.0395,-959.5776 129.5366,-959.3568"/>
 <polygon fill="#191970" stroke="#191970" points="129.913,-955.8739 119.5,-957.8285 128.8591,-962.7941 129.913,-955.8739"/>
 </g>
@@ -1654,9 +1654,9 @@
 <path fill="none" stroke="#191970" d="M1627.776,-492.3845C1661.1452,-473.2582 1736.4629,-430.0884 1781.0473,-404.5339"/>
 <polygon fill="#191970" stroke="#191970" points="1782.8652,-407.5261 1789.8007,-399.5167 1779.3843,-401.4529 1782.8652,-407.5261"/>
 </g>
-<!-- Node88&#45;&gt;Node17 -->
+<!-- Node89&#45;&gt;Node17 -->
 <g id="edge204" class="edge">
-<title>Node88&#45;&gt;Node17</title>
+<title>Node89&#45;&gt;Node17</title>
 <path fill="none" stroke="#191970" d="M68.6881,-940.4462C58.6741,-921.5117 38,-877.623 38,-838 38,-838 38,-838 38,-133 38,-101.7875 37.7859,-86.6952 62,-67 111.2779,-26.9184 307.0435,-18.0178 385.3647,-16.0526"/>
 <polygon fill="#191970" stroke="#191970" points="385.6917,-19.5461 395.6092,-15.8188 385.5319,-12.5479 385.6917,-19.5461"/>
 </g>
diff --git a/docs/reference/api/doxygen/namespacemembers_func_o.html b/docs/reference/api/doxygen/namespacemembers_func_o.html
index 3215fab842..6012082d5f 100644
--- a/docs/reference/api/doxygen/namespacemembers_func_o.html
+++ b/docs/reference/api/doxygen/namespacemembers_func_o.html
@@ -134,28 +134,28 @@ $(function() {
 : <a class="el" href="namespacetvm.html#af682776c3609284f1bc3ea436e21a67a">tvm</a>
 , <a class="el" href="namespacetvm_1_1runtime.html#af22b89284299c81d0c1802199af446d7">tvm::runtime</a>
 , <a class="el" href="namespacetvm_1_1te.html#a6bb44656b78b7d6a02ede706ed0a85ec">tvm::te</a>
-, <a class="el" href="namespacetvm_1_1tir.html#aba58d59be99ed4026f32b0c10f690929">tvm::tir</a>
+, <a class="el" href="namespacetvm_1_1tir.html#a82b14dba63bb7d64a1da68dad8651d81">tvm::tir</a>
 , <a class="el" href="namespacetvm_1_1topi.html#a13eb3768682ba9bc6cec7022849ed021">tvm::topi</a>
 </li>
 <li>operator&lt;=()
-: <a class="el" href="namespacetvm.html#a6eea8276bcc178425bc14f3d878970ff">tvm</a>
-, <a class="el" href="namespacetvm_1_1runtime.html#a2e76c697beb4a77556a869f7cc45f09a">tvm::runtime</a>
-, <a class="el" href="namespacetvm_1_1te.html#a155868a829cdec5e04c00fee9fd6b8ab">tvm::te</a>
+: <a class="el" href="namespacetvm.html#af94a56db543e741a23bbf2f51c49091a">tvm</a>
+, <a class="el" href="namespacetvm_1_1runtime.html#a1d696c920a17b8c54775705062de75be">tvm::runtime</a>
+, <a class="el" href="namespacetvm_1_1te.html#ae7fe819e0a6e9615e65cabfe5058b498">tvm::te</a>
 </li>
 <li>operator==()
-: <a class="el" href="namespacetvm.html#a2ea3b45c96d3980227e418f7158ce5c3">tvm</a>
+: <a class="el" href="namespacetvm.html#a04a0a3acfb061ec692ba8fc24e9eacba">tvm</a>
 , <a class="el" href="namespacetvm_1_1runtime.html#aba04626a0c1e717679d673bc90c6a23f">tvm::runtime</a>
-, <a class="el" href="namespacetvm_1_1te.html#a643184f062c87081a55f9e4ba0e6d0d6">tvm::te</a>
+, <a class="el" href="namespacetvm_1_1te.html#a5d186948df24bd18a8aef7eee3b37727">tvm::te</a>
 </li>
 <li>operator&gt;()
-: <a class="el" href="namespacetvm.html#ad93d00f7b080dc3f905f5c34c170a041">tvm</a>
+: <a class="el" href="namespacetvm.html#a6aeb6ed068c5de8ab908ff234337aeeb">tvm</a>
 , <a class="el" href="namespacetvm_1_1runtime.html#a031e6c8e64cd9db11754355e3250ab4c">tvm::runtime</a>
-, <a class="el" href="namespacetvm_1_1te.html#aa8b01278c56c8099c02bf1a614f1a100">tvm::te</a>
+, <a class="el" href="namespacetvm_1_1te.html#a74074c1b06a426adb0f300944b8c4e88">tvm::te</a>
 </li>
 <li>operator&gt;=()
 : <a class="el" href="namespacetvm.html#ac194836fc11a8ba34e44738da17fd116">tvm</a>
 , <a class="el" href="namespacetvm_1_1runtime.html#af7310fb8b9944f41f8f30021d92847eb">tvm::runtime</a>
-, <a class="el" href="namespacetvm_1_1te.html#a54c35df3fc069cb65ad6e28fa6b35109">tvm::te</a>
+, <a class="el" href="namespacetvm_1_1te.html#ac3fd738e5127c9070b347732f40c88b9">tvm::te</a>
 </li>
 <li>operator&gt;&gt;()
 : <a class="el" href="namespacetvm.html#a1ce1eb32fc9d76ebe5a6b8d185024d41">tvm</a>
@@ -168,11 +168,11 @@ $(function() {
 </li>
 <li>operator|()
 : <a class="el" href="namespacetvm_1_1arith.html#a660ff1fdbb08fea19a922157cadad7a7">tvm::arith</a>
-, <a class="el" href="namespacetvm.html#a236d9aae385e6697874f75e4c8a69f8d">tvm</a>
-, <a class="el" href="namespacetvm_1_1topi.html#a0e3d0c113031f4b209febd097e426e06">tvm::topi</a>
+, <a class="el" href="namespacetvm.html#ad5ba71021b167b0a6ca2138b2c8bbace">tvm</a>
+, <a class="el" href="namespacetvm_1_1topi.html#a5fed408670c5215cb416f427bdefc512">tvm::topi</a>
 </li>
 <li>operator||()
-: <a class="el" href="namespacetvm.html#a002710a4652156a57495e10a09b5d002">tvm</a>
+: <a class="el" href="namespacetvm.html#a1a3f9ad4d0e25eee9c0b3a9c83114bc0">tvm</a>
 , <a class="el" href="namespacetvm_1_1te.html#a1619810ecdc1c9b051522a4313a2c24e">tvm::te</a>
 , <a class="el" href="namespacetvm_1_1topi.html#aed48bd10491c0ba13a63b3ebb1bbd8fb">tvm::topi</a>
 </li>
diff --git a/docs/reference/api/doxygen/namespacemembers_o.html b/docs/reference/api/doxygen/namespacemembers_o.html
index 1dda6e0b42..2659da74f2 100644
--- a/docs/reference/api/doxygen/namespacemembers_o.html
+++ b/docs/reference/api/doxygen/namespacemembers_o.html
@@ -140,18 +140,18 @@ $(function() {
 : <a class="el" href="namespacetvm.html#af682776c3609284f1bc3ea436e21a67a">tvm</a>
 , <a class="el" href="namespacetvm_1_1runtime.html#af22b89284299c81d0c1802199af446d7">tvm::runtime</a>
 , <a class="el" href="namespacetvm_1_1te.html#a8d52a6dd288ed59dd3f75fac6e3833f4">tvm::te</a>
-, <a class="el" href="namespacetvm_1_1tir.html#aba58d59be99ed4026f32b0c10f690929">tvm::tir</a>
+, <a class="el" href="namespacetvm_1_1tir.html#a82b14dba63bb7d64a1da68dad8651d81">tvm::tir</a>
 , <a class="el" href="namespacetvm_1_1topi.html#a13eb3768682ba9bc6cec7022849ed021">tvm::topi</a>
 </li>
 <li>operator&lt;=()
-: <a class="el" href="namespacetvm.html#af94a56db543e741a23bbf2f51c49091a">tvm</a>
-, <a class="el" href="namespacetvm_1_1runtime.html#a9cf2e7e67fd12d69c5bce2be881c8296">tvm::runtime</a>
-, <a class="el" href="namespacetvm_1_1te.html#a9049756f490d96b37d24fb4a4d019d6e">tvm::te</a>
+: <a class="el" href="namespacetvm.html#ad5dbec0c48b8644c5c6e9d773ddc106b">tvm</a>
+, <a class="el" href="namespacetvm_1_1runtime.html#a8daf39dc422f228fae2ec11a426bab28">tvm::runtime</a>
+, <a class="el" href="namespacetvm_1_1te.html#a155868a829cdec5e04c00fee9fd6b8ab">tvm::te</a>
 </li>
 <li>operator==()
 : <a class="el" href="namespacetvm.html#a2ea3b45c96d3980227e418f7158ce5c3">tvm</a>
 , <a class="el" href="namespacetvm_1_1runtime.html#aba04626a0c1e717679d673bc90c6a23f">tvm::runtime</a>
-, <a class="el" href="namespacetvm_1_1te.html#a643184f062c87081a55f9e4ba0e6d0d6">tvm::te</a>
+, <a class="el" href="namespacetvm_1_1te.html#a5d186948df24bd18a8aef7eee3b37727">tvm::te</a>
 </li>
 <li>operator&gt;()
 : <a class="el" href="namespacetvm.html#ad93d00f7b080dc3f905f5c34c170a041">tvm</a>
@@ -159,18 +159,18 @@ $(function() {
 , <a class="el" href="namespacetvm_1_1te.html#af05f53104e6686e271783712280e4005">tvm::te</a>
 </li>
 <li>operator&gt;=()
-: <a class="el" href="namespacetvm.html#a5530417da455bd46f5dc55f27d69bcdf">tvm</a>
-, <a class="el" href="namespacetvm_1_1runtime.html#ad83a0ea66d3af9fd1b232e49a960f6f4">tvm::runtime</a>
-, <a class="el" href="namespacetvm_1_1te.html#a54c35df3fc069cb65ad6e28fa6b35109">tvm::te</a>
+: <a class="el" href="namespacetvm.html#a35961a6074b72fae0dfc48ee395e0673">tvm</a>
+, <a class="el" href="namespacetvm_1_1runtime.html#af7310fb8b9944f41f8f30021d92847eb">tvm::runtime</a>
+, <a class="el" href="namespacetvm_1_1te.html#a5cbaee6b481ab46d55c17206e2487eed">tvm::te</a>
 </li>
 <li>operator&gt;&gt;()
-: <a class="el" href="namespacetvm.html#a550c2a251b2a6fd2a72172fe3db75d40">tvm</a>
-, <a class="el" href="namespacetvm_1_1te.html#a4a8524467a57ae005654a3f0cb816e3f">tvm::te</a>
+: <a class="el" href="namespacetvm.html#a1ce1eb32fc9d76ebe5a6b8d185024d41">tvm</a>
+, <a class="el" href="namespacetvm_1_1te.html#a8705a88b943011532ff4c94c4b06c213">tvm::te</a>
 , <a class="el" href="namespacetvm_1_1topi.html#a27d3173a662930df8ab27f3f75ebfa4c">tvm::topi</a>
 </li>
 <li>operator^()
 : <a class="el" href="namespacetvm.html#a6f638564e5e4d1023096523800f2579e">tvm</a>
-, <a class="el" href="namespacetvm_1_1topi.html#a32379f4c2a17152ea26ea90967889847">tvm::topi</a>
+, <a class="el" href="namespacetvm_1_1topi.html#abbf86a9b8077930e0869f8243497e427">tvm::topi</a>
 </li>
 <li>operator|()
 : <a class="el" href="namespacetvm_1_1arith.html#a660ff1fdbb08fea19a922157cadad7a7">tvm::arith</a>
@@ -179,7 +179,7 @@ $(function() {
 </li>
 <li>operator||()
 : <a class="el" href="namespacetvm.html#a1a3f9ad4d0e25eee9c0b3a9c83114bc0">tvm</a>
-, <a class="el" href="namespacetvm_1_1te.html#a5be3fdf7d7b6e325b0d1887b47416918">tvm::te</a>
+, <a class="el" href="namespacetvm_1_1te.html#a1fd6b8f8380a489cfcd806952c2aae42">tvm::te</a>
 , <a class="el" href="namespacetvm_1_1topi.html#aed48bd10491c0ba13a63b3ebb1bbd8fb">tvm::topi</a>
 </li>
 <li>operator~()
diff --git a/docs/reference/api/doxygen/namespacetvm_1_1tir.html b/docs/reference/api/doxygen/namespacetvm_1_1tir.html
index 4a752b8536..351caff659 100644
--- a/docs/reference/api/doxygen/namespacetvm_1_1tir.html
+++ b/docs/reference/api/doxygen/namespacetvm_1_1tir.html
@@ -828,6 +828,8 @@ Functions</h2></td></tr>
 <tr class="memitem:a79fc1666a383e629654ca5f8d2cdf9f4"><td class="memTemplParams" colspan="2">template&lt;&gt; </td></tr>
 <tr class="memitem:a79fc1666a383e629654ca5f8d2cdf9f4"><td class="memTemplItemLeft" align="right" valign="top"><a class="el" href="classtvm_1_1PrimExpr.html">PrimExpr</a>&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacetvm_1_1tir.html#a79fc1666a383e629654ca5f8d2cdf9f4">MakeConstScalar</a> (<a class="el" href="namespacetvm.html#a41918af1a1dc386388639a9d3ad06c5d">DataType</a> t, bool value, <a class="el" href="classtvm_1_1Span.html">Span</a> span)</td></tr>
 <tr class="separator:a79fc1666a383e629654ca5f8d2cdf9f4"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a82b14dba63bb7d64a1da68dad8651d81"><td class="memItemLeft" align="right" valign="top">std::ostream &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacetvm_1_1tir.html#a82b14dba63bb7d64a1da68dad8651d81">operator&lt;&lt;</a> (std::ostream &amp;os, <a class="el" href="namespacetvm_1_1tir.html#a8f4a86b205145696c0555fd02bd37f46">CallEffectKind</a> side_effect)</td></tr>
+<tr class="separator:a82b14dba63bb7d64a1da68dad8651d81"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:abf355a4fdeb063b1adb4946cad5fca68"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classtvm_1_1PrimExpr.html">PrimExpr</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacetvm_1_1tir.html#abf355a4fdeb063b1adb4946cad5fca68">TypeAnnotation</a> (<a class="el" href="namespacetvm.html#a41918af1a1dc386388639a9d3ad06c5d">DataType</a> dtype, <a class="el" href="classtvm_1_1Span.html">Span</a> span=<a class="el" href="classt [...]
 <tr class="memdesc:abf355a4fdeb063b1adb4946cad5fca68"><td class="mdescLeft">&#160;</td><td class="mdescRight">Create a type annotation expression.  <a href="#abf355a4fdeb063b1adb4946cad5fca68">More...</a><br /></td></tr>
 <tr class="separator:abf355a4fdeb063b1adb4946cad5fca68"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -2791,10 +2793,46 @@ template&lt;&gt; </div>
 </table>
 </div><div class="memdoc">
 
+</div>
+</div>
+<a id="a82b14dba63bb7d64a1da68dad8651d81"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a82b14dba63bb7d64a1da68dad8651d81">&#9670;&nbsp;</a></span>operator<<() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">std::ostream&amp; tvm::tir::operator&lt;&lt; </td>
+          <td>(</td>
+          <td class="paramtype">std::ostream &amp;&#160;</td>
+          <td class="paramname"><em>os</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype"><a class="el" href="namespacetvm_1_1tir.html#a8f4a86b205145696c0555fd02bd37f46">CallEffectKind</a>&#160;</td>
+          <td class="paramname"><em>side_effect</em>&#160;</td>
+        </tr>
+        <tr>
+          <td></td>
+          <td>)</td>
+          <td></td><td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
 </div>
 </div>
 <a id="aba58d59be99ed4026f32b0c10f690929"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#aba58d59be99ed4026f32b0c10f690929">&#9670;&nbsp;</a></span>operator<<()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#aba58d59be99ed4026f32b0c10f690929">&#9670;&nbsp;</a></span>operator<<() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
diff --git a/docs/reference/api/doxygen/nn_2bnn_8h__incl.svg b/docs/reference/api/doxygen/nn_2bnn_8h__incl.svg
index a03c299520..f0f4351386 100644
--- a/docs/reference/api/doxygen/nn_2bnn_8h__incl.svg
+++ b/docs/reference/api/doxygen/nn_2bnn_8h__incl.svg
@@ -74,18 +74,18 @@
 <path fill="none" stroke="#191970" d="M1505.7769,-1119.4639C1523.5657,-1110.0047 1546.7259,-1097.6894 1566.0343,-1087.4222"/>
 <polygon fill="#191970" stroke="#191970" points="1567.6955,-1090.503 1574.8816,-1082.7177 1564.409,-1084.3225 1567.6955,-1090.503"/>
 </g>
-<!-- Node88 -->
+<!-- Node89 -->
 <g id="node50" class="node">
-<title>Node88</title>
+<title>Node89</title>
 <g id="a_node50"><a xlink:href="tags_8h.html" target="_top" xlink:title="External function interface to rocBLAS libraries. ">
 <polygon fill="#ffffff" stroke="#000000" points="0,-996.5 0,-1015.5 91,-1015.5 91,-996.5 0,-996.5"/>
 <text text-anchor="middle" x="45.5" y="-1003.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/tags.h</text>
 </a>
 </g>
 </g>
-<!-- Node0&#45;&gt;Node88 -->
+<!-- Node0&#45;&gt;Node89 -->
 <g id="edge209" class="edge">
-<title>Node0&#45;&gt;Node88</title>
+<title>Node0&#45;&gt;Node89</title>
 <path fill="none" stroke="#191970" d="M1421.7503,-1133.719C1171.9124,-1130.0522 164.9411,-1113.4327 105.5,-1083 81.1893,-1070.5534 63.7091,-1043.2631 54.0937,-1024.7502"/>
 <polygon fill="#191970" stroke="#191970" points="57.1124,-1022.9565 49.5675,-1015.5184 50.8271,-1026.0381 57.1124,-1022.9565"/>
 </g>
@@ -1691,9 +1691,9 @@
 <path fill="none" stroke="#191970" d="M2285.7463,-548.3896C2308.7323,-539.6984 2342.8435,-526.1312 2371.5,-512 2403.0679,-496.4331 2437.6032,-476.0423 2462.0212,-461.0214"/>
 <polygon fill="#191970" stroke="#191970" points="2463.8759,-463.9897 2470.5352,-455.7492 2460.1906,-458.0383 2463.8759,-463.9897"/>
 </g>
-<!-- Node88&#45;&gt;Node16 -->
+<!-- Node89&#45;&gt;Node16 -->
 <g id="edge210" class="edge">
-<title>Node88&#45;&gt;Node16</title>
+<title>Node89&#45;&gt;Node16</title>
 <path fill="none" stroke="#191970" d="M48.079,-996.4945C53.0561,-977.2857 63.5,-932.4032 63.5,-894 63.5,-894 63.5,-894 63.5,-138.5 63.5,-101.6453 74.6057,-87.0952 105.5,-67 140.5213,-44.2204 426.9646,-23.8434 524.2837,-17.5183"/>
 <polygon fill="#191970" stroke="#191970" points="524.6533,-21.0018 534.4078,-16.8668 524.2037,-14.0163 524.6533,-21.0018"/>
 </g>
diff --git a/docs/reference/api/doxygen/nn_2pooling_8h__incl.svg b/docs/reference/api/doxygen/nn_2pooling_8h__incl.svg
index 13080ca004..c8c8b8e53c 100644
--- a/docs/reference/api/doxygen/nn_2pooling_8h__incl.svg
+++ b/docs/reference/api/doxygen/nn_2pooling_8h__incl.svg
@@ -98,33 +98,33 @@
 <path fill="none" stroke="#191970" d="M2435.329,-1179.9977C2373.9574,-1168.4199 2276.0435,-1149.9484 2216.4233,-1138.701"/>
 <polygon fill="#191970" stroke="#191970" points="2216.9504,-1135.2388 2206.4749,-1136.8243 2215.6527,-1142.1175 2216.9504,-1135.2388"/>
 </g>
-<!-- Node90 -->
+<!-- Node91 -->
 <g id="node41" class="node">
-<title>Node90</title>
+<title>Node91</title>
 <g id="a_node41"><a xlink:href="reduction_8h.html" target="_top" xlink:title="Reduction op constructors. ">
 <polygon fill="#ffffff" stroke="#000000" points="2760,-1063.5 2760,-1082.5 2876,-1082.5 2876,-1063.5 2760,-1063.5"/>
 <text text-anchor="middle" x="2818" y="-1070.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/reduction.h</text>
 </a>
 </g>
 </g>
-<!-- Node0&#45;&gt;Node90 -->
+<!-- Node0&#45;&gt;Node91 -->
 <g id="edge214" class="edge">
-<title>Node0&#45;&gt;Node90</title>
+<title>Node0&#45;&gt;Node91</title>
 <path fill="none" stroke="#191970" d="M2532.7469,-1175.4992C2597.3338,-1152.2914 2720.3988,-1108.0708 2781.585,-1086.0849"/>
 <polygon fill="#191970" stroke="#191970" points="2782.9804,-1089.3027 2791.2077,-1082.6272 2780.6132,-1082.7151 2782.9804,-1089.3027"/>
 </g>
-<!-- Node94 -->
+<!-- Node95 -->
 <g id="node44" class="node">
-<title>Node94</title>
+<title>Node95</title>
 <g id="a_node44"><a xlink:href="tags_8h.html" target="_top" xlink:title="External function interface to rocBLAS libraries. ">
 <polygon fill="#ffffff" stroke="#000000" points="3149.5,-890 3149.5,-909 3240.5,-909 3240.5,-890 3149.5,-890"/>
 <text text-anchor="middle" x="3195" y="-897" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/tags.h</text>
 </a>
 </g>
 </g>
-<!-- Node0&#45;&gt;Node94 -->
+<!-- Node0&#45;&gt;Node95 -->
 <g id="edge215" class="edge">
-<title>Node0&#45;&gt;Node94</title>
+<title>Node0&#45;&gt;Node95</title>
 <path fill="none" stroke="#191970" d="M2546.5206,-1183.2052C2690.4349,-1163.3642 3071.1257,-1104.6605 3169,-1027 3197.8468,-1004.1109 3204.4855,-987.7406 3202,-951 3201.2909,-940.5176 3199.7787,-928.9167 3198.3407,-919.3833"/>
 <polygon fill="#191970" stroke="#191970" points="3201.7692,-918.6572 3196.7392,-909.3325 3194.8564,-919.7587 3201.7692,-918.6572"/>
 </g>
@@ -1261,30 +1261,30 @@
 <path fill="none" stroke="#191970" d="M2171.7171,-1119.4707C2200.5527,-1078.5628 2314.3319,-917.1488 2357.3798,-856.0784"/>
 <polygon fill="#191970" stroke="#191970" points="2360.2986,-858.0124 2363.1994,-847.8224 2354.5772,-853.9794 2360.2986,-858.0124"/>
 </g>
-<!-- Node82&#45;&gt;Node90 -->
+<!-- Node82&#45;&gt;Node91 -->
 <g id="edge155" class="edge">
-<title>Node82&#45;&gt;Node90</title>
+<title>Node82&#45;&gt;Node91</title>
 <path fill="none" stroke="#191970" d="M2206.3435,-1125.4545C2315.7755,-1116.0698 2613.8485,-1090.5076 2749.3338,-1078.8887"/>
 <polygon fill="#191970" stroke="#191970" points="2749.9212,-1082.3512 2759.5855,-1078.0095 2749.323,-1075.3768 2749.9212,-1082.3512"/>
 </g>
-<!-- Node82&#45;&gt;Node94 -->
+<!-- Node82&#45;&gt;Node95 -->
 <g id="edge210" class="edge">
-<title>Node82&#45;&gt;Node94</title>
+<title>Node82&#45;&gt;Node95</title>
 <path fill="none" stroke="#191970" d="M2206.2119,-1128.5135C2316.695,-1126.7466 2628.5107,-1118.7187 2885,-1083 3001.6454,-1066.7559 3044.8558,-1090.5646 3144,-1027 3170.1905,-1010.2084 3174.3344,-999.8412 3186,-971 3192.729,-954.3637 3194.7429,-933.9234 3195.213,-919.1648"/>
 <polygon fill="#191970" stroke="#191970" points="3198.7129,-919.1816 3195.324,-909.1433 3191.7133,-919.1039 3198.7129,-919.1816"/>
 </g>
-<!-- Node98 -->
+<!-- Node99 -->
 <g id="node47" class="node">
-<title>Node98</title>
+<title>Node99</title>
 <g id="a_node47"><a xlink:href="topi_2transform_8h.html" target="_top" xlink:title="Transform op constructors. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="2478,-1007.5 2478,-1026.5 2596,-1026.5 2596,-1007.5 2478,-1007.5"/>
 <text text-anchor="middle" x="2537" y="-1014.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/transform.h</text>
 </a>
 </g>
 </g>
-<!-- Node82&#45;&gt;Node98 -->
+<!-- Node82&#45;&gt;Node99 -->
 <g id="edge211" class="edge">
-<title>Node82&#45;&gt;Node98</title>
+<title>Node82&#45;&gt;Node99</title>
 <path fill="none" stroke="#191970" d="M2196.7165,-1119.4509C2263.9779,-1099.2002 2420.8866,-1051.9589 2495.7867,-1029.4083"/>
 <polygon fill="#191970" stroke="#191970" points="2496.7974,-1032.7593 2505.3638,-1026.5249 2494.7793,-1026.0565 2496.7974,-1032.7593"/>
 </g>
@@ -1351,60 +1351,60 @@
 <path fill="none" stroke="#191970" d="M1333.5614,-593.2455C1331.7145,-585.8579 1329.1172,-575.4689 1326.8522,-566.4087"/>
 <polygon fill="#191970" stroke="#191970" points="1330.2316,-565.4952 1324.4107,-556.6427 1323.4406,-567.193 1330.2316,-565.4952"/>
 </g>
-<!-- Node90&#45;&gt;Node16 -->
+<!-- Node91&#45;&gt;Node16 -->
 <g id="edge208" class="edge">
-<title>Node90&#45;&gt;Node16</title>
+<title>Node91&#45;&gt;Node16</title>
 <path fill="none" stroke="#191970" d="M2876.1065,-1067.9511C2956.2684,-1060.4493 3096.2915,-1045.2636 3144,-1027 3238.9742,-990.6423 3334,-1001.1955 3334,-899.5 3334,-899.5 3334,-899.5 3334,-133 3334,-83.3971 3290.5146,-84.229 3244,-67 3183.8071,-44.7045 2731.9145,-22.6807 2606.306,-16.94"/>
 <polygon fill="#191970" stroke="#191970" points="2606.3209,-13.4371 2596.1725,-16.4799 2606.0034,-20.4299 2606.3209,-13.4371"/>
 </g>
-<!-- Node90&#45;&gt;Node20 -->
+<!-- Node91&#45;&gt;Node20 -->
 <g id="edge209" class="edge">
-<title>Node90&#45;&gt;Node20</title>
+<title>Node91&#45;&gt;Node20</title>
 <path fill="none" stroke="#191970" d="M2876.2723,-1063.5117C2951.7962,-1051.0973 3074.3441,-1030.5039 3078,-1027 3141.9165,-965.7419 3121,-921.0318 3121,-832.5 3121,-832.5 3121,-832.5 3121,-715 3121,-459.9489 3076.0384,-377.592 2916,-179 2889.2219,-145.771 2878.2095,-137.2979 2838,-123 2769.2195,-98.5427 2252.0914,-81.9734 2113.8789,-77.9496"/>
 <polygon fill="#191970" stroke="#191970" points="2113.9597,-74.4505 2103.8629,-77.6607 2113.7578,-81.4476 2113.9597,-74.4505"/>
 </g>
-<!-- Node90&#45;&gt;Node26 -->
+<!-- Node91&#45;&gt;Node26 -->
 <g id="edge206" class="edge">
-<title>Node90&#45;&gt;Node26</title>
+<title>Node91&#45;&gt;Node26</title>
 <path fill="none" stroke="#191970" d="M2823.0596,-1063.2661C2840.0435,-1029.3852 2892.9143,-913.4326 2874,-817 2843.8813,-663.4429 2741,-647.4829 2741,-491 2741,-491 2741,-491 2741,-435 2741,-396.4632 2541.5125,-196.1456 2507,-179 2466.8788,-159.068 2341.0104,-143.9111 2274.3664,-137.0443"/>
 <polygon fill="#191970" stroke="#191970" points="2274.5032,-133.5403 2264.2012,-136.014 2273.7972,-140.5046 2274.5032,-133.5403"/>
 </g>
-<!-- Node90&#45;&gt;Node58 -->
+<!-- Node91&#45;&gt;Node58 -->
 <g id="edge156" class="edge">
-<title>Node90&#45;&gt;Node58</title>
+<title>Node91&#45;&gt;Node58</title>
 <path fill="none" stroke="#191970" d="M2759.9861,-1068.8242C2666.3248,-1061.6443 2486.1183,-1045.8942 2425,-1027 2232.0712,-967.3578 2025.539,-832.0364 1958.9889,-786.3759"/>
 <polygon fill="#191970" stroke="#191970" points="1960.7612,-783.3464 1950.5434,-780.5452 1956.7842,-789.1069 1960.7612,-783.3464"/>
 </g>
-<!-- Node90&#45;&gt;Node83 -->
+<!-- Node91&#45;&gt;Node83 -->
 <g id="edge168" class="edge">
-<title>Node90&#45;&gt;Node83</title>
+<title>Node91&#45;&gt;Node83</title>
 <path fill="none" stroke="#191970" d="M2759.8083,-1069.2225C2674.4578,-1063.0273 2519.935,-1049.1372 2469,-1027 2405.8623,-999.5594 2382.6181,-980.683 2362,-915 2356.0769,-896.1309 2359.8955,-873.9011 2364.7438,-857.2738"/>
 <polygon fill="#191970" stroke="#191970" points="2368.085,-858.3166 2367.8178,-847.7251 2361.4217,-856.1714 2368.085,-858.3166"/>
 </g>
-<!-- Node91 -->
+<!-- Node92 -->
 <g id="node42" class="node">
-<title>Node91</title>
+<title>Node92</title>
 <g id="a_node42"><a xlink:href="broadcast_8h.html" target="_top" xlink:title="Broadcast op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="2884,-951.5 2884,-970.5 3002,-970.5 3002,-951.5 2884,-951.5"/>
 <text text-anchor="middle" x="2943" y="-958.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/broadcast.h</text>
 </a>
 </g>
 </g>
-<!-- Node90&#45;&gt;Node91 -->
+<!-- Node91&#45;&gt;Node92 -->
 <g id="edge157" class="edge">
-<title>Node90&#45;&gt;Node91</title>
+<title>Node91&#45;&gt;Node92</title>
 <path fill="none" stroke="#191970" d="M2829.189,-1063.1643C2839.7407,-1053.8741 2855.9843,-1039.5344 2870,-1027 2888.6973,-1010.2787 2909.958,-991.0298 2924.6927,-977.6517"/>
 <polygon fill="#191970" stroke="#191970" points="2927.1017,-980.1919 2932.1495,-970.8768 2922.3945,-975.0109 2927.1017,-980.1919"/>
 </g>
-<!-- Node90&#45;&gt;Node94 -->
+<!-- Node91&#45;&gt;Node95 -->
 <g id="edge178" class="edge">
-<title>Node90&#45;&gt;Node94</title>
+<title>Node91&#45;&gt;Node95</title>
 <path fill="none" stroke="#191970" d="M2876.0717,-1070.0379C2952.3737,-1065.1623 3081.2426,-1053.2383 3120,-1027 3158.657,-1000.8297 3180.2305,-947.5397 3189.4907,-918.8387"/>
 <polygon fill="#191970" stroke="#191970" points="3192.876,-919.7366 3192.4468,-909.1505 3186.1807,-917.6937 3192.876,-919.7366"/>
 </g>
-<!-- Node95 -->
+<!-- Node96 -->
 <g id="node45" class="node">
-<title>Node95</title>
+<title>Node96</title>
 <g id="a_node45"><a xlink:href="ravel__unravel_8h.html" target="_top" xlink:title="Index ravel and unraval operations. ">
 <polygon fill="#ffffff" stroke="#000000" points="2749,-817.5 2749,-847.5 2865,-847.5 2865,-817.5 2749,-817.5"/>
 <text text-anchor="start" x="2757" y="-835.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/detail/ravel</text>
@@ -1412,240 +1412,240 @@
 </a>
 </g>
 </g>
-<!-- Node90&#45;&gt;Node95 -->
+<!-- Node91&#45;&gt;Node96 -->
 <g id="edge169" class="edge">
-<title>Node90&#45;&gt;Node95</title>
+<title>Node91&#45;&gt;Node96</title>
 <path fill="none" stroke="#191970" d="M2817.5552,-1063.2742C2815.9679,-1028.5702 2810.5497,-910.1096 2808.1546,-857.7429"/>
 <polygon fill="#191970" stroke="#191970" points="2811.6452,-857.4551 2807.6918,-847.6255 2804.6525,-857.775 2811.6452,-857.4551"/>
 </g>
-<!-- Node96 -->
+<!-- Node97 -->
 <g id="node46" class="node">
-<title>Node96</title>
+<title>Node97</title>
 <g id="a_node46"><a xlink:href="elemwise_8h.html" target="_top" xlink:title="Elementwise op constructions. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="2951.5,-1007.5 2951.5,-1026.5 3068.5,-1026.5 3068.5,-1007.5 2951.5,-1007.5"/>
 <text text-anchor="middle" x="3010" y="-1014.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/elemwise.h</text>
 </a>
 </g>
 </g>
-<!-- Node90&#45;&gt;Node96 -->
+<!-- Node91&#45;&gt;Node97 -->
 <g id="edge172" class="edge">
-<title>Node90&#45;&gt;Node96</title>
+<title>Node91&#45;&gt;Node97</title>
 <path fill="none" stroke="#191970" d="M2851.006,-1063.3733C2883.2091,-1053.9807 2932.3442,-1039.6496 2967.4639,-1029.4064"/>
 <polygon fill="#191970" stroke="#191970" points="2968.8047,-1032.6612 2977.4247,-1026.5011 2966.8447,-1025.9412 2968.8047,-1032.6612"/>
 </g>
-<!-- Node90&#45;&gt;Node98 -->
+<!-- Node91&#45;&gt;Node99 -->
 <g id="edge179" class="edge">
-<title>Node90&#45;&gt;Node98</title>
+<title>Node91&#45;&gt;Node99</title>
 <path fill="none" stroke="#191970" d="M2770.0137,-1063.4369C2721.5186,-1053.7724 2646.5527,-1038.8326 2594.8121,-1028.5213"/>
 <polygon fill="#191970" stroke="#191970" points="2595.2797,-1025.0457 2584.7884,-1026.5237 2593.9115,-1031.9107 2595.2797,-1025.0457"/>
 </g>
-<!-- Node102 -->
+<!-- Node103 -->
 <g id="node50" class="node">
-<title>Node102</title>
+<title>Node103</title>
 <polygon fill="#ffffff" stroke="#bfbfbf" points="2698.5,-951.5 2698.5,-970.5 2751.5,-970.5 2751.5,-951.5 2698.5,-951.5"/>
 <text text-anchor="middle" x="2725" y="-958.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">iterator</text>
 </g>
-<!-- Node90&#45;&gt;Node102 -->
+<!-- Node91&#45;&gt;Node103 -->
 <g id="edge207" class="edge">
-<title>Node90&#45;&gt;Node102</title>
+<title>Node91&#45;&gt;Node103</title>
 <path fill="none" stroke="#191970" d="M2810.0709,-1063.4509C2794.4223,-1044.6054 2759.3642,-1002.3849 2739.483,-978.4419"/>
 <polygon fill="#191970" stroke="#191970" points="2741.9901,-975.9824 2732.9091,-970.5249 2736.6047,-980.4543 2741.9901,-975.9824"/>
 </g>
-<!-- Node91&#45;&gt;Node16 -->
+<!-- Node92&#45;&gt;Node16 -->
 <g id="edge167" class="edge">
-<title>Node91&#45;&gt;Node16</title>
+<title>Node92&#45;&gt;Node16</title>
 <path fill="none" stroke="#191970" d="M2968.0292,-951.4726C3008.2982,-934.3228 3083,-894.1945 3083,-832.5 3083,-832.5 3083,-832.5 3083,-715 3083,-535.3075 3182,-502.6925 3182,-323 3182,-323 3182,-323 3182,-133 3182,-101.7875 3182.7532,-86.0132 3158,-67 3114.0681,-33.2554 2722.5141,-19.6597 2606.348,-16.3472"/>
 <polygon fill="#191970" stroke="#191970" points="2606.3862,-12.847 2596.2925,-16.0667 2606.191,-19.8443 2606.3862,-12.847"/>
 </g>
-<!-- Node91&#45;&gt;Node26 -->
+<!-- Node92&#45;&gt;Node26 -->
 <g id="edge166" class="edge">
-<title>Node91&#45;&gt;Node26</title>
+<title>Node92&#45;&gt;Node26</title>
 <path fill="none" stroke="#191970" d="M2946.2929,-951.4619C2953.2892,-930.3181 2969,-877.7946 2969,-832.5 2969,-832.5 2969,-832.5 2969,-603 2969,-576.9141 2683.9925,-262.7564 2664,-246 2618.9587,-208.2494 2605.4538,-198.4602 2550,-179 2499.917,-161.4246 2348.5729,-144.5601 2274.1887,-137.0615"/>
 <polygon fill="#191970" stroke="#191970" points="2274.5204,-133.5773 2264.2221,-136.0664 2273.8249,-140.5426 2274.5204,-133.5773"/>
 </g>
-<!-- Node91&#45;&gt;Node83 -->
+<!-- Node92&#45;&gt;Node83 -->
 <g id="edge163" class="edge">
-<title>Node91&#45;&gt;Node83</title>
+<title>Node92&#45;&gt;Node83</title>
 <path fill="none" stroke="#191970" d="M2883.7535,-957.8226C2813.5968,-953.0038 2693.6861,-941.4303 2594,-915 2558.7387,-905.651 2552.0716,-897.0343 2518,-884 2488.1842,-872.5938 2454.611,-860.5416 2427.3867,-850.9663"/>
 <polygon fill="#191970" stroke="#191970" points="2428.2575,-847.5628 2417.6627,-847.5564 2425.9411,-854.1684 2428.2575,-847.5628"/>
 </g>
-<!-- Node92 -->
+<!-- Node93 -->
 <g id="node43" class="node">
-<title>Node92</title>
+<title>Node93</title>
 <g id="a_node43"><a xlink:href="detail_2broadcast_8h.html" target="_top" xlink:title="Detail broadcast. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="2602.5,-890 2602.5,-909 2751.5,-909 2751.5,-890 2602.5,-890"/>
 <text text-anchor="middle" x="2677" y="-897" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/detail/broadcast.h</text>
 </a>
 </g>
 </g>
-<!-- Node91&#45;&gt;Node92 -->
+<!-- Node92&#45;&gt;Node93 -->
 <g id="edge158" class="edge">
-<title>Node91&#45;&gt;Node92</title>
+<title>Node92&#45;&gt;Node93</title>
 <path fill="none" stroke="#191970" d="M2901.7294,-951.4581C2855.0391,-940.6632 2778.9425,-923.0694 2728.426,-911.3899"/>
 <polygon fill="#191970" stroke="#191970" points="2728.8989,-907.9069 2718.3675,-909.0643 2727.322,-914.727 2728.8989,-907.9069"/>
 </g>
-<!-- Node91&#45;&gt;Node94 -->
+<!-- Node92&#45;&gt;Node95 -->
 <g id="edge164" class="edge">
-<title>Node91&#45;&gt;Node94</title>
+<title>Node92&#45;&gt;Node95</title>
 <path fill="none" stroke="#191970" d="M2982.0985,-951.4581C3026.239,-940.6857 3098.122,-923.1428 3145.9806,-911.4631"/>
 <polygon fill="#191970" stroke="#191970" points="3146.9247,-914.8355 3155.8098,-909.0643 3145.265,-908.035 3146.9247,-914.8355"/>
 </g>
-<!-- Node92&#45;&gt;Node16 -->
+<!-- Node93&#45;&gt;Node16 -->
 <g id="edge162" class="edge">
-<title>Node92&#45;&gt;Node16</title>
+<title>Node93&#45;&gt;Node16</title>
 <path fill="none" stroke="#191970" d="M2684.0882,-889.6455C2739.8663,-812.0178 3106,-300.9588 3106,-261.5 3106,-261.5 3106,-261.5 3106,-133 3106,-92.7538 3080.0109,-84.9713 3044,-67 2966.8147,-28.4806 2699.563,-18.4969 2606.3644,-16.157"/>
 <polygon fill="#191970" stroke="#191970" points="2606.2157,-12.6525 2596.1353,-15.9137 2606.0492,-19.6505 2606.2157,-12.6525"/>
 </g>
-<!-- Node92&#45;&gt;Node26 -->
+<!-- Node93&#45;&gt;Node26 -->
 <g id="edge161" class="edge">
-<title>Node92&#45;&gt;Node26</title>
+<title>Node93&#45;&gt;Node26</title>
 <path fill="none" stroke="#191970" d="M2676.3204,-889.7798C2673.7819,-852.5998 2665,-715.8241 2665,-603 2665,-603 2665,-603 2665,-435 2665,-364.022 2535.3528,-214.689 2474,-179 2440.6964,-159.6272 2334.3905,-144.7373 2274.3435,-137.6168"/>
 <polygon fill="#191970" stroke="#191970" points="2274.3913,-134.0988 2264.0537,-136.4189 2273.5818,-141.0518 2274.3913,-134.0988"/>
 </g>
-<!-- Node92&#45;&gt;Node58 -->
+<!-- Node93&#45;&gt;Node58 -->
 <g id="edge159" class="edge">
-<title>Node92&#45;&gt;Node58</title>
+<title>Node93&#45;&gt;Node58</title>
 <path fill="none" stroke="#191970" d="M2656.2661,-889.8891C2616.9804,-872.1521 2528.5858,-834.4918 2450,-817 2366.1206,-798.3299 2119.0292,-781.7653 2001.078,-774.6684"/>
 <polygon fill="#191970" stroke="#191970" points="2000.9502,-771.1546 1990.7592,-774.0519 2000.5326,-778.1421 2000.9502,-771.1546"/>
 </g>
-<!-- Node92&#45;&gt;Node83 -->
+<!-- Node93&#45;&gt;Node83 -->
 <g id="edge160" class="edge">
-<title>Node92&#45;&gt;Node83</title>
+<title>Node93&#45;&gt;Node83</title>
 <path fill="none" stroke="#191970" d="M2633.9093,-889.9717C2586.416,-879.4699 2509.0739,-862.3678 2451.0843,-849.545"/>
 <polygon fill="#191970" stroke="#191970" points="2451.7437,-846.1064 2441.2239,-847.3647 2450.2323,-852.9413 2451.7437,-846.1064"/>
 </g>
-<!-- Node94&#45;&gt;Node16 -->
+<!-- Node95&#45;&gt;Node16 -->
 <g id="edge165" class="edge">
-<title>Node94&#45;&gt;Node16</title>
+<title>Node95&#45;&gt;Node16</title>
 <path fill="none" stroke="#191970" d="M3198.1663,-889.9519C3204.8934,-868.7881 3220,-816.2274 3220,-771 3220,-771 3220,-771 3220,-133 3220,-101.7875 3220.804,-85.9469 3196,-67 3148.8788,-31.0057 2727.586,-18.913 2606.4457,-16.1636"/>
 <polygon fill="#191970" stroke="#191970" points="2606.3909,-12.6617 2596.316,-15.9398 2606.2362,-19.66 2606.3909,-12.6617"/>
 </g>
-<!-- Node95&#45;&gt;Node20 -->
+<!-- Node96&#45;&gt;Node20 -->
 <g id="edge171" class="edge">
-<title>Node95&#45;&gt;Node20</title>
+<title>Node96&#45;&gt;Node20</title>
 <path fill="none" stroke="#191970" d="M2817.6528,-817.3469C2831.7939,-795.7084 2855,-754.2237 2855,-715 2855,-715 2855,-715 2855,-603 2855,-386.9392 2963.1428,-234.3751 2778,-123 2720.7477,-88.5591 2246.138,-79.3388 2114.0175,-77.4288"/>
 <polygon fill="#191970" stroke="#191970" points="2113.7517,-73.9249 2103.7037,-77.2847 2113.6538,-80.9242 2113.7517,-73.9249"/>
 </g>
-<!-- Node95&#45;&gt;Node58 -->
+<!-- Node96&#45;&gt;Node58 -->
 <g id="edge170" class="edge">
-<title>Node95&#45;&gt;Node58</title>
+<title>Node96&#45;&gt;Node58</title>
 <path fill="none" stroke="#191970" d="M2748.9049,-828.3933C2592.6308,-817.3463 2164.1179,-787.0549 2001.101,-775.5313"/>
 <polygon fill="#191970" stroke="#191970" points="2000.975,-772.0137 1990.7531,-774.7998 2000.4813,-778.9963 2000.975,-772.0137"/>
 </g>
-<!-- Node96&#45;&gt;Node16 -->
+<!-- Node97&#45;&gt;Node16 -->
 <g id="edge176" class="edge">
-<title>Node96&#45;&gt;Node16</title>
+<title>Node97&#45;&gt;Node16</title>
 <path fill="none" stroke="#191970" d="M3068.7509,-1009.9954C3125.696,-999.7661 3209.2613,-974.8964 3250,-915 3328.0112,-800.3033 3258,-741.7122 3258,-603 3258,-603 3258,-603 3258,-133 3258,-98.6999 3247.1901,-85.0119 3218,-67 3165.6491,-34.6966 2730.0936,-19.9383 2606.5514,-16.377"/>
 <polygon fill="#191970" stroke="#191970" points="2606.3324,-12.8695 2596.2373,-16.0849 2606.1342,-19.8667 2606.3324,-12.8695"/>
 </g>
-<!-- Node96&#45;&gt;Node26 -->
+<!-- Node97&#45;&gt;Node26 -->
 <g id="edge175" class="edge">
-<title>Node96&#45;&gt;Node26</title>
+<title>Node97&#45;&gt;Node26</title>
 <path fill="none" stroke="#191970" d="M3010.2671,-1007.4862C3010.5141,-998.1847 3010.8628,-983.6203 3011,-971 3011.0966,-962.1116 3011.0753,-959.8886 3011,-951 3010.1111,-846.0998 3007,-819.904 3007,-715 3007,-715 3007,-715 3007,-547 3007,-302.1422 2825.3046,-267.5005 2597,-179 2567.4274,-167.5364 2363.9984,-146.1681 2274.3141,-137.1673"/>
 <polygon fill="#191970" stroke="#191970" points="2274.5126,-133.6698 2264.214,-136.1577 2273.8163,-140.6351 2274.5126,-133.6698"/>
 </g>
-<!-- Node96&#45;&gt;Node52 -->
+<!-- Node97&#45;&gt;Node52 -->
 <g id="edge173" class="edge">
-<title>Node96&#45;&gt;Node52</title>
+<title>Node97&#45;&gt;Node52</title>
 <path fill="none" stroke="#191970" d="M2980.4616,-1007.4571C2934.193,-992.9008 2841.5762,-965.3337 2761,-951 2591.3631,-920.8233 2545.8713,-937.143 2375,-915 1926.3559,-856.8607 1805.1506,-865.1148 1375,-725 1305.7519,-702.4435 1286.9412,-699.9169 1221,-669 1198.6382,-658.5155 1143.5349,-629.3227 1125,-613 1101.4503,-592.261 1095.3419,-585.4022 1082,-557 1075.1305,-542.3761 1071.313,-524.4214 1069.2508,-510.9326"/>
 <polygon fill="#191970" stroke="#191970" points="1072.694,-510.2733 1067.9029,-500.8237 1065.7554,-511.1986 1072.694,-510.2733"/>
 </g>
-<!-- Node96&#45;&gt;Node91 -->
+<!-- Node97&#45;&gt;Node92 -->
 <g id="edge177" class="edge">
-<title>Node96&#45;&gt;Node91</title>
+<title>Node97&#45;&gt;Node92</title>
 <path fill="none" stroke="#191970" d="M2998.3295,-1007.2455C2988.4339,-998.9746 2974.0373,-986.9416 2962.4099,-977.2232"/>
 <polygon fill="#191970" stroke="#191970" points="2964.4542,-974.3703 2954.5368,-970.6427 2959.965,-979.7413 2964.4542,-974.3703"/>
 </g>
-<!-- Node96&#45;&gt;Node94 -->
+<!-- Node97&#45;&gt;Node95 -->
 <g id="edge174" class="edge">
-<title>Node96&#45;&gt;Node94</title>
+<title>Node97&#45;&gt;Node95</title>
 <path fill="none" stroke="#191970" d="M3025.1393,-1007.3845C3057.3354,-986.9356 3132.8039,-939.0029 3171.3846,-914.499"/>
 <polygon fill="#191970" stroke="#191970" points="3173.3461,-917.3995 3179.9109,-909.0836 3169.5931,-911.4905 3173.3461,-917.3995"/>
 </g>
-<!-- Node98&#45;&gt;Node16 -->
+<!-- Node99&#45;&gt;Node16 -->
 <g id="edge204" class="edge">
-<title>Node98&#45;&gt;Node16</title>
+<title>Node99&#45;&gt;Node16</title>
 <path fill="none" stroke="#191970" d="M2561.9219,-1007.4924C2584.9753,-998.5476 2620.0914,-984.5405 2650,-971 2668.107,-962.8025 2671.3877,-957.9746 2690,-951 2749.7581,-928.6067 2773.0019,-945.6057 2829,-915 2943.8047,-852.2536 3045,-845.8328 3045,-715 3045,-715 3045,-715 3045,-547 3045,-412.6989 3144,-395.8011 3144,-261.5 3144,-261.5 3144,-261.5 3144,-133 3144,-90.8832 3114.1704,-84.8001 3076,-67 2991.9104,-27.7863 2703.3196,-18.2071 2606.1933,-16.0685"/>
 <polygon fill="#191970" stroke="#191970" points="2606.1691,-12.5673 2596.0983,-15.8582 2606.0232,-19.5658 2606.1691,-12.5673"/>
 </g>
-<!-- Node98&#45;&gt;Node20 -->
+<!-- Node99&#45;&gt;Node20 -->
 <g id="edge205" class="edge">
-<title>Node98&#45;&gt;Node20</title>
+<title>Node99&#45;&gt;Node20</title>
 <path fill="none" stroke="#191970" d="M2553.3758,-1007.279C2580.8988,-991.0499 2635.4377,-959.3871 2656,-951 2755.2225,-910.5284 2813.1641,-936.2159 2874,-848 2923.8097,-775.7728 2931,-746.737 2931,-659 2931,-659 2931,-659 2931,-547 2931,-350.537 2987.6317,-197.8843 2806,-123 2741.5568,-96.4309 2248.3284,-81.4768 2113.8001,-77.8641"/>
 <polygon fill="#191970" stroke="#191970" points="2113.7647,-74.3621 2103.6754,-77.5956 2113.579,-81.3596 2113.7647,-74.3621"/>
 </g>
-<!-- Node98&#45;&gt;Node26 -->
+<!-- Node99&#45;&gt;Node26 -->
 <g id="edge201" class="edge">
-<title>Node98&#45;&gt;Node26</title>
+<title>Node99&#45;&gt;Node26</title>
 <path fill="none" stroke="#191970" d="M2537.1709,-1007.3723C2537.9045,-946.6378 2536.8554,-611.3601 2431,-369 2391.0238,-277.4729 2387.0434,-243.7511 2311,-179 2296.1899,-166.3891 2277.4531,-155.3097 2261.9441,-147.1859"/>
 <polygon fill="#191970" stroke="#191970" points="2263.2691,-143.9333 2252.7687,-142.5224 2260.0974,-150.1735 2263.2691,-143.9333"/>
 </g>
-<!-- Node98&#45;&gt;Node47 -->
+<!-- Node99&#45;&gt;Node47 -->
 <g id="edge203" class="edge">
-<title>Node98&#45;&gt;Node47</title>
+<title>Node99&#45;&gt;Node47</title>
 <path fill="none" stroke="#191970" d="M2477.8352,-1016.3398C2165.2214,-1012.4724 720,-989.7149 720,-899.5 720,-899.5 720,-899.5 720,-715 720,-700.5665 238.2859,-562.64 225,-557 176.6767,-536.4863 152.1573,-543.2513 121,-501 71.3981,-433.7367 71.5137,-377.2176 125,-313 137.6405,-297.8234 155.8269,-285.1855 171.1253,-276.2273"/>
 <polygon fill="#191970" stroke="#191970" points="173.1932,-279.0804 180.2021,-271.1352 169.7683,-272.9754 173.1932,-279.0804"/>
 </g>
-<!-- Node98&#45;&gt;Node58 -->
+<!-- Node99&#45;&gt;Node58 -->
 <g id="edge180" class="edge">
-<title>Node98&#45;&gt;Node58</title>
+<title>Node99&#45;&gt;Node58</title>
 <path fill="none" stroke="#191970" d="M2501.134,-1007.4566C2473.159,-999.4034 2433.6969,-986.6889 2401,-971 2286.3981,-916.0107 2277.7586,-864.8576 2160,-817 2108.5896,-796.1066 2046.5047,-784.2951 2000.9121,-777.888"/>
 <polygon fill="#191970" stroke="#191970" points="2001.242,-774.4008 1990.8624,-776.5254 2000.3014,-781.3373 2001.242,-774.4008"/>
 </g>
-<!-- Node98&#45;&gt;Node81 -->
+<!-- Node99&#45;&gt;Node81 -->
 <g id="edge188" class="edge">
-<title>Node98&#45;&gt;Node81</title>
+<title>Node99&#45;&gt;Node81</title>
 <path fill="none" stroke="#191970" d="M2477.6682,-1011.1499C2389.4285,-1002.4422 2218.3907,-985.536 2073,-971 2005.2166,-964.2231 904.4524,-892.645 853,-848 826.3765,-824.8989 834,-806.2487 834,-771 834,-771 834,-771 834,-715 834,-683.7875 828.7142,-673.98 810,-649 793.7974,-627.3725 779.2876,-632.8956 761,-613 714.331,-562.2275 676.8108,-488.5717 660.6286,-454.1155"/>
 <polygon fill="#191970" stroke="#191970" points="663.6741,-452.3619 656.3052,-444.7495 657.3186,-455.2957 663.6741,-452.3619"/>
 </g>
-<!-- Node98&#45;&gt;Node83 -->
+<!-- Node99&#45;&gt;Node83 -->
 <g id="edge191" class="edge">
-<title>Node98&#45;&gt;Node83</title>
+<title>Node99&#45;&gt;Node83</title>
 <path fill="none" stroke="#191970" d="M2505.4791,-1007.4893C2466.8291,-993.9438 2403.0127,-965.0216 2375,-915 2365.2628,-897.6126 2365.8138,-874.7578 2368.3464,-857.5135"/>
 <polygon fill="#191970" stroke="#191970" points="2371.8043,-858.0582 2370.1051,-847.6005 2364.912,-856.8353 2371.8043,-858.0582"/>
 </g>
-<!-- Node98&#45;&gt;Node91 -->
+<!-- Node99&#45;&gt;Node92 -->
 <g id="edge189" class="edge">
-<title>Node98&#45;&gt;Node91</title>
+<title>Node99&#45;&gt;Node92</title>
 <path fill="none" stroke="#191970" d="M2596.0359,-1008.8571C2669.7032,-998.6961 2795.4834,-981.3471 2873.6599,-970.5642"/>
 <polygon fill="#191970" stroke="#191970" points="2874.4649,-973.9863 2883.8929,-969.1527 2873.5084,-967.052 2874.4649,-973.9863"/>
 </g>
-<!-- Node98&#45;&gt;Node92 -->
+<!-- Node99&#45;&gt;Node93 -->
 <g id="edge190" class="edge">
-<title>Node98&#45;&gt;Node92</title>
+<title>Node99&#45;&gt;Node93</title>
 <path fill="none" stroke="#191970" d="M2541.3159,-1007.3952C2547.9552,-993.4925 2561.8337,-967.6115 2580,-951 2597.8964,-934.6354 2621.9531,-921.8195 2641.6314,-913.0654"/>
 <polygon fill="#191970" stroke="#191970" points="2643.0806,-916.2523 2650.8938,-909.0965 2640.3236,-909.8181 2643.0806,-916.2523"/>
 </g>
-<!-- Node98&#45;&gt;Node94 -->
+<!-- Node99&#45;&gt;Node95 -->
 <g id="edge200" class="edge">
-<title>Node98&#45;&gt;Node94</title>
+<title>Node99&#45;&gt;Node95</title>
 <path fill="none" stroke="#191970" d="M2596.159,-1013.2867C2705.1741,-1006.1082 2933.6857,-989.4035 3011,-971 3067.9879,-957.4349 3130.9049,-930.1087 3166.4272,-913.4453"/>
 <polygon fill="#191970" stroke="#191970" points="3168.1585,-916.4979 3175.6922,-909.0484 3165.1573,-910.1739 3168.1585,-916.4979"/>
 </g>
-<!-- Node98&#45;&gt;Node95 -->
+<!-- Node99&#45;&gt;Node96 -->
 <g id="edge192" class="edge">
-<title>Node98&#45;&gt;Node95</title>
+<title>Node99&#45;&gt;Node96</title>
 <path fill="none" stroke="#191970" d="M2545.9952,-1007.3562C2560.0017,-992.9201 2588.4937,-965.8092 2618,-951 2676.5749,-921.6013 2709.0004,-954.8917 2761,-915 2779.8825,-900.5142 2792.1392,-875.9393 2799.2142,-857.3694"/>
 <polygon fill="#191970" stroke="#191970" points="2802.5737,-858.3676 2802.6207,-847.7729 2795.977,-856.0259 2802.5737,-858.3676"/>
 </g>
-<!-- Node99 -->
+<!-- Node100 -->
 <g id="node48" class="node">
-<title>Node99</title>
+<title>Node100</title>
 <g id="a_node48"><a xlink:href="data__layout_8h.html" target="_top" xlink:title="Layout expression to describe the data organization of a tensor. And BijectiveLayout to mapping two d...">
 <polygon fill="#ffffff" stroke="#ff0000" points="1052.5,-649.5 1052.5,-668.5 1171.5,-668.5 1171.5,-649.5 1052.5,-649.5"/>
 <text text-anchor="middle" x="1112" y="-656.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/tir/data_layout.h</text>
 </a>
 </g>
 </g>
-<!-- Node98&#45;&gt;Node99 -->
+<!-- Node99&#45;&gt;Node100 -->
 <g id="edge181" class="edge">
-<title>Node98&#45;&gt;Node99</title>
+<title>Node99&#45;&gt;Node100</title>
 <path fill="none" stroke="#191970" d="M2477.8082,-1010.5375C2289.1067,-988.5455 1686.9829,-907.5992 1224,-725 1190.5646,-711.8132 1155.3868,-689.4925 1133.5728,-674.5047"/>
 <polygon fill="#191970" stroke="#191970" points="1135.2889,-671.4342 1125.0863,-668.5783 1131.2811,-677.1733 1135.2889,-671.4342"/>
 </g>
-<!-- Node100 -->
+<!-- Node101 -->
 <g id="node49" class="node">
-<title>Node100</title>
+<title>Node101</title>
 <g id="a_node49"><a xlink:href="strided__slice_8h.html" target="_top" xlink:title="Utility functions for strided_slice op. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="2383.5,-884.5 2383.5,-914.5 2508.5,-914.5 2508.5,-884.5 2383.5,-884.5"/>
 <text text-anchor="start" x="2391.5" y="-902.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/detail/strided</text>
@@ -1653,87 +1653,87 @@
 </a>
 </g>
 </g>
-<!-- Node98&#45;&gt;Node100 -->
+<!-- Node99&#45;&gt;Node101 -->
 <g id="edge193" class="edge">
-<title>Node98&#45;&gt;Node100</title>
+<title>Node99&#45;&gt;Node101</title>
 <path fill="none" stroke="#191970" d="M2529.5531,-1007.3845C2515.3661,-989.0661 2484.098,-948.6925 2464.0941,-922.8633"/>
 <polygon fill="#191970" stroke="#191970" points="2466.6733,-920.4774 2457.783,-914.7143 2461.139,-924.7636 2466.6733,-920.4774"/>
 </g>
-<!-- Node98&#45;&gt;Node102 -->
+<!-- Node99&#45;&gt;Node103 -->
 <g id="edge202" class="edge">
-<title>Node98&#45;&gt;Node102</title>
+<title>Node99&#45;&gt;Node103</title>
 <path fill="none" stroke="#191970" d="M2569.3184,-1007.3733C2602.4749,-997.4968 2653.963,-982.16 2688.5512,-971.8571"/>
 <polygon fill="#191970" stroke="#191970" points="2689.6815,-975.1725 2698.2662,-968.9633 2687.6831,-968.4638 2689.6815,-975.1725"/>
 </g>
-<!-- Node99&#45;&gt;Node16 -->
+<!-- Node100&#45;&gt;Node16 -->
 <g id="edge185" class="edge">
-<title>Node99&#45;&gt;Node16</title>
+<title>Node100&#45;&gt;Node16</title>
 <path fill="none" stroke="#191970" d="M1171.5168,-657.4078C1321.2011,-653.0532 1710.3837,-639.4672 1837,-613 1969.4078,-585.3222 2000.2095,-563.8339 2120,-501 2328.6544,-391.5541 2372.995,-333.9704 2511,-143 2528.6173,-118.6213 2535.4779,-113.8671 2549,-87 2557.4798,-70.1514 2564.3056,-49.7368 2568.6687,-35.0417"/>
 <polygon fill="#191970" stroke="#191970" points="2572.1349,-35.6494 2571.5118,-25.0729 2565.4033,-33.7295 2572.1349,-35.6494"/>
 </g>
-<!-- Node99&#45;&gt;Node18 -->
+<!-- Node100&#45;&gt;Node18 -->
 <g id="edge186" class="edge">
-<title>Node99&#45;&gt;Node18</title>
+<title>Node100&#45;&gt;Node18</title>
 <path fill="none" stroke="#191970" d="M1052.3468,-657.0621C867.3387,-650.3825 305.6393,-624.88 138,-557 62.1199,-526.2748 0,-516.8647 0,-435 0,-435 0,-435 0,-133 0,-101.7875 .679,-87.7449 24,-67 51.2406,-42.7685 155.5223,-26.4056 209.3789,-19.3838"/>
 <polygon fill="#191970" stroke="#191970" points="209.9615,-22.838 219.4401,-18.1045 209.0785,-15.8939 209.9615,-22.838"/>
 </g>
-<!-- Node99&#45;&gt;Node20 -->
+<!-- Node100&#45;&gt;Node20 -->
 <g id="edge187" class="edge">
-<title>Node99&#45;&gt;Node20</title>
+<title>Node100&#45;&gt;Node20</title>
 <path fill="none" stroke="#191970" d="M1171.7495,-658.2643C1342.849,-655.1728 1830.9094,-639.1863 1964,-557 2079.5718,-485.632 2042,-397.3316 2042,-261.5 2042,-261.5 2042,-261.5 2042,-194.5 2042,-158.2454 2058.5805,-118.6181 2069.837,-95.8929"/>
 <polygon fill="#191970" stroke="#191970" points="2073.0965,-97.2076 2074.5555,-86.7137 2066.8708,-94.0073 2073.0965,-97.2076"/>
 </g>
-<!-- Node99&#45;&gt;Node26 -->
+<!-- Node100&#45;&gt;Node26 -->
 <g id="edge184" class="edge">
-<title>Node99&#45;&gt;Node26</title>
+<title>Node100&#45;&gt;Node26</title>
 <path fill="none" stroke="#191970" d="M1171.5868,-650.9956C1229.2584,-642.8611 1319.0131,-629.178 1396,-613 1493.0541,-592.6051 1519.1995,-591.979 1612,-557 1834.5401,-473.1185 1872.2032,-411.9956 2068,-277 2110.2378,-247.8784 2123.6368,-244.0642 2162,-210 2182.9733,-191.377 2204.2526,-167.0229 2217.7533,-150.738"/>
 <polygon fill="#191970" stroke="#191970" points="2220.7803,-152.5653 2224.402,-142.6087 2215.3618,-148.1336 2220.7803,-152.5653"/>
 </g>
-<!-- Node99&#45;&gt;Node52 -->
+<!-- Node100&#45;&gt;Node52 -->
 <g id="edge182" class="edge">
-<title>Node99&#45;&gt;Node52</title>
+<title>Node100&#45;&gt;Node52</title>
 <path fill="none" stroke="#191970" d="M1094.7144,-649.2802C1069.2933,-633.7422 1023.3867,-600.5687 1010,-557 1007.3893,-548.5031 1006.1665,-545.0198 1010,-537 1016.1448,-524.1449 1027.8299,-513.7306 1039.0285,-506.0832"/>
 <polygon fill="#191970" stroke="#191970" points="1041.2041,-508.8488 1047.79,-500.5495 1037.4661,-502.9304 1041.2041,-508.8488"/>
 </g>
-<!-- Node99&#45;&gt;Node63 -->
+<!-- Node100&#45;&gt;Node63 -->
 <g id="edge183" class="edge">
-<title>Node99&#45;&gt;Node63</title>
+<title>Node100&#45;&gt;Node63</title>
 <path fill="none" stroke="#191970" d="M1059.9152,-649.4369C1002.2537,-638.8499 910.1136,-621.9323 854.4037,-611.7036"/>
 <polygon fill="#191970" stroke="#191970" points="854.9096,-608.2381 844.4419,-609.8746 853.6454,-615.123 854.9096,-608.2381"/>
 </g>
-<!-- Node100&#45;&gt;Node16 -->
+<!-- Node101&#45;&gt;Node16 -->
 <g id="edge197" class="edge">
-<title>Node100&#45;&gt;Node16</title>
+<title>Node101&#45;&gt;Node16</title>
 <path fill="none" stroke="#191970" d="M2490.8808,-884.4591C2514.2536,-875.6033 2542.7307,-863.1596 2566,-848 2602.625,-824.1394 2608.771,-813.502 2638,-781 2724.0978,-685.2611 3030,-323.4188 3030,-194.5 3030,-194.5 3030,-194.5 3030,-133 3030,-45.4549 2710.8427,-22.152 2606.5447,-16.8741"/>
 <polygon fill="#191970" stroke="#191970" points="2606.4597,-13.366 2596.3025,-16.3794 2606.122,-20.3578 2606.4597,-13.366"/>
 </g>
-<!-- Node100&#45;&gt;Node20 -->
+<!-- Node101&#45;&gt;Node20 -->
 <g id="edge198" class="edge">
-<title>Node100&#45;&gt;Node20</title>
+<title>Node101&#45;&gt;Node20</title>
 <path fill="none" stroke="#191970" d="M2474.4854,-884.4181C2541.3985,-846.6982 2703,-741.2863 2703,-603 2703,-603 2703,-603 2703,-435 2703,-160.7836 2243.65,-93.5313 2113.8129,-79.9992"/>
 <polygon fill="#191970" stroke="#191970" points="2113.9608,-76.4966 2103.6616,-78.9819 2113.2627,-83.4617 2113.9608,-76.4966"/>
 </g>
-<!-- Node100&#45;&gt;Node26 -->
+<!-- Node101&#45;&gt;Node26 -->
 <g id="edge195" class="edge">
-<title>Node100&#45;&gt;Node26</title>
+<title>Node101&#45;&gt;Node26</title>
 <path fill="none" stroke="#191970" d="M2451.4971,-884.2978C2464.9247,-845.3134 2497.4816,-738.5501 2486,-649 2469.645,-521.4399 2466.1137,-486.1239 2413,-369 2370.8456,-276.0431 2351.4175,-255.6345 2284,-179 2274.546,-168.2535 2262.6738,-157.5789 2252.6611,-149.2219"/>
 <polygon fill="#191970" stroke="#191970" points="2254.7435,-146.4039 2244.7817,-142.7965 2250.3196,-151.8288 2254.7435,-146.4039"/>
 </g>
-<!-- Node100&#45;&gt;Node47 -->
+<!-- Node101&#45;&gt;Node47 -->
 <g id="edge196" class="edge">
-<title>Node100&#45;&gt;Node47</title>
+<title>Node101&#45;&gt;Node47</title>
 <path fill="none" stroke="#191970" d="M2383.1823,-898.9948C2096.3223,-896.4938 921.8086,-884.0798 853,-848 815.2909,-828.2272 796,-813.5786 796,-771 796,-771 796,-771 796,-715 796,-542.205 748.3655,-714.6289 300,-557 206.7332,-524.2108 114,-533.8627 114,-435 114,-435 114,-435 114,-379 114,-335.62 152.6421,-297.6009 178.1213,-277.2475"/>
 <polygon fill="#191970" stroke="#191970" points="180.3948,-279.9143 186.1826,-271.0401 176.124,-274.3681 180.3948,-279.9143"/>
 </g>
-<!-- Node100&#45;&gt;Node52 -->
+<!-- Node101&#45;&gt;Node52 -->
 <g id="edge194" class="edge">
-<title>Node100&#45;&gt;Node52</title>
+<title>Node101&#45;&gt;Node52</title>
 <path fill="none" stroke="#191970" d="M2383.4198,-898.3271C2137.1438,-892.3931 1238.6642,-857.927 1043,-669 998.9927,-626.5079 976.2921,-591.5389 1004,-537 1010.861,-523.4951 1023.7532,-513.0099 1036.0966,-505.4839"/>
 <polygon fill="#191970" stroke="#191970" points="1037.9003,-508.4849 1044.8934,-500.5258 1034.4633,-502.3868 1037.9003,-508.4849"/>
 </g>
-<!-- Node100&#45;&gt;Node83 -->
+<!-- Node101&#45;&gt;Node83 -->
 <g id="edge199" class="edge">
-<title>Node100&#45;&gt;Node83</title>
+<title>Node101&#45;&gt;Node83</title>
 <path fill="none" stroke="#191970" d="M2429.6621,-884.2967C2420.2723,-875.559 2408.3082,-864.4257 2397.8802,-854.7218"/>
 <polygon fill="#191970" stroke="#191970" points="2399.9783,-851.8933 2390.2733,-847.6432 2395.2097,-857.0178 2399.9783,-851.8933"/>
 </g>
diff --git a/docs/reference/api/doxygen/nn_2softmax_8h__incl.svg b/docs/reference/api/doxygen/nn_2softmax_8h__incl.svg
index 822d8b1c95..2209935714 100644
--- a/docs/reference/api/doxygen/nn_2softmax_8h__incl.svg
+++ b/docs/reference/api/doxygen/nn_2softmax_8h__incl.svg
@@ -70,18 +70,18 @@
 <path fill="none" stroke="#191970" d="M702.7538,-1126.8972C785.4694,-1115.6178 940.5104,-1094.4759 1029.851,-1082.293"/>
 <polygon fill="#191970" stroke="#191970" points="1030.4444,-1085.7446 1039.8798,-1080.9255 1029.4985,-1078.8088 1030.4444,-1085.7446"/>
 </g>
-<!-- Node92 -->
+<!-- Node93 -->
 <g id="node41" class="node">
-<title>Node92</title>
+<title>Node93</title>
 <g id="a_node41"><a xlink:href="tags_8h.html" target="_top" xlink:title="External function interface to rocBLAS libraries. ">
 <polygon fill="#ffffff" stroke="#000000" points="106.5,-890 106.5,-909 197.5,-909 197.5,-890 106.5,-890"/>
 <text text-anchor="middle" x="152" y="-897" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/tags.h</text>
 </a>
 </g>
 </g>
-<!-- Node0&#45;&gt;Node92 -->
+<!-- Node0&#45;&gt;Node93 -->
 <g id="edge203" class="edge">
-<title>Node0&#45;&gt;Node92</title>
+<title>Node0&#45;&gt;Node93</title>
 <path fill="none" stroke="#191970" d="M591.1733,-1126.3341C511.0796,-1113.159 360.2966,-1082.857 243,-1027 204.795,-1008.8067 188.7522,-1006.6785 166,-971 156.1364,-955.5325 152.9554,-934.6423 152.0511,-919.438"/>
 <polygon fill="#191970" stroke="#191970" points="155.539,-918.9901 151.7303,-909.1035 148.5423,-919.2073 155.539,-918.9901"/>
 </g>
@@ -1182,15 +1182,15 @@
 <path fill="none" stroke="#191970" d="M1119.1428,-1063.4068C1161.9665,-1042.8195 1258.954,-990.0994 1307,-915 1317.8877,-897.9817 1322.2695,-875.3779 1323.9947,-858.1341"/>
 <polygon fill="#191970" stroke="#191970" points="1327.5205,-857.9251 1324.7865,-847.6891 1320.5405,-857.3959 1327.5205,-857.9251"/>
 </g>
-<!-- Node81&#45;&gt;Node92 -->
+<!-- Node81&#45;&gt;Node93 -->
 <g id="edge167" class="edge">
-<title>Node81&#45;&gt;Node92</title>
+<title>Node81&#45;&gt;Node93</title>
 <path fill="none" stroke="#191970" d="M1039.7709,-1072.1337C922.947,-1069.7085 651.904,-1060.6184 427,-1027 318.2436,-1010.7432 271.6701,-1039.9411 186,-971 169.8169,-957.977 160.8918,-935.4126 156.2952,-919.1104"/>
 <polygon fill="#191970" stroke="#191970" points="159.5926,-917.8694 153.7685,-909.019 152.8022,-919.5697 159.5926,-917.8694"/>
 </g>
-<!-- Node93 -->
+<!-- Node94 -->
 <g id="node42" class="node">
-<title>Node93</title>
+<title>Node94</title>
 <g id="a_node42"><a xlink:href="ravel__unravel_8h.html" target="_top" xlink:title="Index ravel and unraval operations. ">
 <polygon fill="#ffffff" stroke="#000000" points="2677,-817.5 2677,-847.5 2793,-847.5 2793,-817.5 2677,-817.5"/>
 <text text-anchor="start" x="2685" y="-835.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/detail/ravel</text>
@@ -1198,51 +1198,51 @@
 </a>
 </g>
 </g>
-<!-- Node81&#45;&gt;Node93 -->
+<!-- Node81&#45;&gt;Node94 -->
 <g id="edge155" class="edge">
-<title>Node81&#45;&gt;Node93</title>
+<title>Node81&#45;&gt;Node94</title>
 <path fill="none" stroke="#191970" d="M1156.3592,-1072.1005C1401.0331,-1068.1998 2345.1946,-1051.8186 2642,-1027 2823.5193,-1011.8215 2928.3211,-1108.3194 3048,-971 3083.9891,-929.7061 2905.8113,-876.0584 2803.2703,-849.2665"/>
 <polygon fill="#191970" stroke="#191970" points="2803.847,-845.8005 2793.2888,-846.6808 2802.0915,-852.5768 2803.847,-845.8005"/>
 </g>
-<!-- Node94 -->
+<!-- Node95 -->
 <g id="node43" class="node">
-<title>Node94</title>
+<title>Node95</title>
 <g id="a_node43"><a xlink:href="elemwise_8h.html" target="_top" xlink:title="Elementwise op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="436.5,-1007.5 436.5,-1026.5 553.5,-1026.5 553.5,-1007.5 436.5,-1007.5"/>
 <text text-anchor="middle" x="495" y="-1014.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/elemwise.h</text>
 </a>
 </g>
 </g>
-<!-- Node81&#45;&gt;Node94 -->
+<!-- Node81&#45;&gt;Node95 -->
 <g id="edge158" class="edge">
-<title>Node81&#45;&gt;Node94</title>
+<title>Node81&#45;&gt;Node95</title>
 <path fill="none" stroke="#191970" d="M1039.9081,-1067.6051C928.1944,-1057.2303 683.8927,-1034.5423 564.0623,-1023.4138"/>
 <polygon fill="#191970" stroke="#191970" points="564.1891,-1019.9105 553.9083,-1022.4708 563.5418,-1026.8806 564.1891,-1019.9105"/>
 </g>
-<!-- Node96 -->
+<!-- Node97 -->
 <g id="node45" class="node">
-<title>Node96</title>
+<title>Node97</title>
 <g id="a_node45"><a xlink:href="topi_2transform_8h.html" target="_top" xlink:title="Transform op constructors. ">
 <polygon fill="#ffffff" stroke="#000000" points="2392,-1007.5 2392,-1026.5 2510,-1026.5 2510,-1007.5 2392,-1007.5"/>
 <text text-anchor="middle" x="2451" y="-1014.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/transform.h</text>
 </a>
 </g>
 </g>
-<!-- Node81&#45;&gt;Node96 -->
+<!-- Node81&#45;&gt;Node97 -->
 <g id="edge168" class="edge">
-<title>Node81&#45;&gt;Node96</title>
+<title>Node81&#45;&gt;Node97</title>
 <path fill="none" stroke="#191970" d="M1156.1367,-1070.5938C1374.5591,-1061.5534 2145.6459,-1029.6385 2381.7663,-1019.8655"/>
 <polygon fill="#191970" stroke="#191970" points="2382.092,-1023.3552 2391.9387,-1019.4445 2381.8025,-1016.3611 2382.092,-1023.3552"/>
 </g>
-<!-- Node100 -->
+<!-- Node101 -->
 <g id="node49" class="node">
-<title>Node100</title>
+<title>Node101</title>
 <polygon fill="#ffffff" stroke="#bfbfbf" points="2985.5,-951.5 2985.5,-970.5 3038.5,-970.5 3038.5,-951.5 2985.5,-951.5"/>
 <text text-anchor="middle" x="3012" y="-958.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">iterator</text>
 </g>
-<!-- Node81&#45;&gt;Node100 -->
+<!-- Node81&#45;&gt;Node101 -->
 <g id="edge200" class="edge">
-<title>Node81&#45;&gt;Node100</title>
+<title>Node81&#45;&gt;Node101</title>
 <path fill="none" stroke="#191970" d="M1156.3515,-1071.8202C1388.5613,-1067.015 2247.5389,-1048.1659 2519,-1027 2720.8123,-1011.2646 2771.891,-1007.4887 2971,-971 2972.4413,-970.7359 2973.9077,-970.452 2975.3864,-970.153"/>
 <polygon fill="#191970" stroke="#191970" points="2976.1671,-973.5651 2985.1895,-968.0111 2974.6728,-966.7264 2976.1671,-973.5651"/>
 </g>
@@ -1279,9 +1279,9 @@
 <path fill="none" stroke="#191970" d="M907.0954,-957.7037C1017.9331,-951.0849 1248.6353,-935.0786 1279,-915 1299.1752,-901.6592 1311.3721,-876.3058 1318.0869,-857.2101"/>
 <polygon fill="#191970" stroke="#191970" points="1321.4392,-858.2186 1321.1907,-847.6267 1314.7798,-856.0618 1321.4392,-858.2186"/>
 </g>
-<!-- Node82&#45;&gt;Node92 -->
+<!-- Node82&#45;&gt;Node93 -->
 <g id="edge150" class="edge">
-<title>Node82&#45;&gt;Node92</title>
+<title>Node82&#45;&gt;Node93</title>
 <path fill="none" stroke="#191970" d="M788.9597,-955.7831C656.4948,-944.0782 337.1126,-915.8569 207.7786,-904.4287"/>
 <polygon fill="#191970" stroke="#191970" points="207.966,-900.9317 197.6967,-903.5379 207.3498,-907.9046 207.966,-900.9317"/>
 </g>
@@ -1372,165 +1372,165 @@
 <path fill="none" stroke="#191970" d="M1141.1464,-537.3733C1172.5109,-527.9807 1220.3665,-513.6496 1254.5716,-503.4064"/>
 <polygon fill="#191970" stroke="#191970" points="1255.6975,-506.7229 1264.273,-500.5011 1253.6893,-500.0171 1255.6975,-506.7229"/>
 </g>
-<!-- Node92&#45;&gt;Node17 -->
+<!-- Node93&#45;&gt;Node17 -->
 <g id="edge151" class="edge">
-<title>Node92&#45;&gt;Node17</title>
+<title>Node93&#45;&gt;Node17</title>
 <path fill="none" stroke="#191970" d="M141.1111,-889.6981C120.0964,-869.644 76,-821.5417 76,-771 76,-771 76,-771 76,-138.5 76,-104.9798 74.1843,-88.3812 100,-67 126.2696,-45.243 356.5842,-24.9026 442.6975,-17.9916"/>
 <polygon fill="#191970" stroke="#191970" points="443.2364,-21.4599 452.9281,-17.1795 442.6825,-14.4819 443.2364,-21.4599"/>
 </g>
-<!-- Node93&#45;&gt;Node1 -->
+<!-- Node94&#45;&gt;Node1 -->
 <g id="edge156" class="edge">
-<title>Node93&#45;&gt;Node1</title>
+<title>Node94&#45;&gt;Node1</title>
 <path fill="none" stroke="#191970" d="M2676.9321,-829.7781C2461.8923,-819.6981 1711.7026,-784.5329 1487.0473,-774.0022"/>
 <polygon fill="#191970" stroke="#191970" points="1486.9038,-770.4917 1476.7509,-773.5196 1486.576,-777.4841 1486.9038,-770.4917"/>
 </g>
-<!-- Node93&#45;&gt;Node21 -->
+<!-- Node94&#45;&gt;Node21 -->
 <g id="edge157" class="edge">
-<title>Node93&#45;&gt;Node21</title>
+<title>Node94&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M2793.1336,-826.2347C2857.7129,-815.6766 2953,-787.7685 2953,-715 2953,-715 2953,-715 2953,-603 2953,-413.8576 3105,-394.6424 3105,-205.5 3105,-205.5 3105,-205.5 3105,-138.5 3105,-104.9798 3106.016,-89.3115 3081,-67 3050.532,-39.8259 2931.8848,-24.6125 2872.8077,-18.5798"/>
 <polygon fill="#191970" stroke="#191970" points="2873.1393,-15.0956 2862.8427,-17.5919 2872.4486,-22.0615 2873.1393,-15.0956"/>
 </g>
-<!-- Node94&#45;&gt;Node17 -->
+<!-- Node95&#45;&gt;Node17 -->
 <g id="edge165" class="edge">
-<title>Node94&#45;&gt;Node17</title>
+<title>Node95&#45;&gt;Node17</title>
 <path fill="none" stroke="#191970" d="M468.8123,-1007.4096C382.7336,-974.018 114,-854.1442 114,-659 114,-659 114,-659 114,-138.5 114,-104.9798 112.3472,-88.5763 138,-67 183.762,-28.5102 367.2362,-18.6961 442.5183,-16.2693"/>
 <polygon fill="#191970" stroke="#191970" points="442.92,-19.7589 452.8112,-15.9623 442.7113,-12.762 442.92,-19.7589"/>
 </g>
-<!-- Node94&#45;&gt;Node27 -->
+<!-- Node95&#45;&gt;Node27 -->
 <g id="edge164" class="edge">
-<title>Node94&#45;&gt;Node27</title>
+<title>Node95&#45;&gt;Node27</title>
 <path fill="none" stroke="#191970" d="M553.6486,-1015.6579C847.7946,-1008.3788 2153.379,-969.3756 2282,-848 2472.4294,-668.2977 2363.0258,-530.8967 2427,-277 2433.2683,-252.1227 2436.7227,-246.2958 2441,-221 2448.4669,-176.841 2451.8403,-124.2893 2453.1991,-96.4929"/>
 <polygon fill="#191970" stroke="#191970" points="2456.6956,-96.6482 2453.6515,-86.5002 2449.7028,-96.3316 2456.6956,-96.6482"/>
 </g>
-<!-- Node94&#45;&gt;Node53 -->
+<!-- Node95&#45;&gt;Node53 -->
 <g id="edge162" class="edge">
-<title>Node94&#45;&gt;Node53</title>
+<title>Node95&#45;&gt;Node53</title>
 <path fill="none" stroke="#191970" d="M495,-1007.2511C495,-987.1182 495,-939.4826 495,-899.5 495,-899.5 495,-899.5 495,-832.5 495,-660.4173 1019.8038,-529.194 1185,-481 1254.7571,-460.6492 1337.9307,-447.7476 1390.362,-440.9461"/>
 <polygon fill="#191970" stroke="#191970" points="1390.9465,-444.4 1400.4253,-439.6667 1390.0637,-437.4559 1390.9465,-444.4"/>
 </g>
-<!-- Node94&#45;&gt;Node82 -->
+<!-- Node95&#45;&gt;Node82 -->
 <g id="edge166" class="edge">
-<title>Node94&#45;&gt;Node82</title>
+<title>Node95&#45;&gt;Node82</title>
 <path fill="none" stroke="#191970" d="M553.687,-1007.6899C615.563,-997.8739 712.9908,-982.4179 778.6271,-972.0053"/>
 <polygon fill="#191970" stroke="#191970" points="779.55,-975.4028 788.8781,-970.3791 778.4532,-968.4892 779.55,-975.4028"/>
 </g>
-<!-- Node94&#45;&gt;Node92 -->
+<!-- Node95&#45;&gt;Node93 -->
 <g id="edge163" class="edge">
-<title>Node94&#45;&gt;Node92</title>
+<title>Node95&#45;&gt;Node93</title>
 <path fill="none" stroke="#191970" d="M436.3892,-1012.1172C365.5376,-1005.4088 251.1707,-991.7601 214,-971 191.5766,-958.4764 173.4203,-934.5875 162.6807,-917.8828"/>
 <polygon fill="#191970" stroke="#191970" points="165.453,-915.7087 157.2383,-909.0177 159.4874,-919.371 165.453,-915.7087"/>
 </g>
-<!-- Node95 -->
+<!-- Node96 -->
 <g id="node44" class="node">
-<title>Node95</title>
+<title>Node96</title>
 <g id="a_node44"><a xlink:href="builtin_8h.html" target="_top" xlink:title="TIR builtin intrinsics. ">
 <polygon fill="#ffffff" stroke="#000000" points="644,-481.5 644,-500.5 736,-500.5 736,-481.5 644,-481.5"/>
 <text text-anchor="middle" x="690" y="-488.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/tir/builtin.h</text>
 </a>
 </g>
 </g>
-<!-- Node94&#45;&gt;Node95 -->
+<!-- Node95&#45;&gt;Node96 -->
 <g id="edge159" class="edge">
-<title>Node94&#45;&gt;Node95</title>
+<title>Node95&#45;&gt;Node96</title>
 <path fill="none" stroke="#191970" d="M489.5555,-1007.2863C479.0482,-987.599 457,-941.3072 457,-899.5 457,-899.5 457,-899.5 457,-603 457,-562.0398 575.4441,-522.5529 643.4352,-503.2603"/>
 <polygon fill="#191970" stroke="#191970" points="644.3928,-506.6269 653.0805,-500.5626 642.5073,-499.8856 644.3928,-506.6269"/>
 </g>
-<!-- Node95&#45;&gt;Node53 -->
+<!-- Node96&#45;&gt;Node53 -->
 <g id="edge161" class="edge">
-<title>Node95&#45;&gt;Node53</title>
+<title>Node96&#45;&gt;Node53</title>
 <path fill="none" stroke="#191970" d="M736.1908,-487.5603C868.9978,-477.6704 1251.249,-449.2049 1390.483,-438.8364"/>
 <polygon fill="#191970" stroke="#191970" points="1390.7593,-442.3256 1400.4717,-438.0925 1390.2394,-435.3449 1390.7593,-442.3256"/>
 </g>
-<!-- Node95&#45;&gt;Node63 -->
+<!-- Node96&#45;&gt;Node63 -->
 <g id="edge160" class="edge">
-<title>Node95&#45;&gt;Node63</title>
+<title>Node96&#45;&gt;Node63</title>
 <path fill="none" stroke="#191970" d="M696.8044,-481.3928C707.0428,-467.487 727.6099,-441.6023 750,-425 796.63,-390.4237 813.4986,-388.9868 868,-369 891.2984,-360.456 952.8515,-344.671 995.8649,-334.004"/>
 <polygon fill="#191970" stroke="#191970" points="996.9547,-337.34 1005.8232,-331.5435 995.2756,-330.5444 996.9547,-337.34"/>
 </g>
-<!-- Node96&#45;&gt;Node1 -->
+<!-- Node97&#45;&gt;Node1 -->
 <g id="edge169" class="edge">
-<title>Node96&#45;&gt;Node1</title>
+<title>Node97&#45;&gt;Node1</title>
 <path fill="none" stroke="#191970" d="M2411.0167,-1007.432C2250.092,-968.9228 1649.933,-825.305 1472.6269,-782.8757"/>
 <polygon fill="#191970" stroke="#191970" points="1473.3253,-779.4441 1462.7853,-780.5206 1471.6961,-786.2519 1473.3253,-779.4441"/>
 </g>
-<!-- Node96&#45;&gt;Node17 -->
+<!-- Node97&#45;&gt;Node17 -->
 <g id="edge196" class="edge">
-<title>Node96&#45;&gt;Node17</title>
+<title>Node97&#45;&gt;Node17</title>
 <path fill="none" stroke="#191970" d="M2391.8322,-1016.7106C2113.2989,-1015.1676 939.5974,-1006.5898 780,-971 627.9062,-937.0835 152,-814.8296 152,-659 152,-659 152,-659 152,-138.5 152,-97.0158 176.2702,-88.0789 212,-67 250.7483,-44.1403 381.039,-26.4288 442.453,-19.1327"/>
 <polygon fill="#191970" stroke="#191970" points="443.2088,-22.5683 452.7355,-17.9322 442.3971,-15.6155 443.2088,-22.5683"/>
 </g>
-<!-- Node96&#45;&gt;Node21 -->
+<!-- Node97&#45;&gt;Node21 -->
 <g id="edge198" class="edge">
-<title>Node96&#45;&gt;Node21</title>
+<title>Node97&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M2510.2589,-1007.6131C2511.8552,-1007.4013 2513.4379,-1007.1964 2515,-1007 2682.0024,-985.9984 2731.1855,-1020.6947 2892,-971 2952.8659,-952.1913 3143,-896.2058 3143,-832.5 3143,-832.5 3143,-832.5 3143,-491 3143,-414.4471 3181,-399.5529 3181,-323 3181,-323 3181,-323 3181,-138.5 3181,-102.0874 3170.7446,-88.0032 3141,-67 3098.2121,-36.7868 2942.4946,-22.5791 2872.9489,-17.633"/>
 <polygon fill="#191970" stroke="#191970" points="2872.7745,-14.1128 2862.5577,-16.9177 2872.2936,-21.0963 2872.7745,-14.1128"/>
 </g>
-<!-- Node96&#45;&gt;Node27 -->
+<!-- Node97&#45;&gt;Node27 -->
 <g id="edge193" class="edge">
-<title>Node96&#45;&gt;Node27</title>
+<title>Node97&#45;&gt;Node27</title>
 <path fill="none" stroke="#191970" d="M2510.2612,-1007.6315C2511.8569,-1007.4143 2513.4388,-1007.2033 2515,-1007 2591.4376,-997.0468 2790.6197,-1004.5879 2860,-971 2982.711,-911.594 3029,-851.3344 3029,-715 3029,-715 3029,-715 3029,-547 3029,-441.8703 2953,-428.1297 2953,-323 2953,-323 2953,-323 2953,-267 2953,-204.7492 2888.5228,-223.4143 2836,-190 2788.0217,-159.4769 2780.2387,-142.9806 2727,-123 2648.7919,-93.6483 2550.9473,-82.9487 2496.1754,-79.1037"/>
 <polygon fill="#191970" stroke="#191970" points="2496.3474,-75.6075 2486.1389,-78.4426 2495.8872,-82.5924 2496.3474,-75.6075"/>
 </g>
-<!-- Node96&#45;&gt;Node48 -->
+<!-- Node97&#45;&gt;Node48 -->
 <g id="edge195" class="edge">
-<title>Node96&#45;&gt;Node48</title>
+<title>Node97&#45;&gt;Node48</title>
 <path fill="none" stroke="#191970" d="M2510.2715,-1007.705C2511.8641,-1007.4662 2513.4426,-1007.2307 2515,-1007 2572.046,-998.5484 2729.2338,-1009.6877 2772,-971 2823.7114,-924.2202 2818.3424,-884.789 2802,-817 2735.0263,-539.1895 2464.8106,-287.2178 2390.1786,-221.8787"/>
 <polygon fill="#191970" stroke="#191970" points="2392.2903,-219.0771 2382.4441,-215.1651 2387.7017,-224.3634 2392.2903,-219.0771"/>
 </g>
-<!-- Node96&#45;&gt;Node80 -->
+<!-- Node97&#45;&gt;Node80 -->
 <g id="edge177" class="edge">
-<title>Node96&#45;&gt;Node80</title>
+<title>Node97&#45;&gt;Node80</title>
 <path fill="none" stroke="#191970" d="M2391.8913,-1014.7422C2148.4739,-1005.1411 1230.8198,-965.8572 1112,-915 919.4762,-832.5961 663.8981,-774.7102 768,-593 836.3356,-473.7199 999.8165,-414.4722 1082.817,-391.2295"/>
 <polygon fill="#191970" stroke="#191970" points="1083.8067,-394.5874 1092.5282,-388.5719 1081.959,-387.8356 1083.8067,-394.5874"/>
 </g>
-<!-- Node96&#45;&gt;Node82 -->
+<!-- Node97&#45;&gt;Node82 -->
 <g id="edge178" class="edge">
-<title>Node96&#45;&gt;Node82</title>
+<title>Node97&#45;&gt;Node82</title>
 <path fill="none" stroke="#191970" d="M2391.9366,-1016.0634C2147.5602,-1012.0587 1214.6281,-995.4561 921,-971 919.8846,-970.9071 918.7601,-970.8089 917.6285,-970.7058"/>
 <polygon fill="#191970" stroke="#191970" points="917.6061,-967.1857 907.3042,-969.6595 916.9003,-974.15 917.6061,-967.1857"/>
 </g>
-<!-- Node96&#45;&gt;Node83 -->
+<!-- Node97&#45;&gt;Node83 -->
 <g id="edge179" class="edge">
-<title>Node96&#45;&gt;Node83</title>
+<title>Node97&#45;&gt;Node83</title>
 <path fill="none" stroke="#191970" d="M2391.7332,-1011.4511C2188.4029,-992.4142 1518.6163,-929.7051 1280.741,-907.4339"/>
 <polygon fill="#191970" stroke="#191970" points="1281.0435,-903.947 1270.7607,-906.4995 1280.3909,-910.9165 1281.0435,-903.947"/>
 </g>
-<!-- Node96&#45;&gt;Node84 -->
+<!-- Node97&#45;&gt;Node84 -->
 <g id="edge180" class="edge">
-<title>Node96&#45;&gt;Node84</title>
+<title>Node97&#45;&gt;Node84</title>
 <path fill="none" stroke="#191970" d="M2391.9768,-1008.0632C2232.6611,-983.8094 1780.6354,-914.1242 1406,-848 1404.731,-847.776 1403.4512,-847.5486 1402.1627,-847.3184"/>
 <polygon fill="#191970" stroke="#191970" points="1402.6201,-843.8443 1392.1562,-845.5048 1401.3718,-850.7321 1402.6201,-843.8443"/>
 </g>
-<!-- Node96&#45;&gt;Node92 -->
+<!-- Node97&#45;&gt;Node93 -->
 <g id="edge192" class="edge">
-<title>Node96&#45;&gt;Node92</title>
+<title>Node97&#45;&gt;Node93</title>
 <path fill="none" stroke="#191970" d="M2391.9689,-1016.1416C2130.73,-1012.2295 1077.0108,-995.2304 747,-971 547.9202,-956.3829 313.0407,-923.515 207.8426,-907.9629"/>
 <polygon fill="#191970" stroke="#191970" points="208.1764,-904.4742 197.7709,-906.4682 207.1487,-911.3984 208.1764,-904.4742"/>
 </g>
-<!-- Node96&#45;&gt;Node93 -->
+<!-- Node97&#45;&gt;Node94 -->
 <g id="edge181" class="edge">
-<title>Node96&#45;&gt;Node93</title>
+<title>Node97&#45;&gt;Node94</title>
 <path fill="none" stroke="#191970" d="M2500.9928,-1007.4445C2544.8354,-998.4099 2604.6661,-984.2937 2625,-971 2669.9528,-941.6112 2704.8276,-887.7291 2722.4116,-856.5414"/>
 <polygon fill="#191970" stroke="#191970" points="2725.5531,-858.0922 2727.3056,-847.6433 2719.4196,-854.7187 2725.5531,-858.0922"/>
 </g>
-<!-- Node97 -->
+<!-- Node98 -->
 <g id="node46" class="node">
-<title>Node97</title>
+<title>Node98</title>
 <g id="a_node46"><a xlink:href="data__layout_8h.html" target="_top" xlink:title="Layout expression to describe the data organization of a tensor. And BijectiveLayout to mapping two d...">
 <polygon fill="#ffffff" stroke="#ff0000" points="2418.5,-593.5 2418.5,-612.5 2537.5,-612.5 2537.5,-593.5 2418.5,-593.5"/>
 <text text-anchor="middle" x="2478" y="-600.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/tir/data_layout.h</text>
 </a>
 </g>
 </g>
-<!-- Node96&#45;&gt;Node97 -->
+<!-- Node97&#45;&gt;Node98 -->
 <g id="edge170" class="edge">
-<title>Node96&#45;&gt;Node97</title>
+<title>Node97&#45;&gt;Node98</title>
 <path fill="none" stroke="#191970" d="M2488.5129,-1007.3934C2535.8041,-992.7605 2611,-959.9835 2611,-899.5 2611,-899.5 2611,-899.5 2611,-832.5 2611,-742.1647 2532.9073,-655.5171 2496.4014,-619.9473"/>
 <polygon fill="#191970" stroke="#191970" points="2498.5349,-617.1453 2488.8839,-612.7738 2493.7023,-622.2096 2498.5349,-617.1453"/>
 </g>
-<!-- Node98 -->
+<!-- Node99 -->
 <g id="node47" class="node">
-<title>Node98</title>
+<title>Node99</title>
 <g id="a_node47"><a xlink:href="strided__slice_8h.html" target="_top" xlink:title="Utility functions for strided_slice op. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="2388.5,-884.5 2388.5,-914.5 2513.5,-914.5 2513.5,-884.5 2388.5,-884.5"/>
 <text text-anchor="start" x="2396.5" y="-902.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/detail/strided</text>
@@ -1538,15 +1538,15 @@
 </a>
 </g>
 </g>
-<!-- Node96&#45;&gt;Node98 -->
+<!-- Node97&#45;&gt;Node99 -->
 <g id="edge182" class="edge">
-<title>Node96&#45;&gt;Node98</title>
+<title>Node97&#45;&gt;Node99</title>
 <path fill="none" stroke="#191970" d="M2451,-1007.3845C2451,-989.544 2451,-950.7839 2451,-924.9138"/>
 <polygon fill="#191970" stroke="#191970" points="2454.5001,-924.7143 2451,-914.7143 2447.5001,-924.7143 2454.5001,-924.7143"/>
 </g>
-<!-- Node99 -->
+<!-- Node100 -->
 <g id="node48" class="node">
-<title>Node99</title>
+<title>Node100</title>
 <g id="a_node48"><a xlink:href="tensor__utils_8h.html" target="_top" xlink:title="Utility functions for handling tensor. ">
 <polygon fill="#ffffff" stroke="#000000" points="2849,-817.5 2849,-847.5 2971,-847.5 2971,-817.5 2849,-817.5"/>
 <text text-anchor="start" x="2857" y="-835.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/detail/tensor</text>
@@ -1554,15 +1554,15 @@
 </a>
 </g>
 </g>
-<!-- Node96&#45;&gt;Node99 -->
+<!-- Node97&#45;&gt;Node100 -->
 <g id="edge189" class="edge">
-<title>Node96&#45;&gt;Node99</title>
+<title>Node97&#45;&gt;Node100</title>
 <path fill="none" stroke="#191970" d="M2510.2635,-1007.6483C2511.8584,-1007.4261 2513.4396,-1007.2096 2515,-1007 2585.9228,-997.4742 2775.6682,-1011.0066 2835,-971 2874.5162,-944.3547 2894.9899,-889.7901 2904.0333,-857.6752"/>
 <polygon fill="#191970" stroke="#191970" points="2907.4937,-858.2805 2906.673,-847.7175 2900.7274,-856.4868 2907.4937,-858.2805"/>
 </g>
-<!-- Node96&#45;&gt;Node100 -->
+<!-- Node97&#45;&gt;Node101 -->
 <g id="edge194" class="edge">
-<title>Node96&#45;&gt;Node100</title>
+<title>Node97&#45;&gt;Node101</title>
 <path fill="none" stroke="#191970" d="M2510.2422,-1014.6724C2607.146,-1010.2142 2805.3527,-998.3667 2971,-971 2972.4457,-970.7611 2973.9158,-970.4986 2975.3976,-970.2175"/>
 <polygon fill="#191970" stroke="#191970" points="2976.1506,-973.6357 2985.2133,-968.1477 2974.7063,-966.7864 2976.1506,-973.6357"/>
 </g>
@@ -1572,93 +1572,93 @@
 <polygon fill="#ffffff" stroke="#bfbfbf" points="2677,-951.5 2677,-970.5 2763,-970.5 2763,-951.5 2677,-951.5"/>
 <text text-anchor="middle" x="2720" y="-958.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">unordered_set</text>
 </g>
-<!-- Node96&#45;&gt;Node75 -->
+<!-- Node97&#45;&gt;Node75 -->
 <g id="edge197" class="edge">
-<title>Node96&#45;&gt;Node75</title>
+<title>Node97&#45;&gt;Node75</title>
 <path fill="none" stroke="#191970" d="M2496.9371,-1007.4369C2544.1068,-997.6172 2617.4375,-982.3513 2667.0258,-972.0281"/>
 <polygon fill="#191970" stroke="#191970" points="2667.8436,-975.433 2676.9204,-969.9682 2666.4169,-968.5799 2667.8436,-975.433"/>
 </g>
-<!-- Node97&#45;&gt;Node17 -->
+<!-- Node98&#45;&gt;Node17 -->
 <g id="edge174" class="edge">
-<title>Node97&#45;&gt;Node17</title>
+<title>Node98&#45;&gt;Node17</title>
 <path fill="none" stroke="#191970" d="M2418.355,-602.7922C2222.1646,-601.743 1579.1098,-595.3956 1049,-557 993.6837,-552.9935 602.2734,-526.4574 553,-501 518.0445,-482.94 494,-474.3453 494,-435 494,-435 494,-435 494,-138.5 494,-101.753 485.6149,-59.5469 479.9854,-35.3522"/>
 <polygon fill="#191970" stroke="#191970" points="483.3095,-34.2061 477.5624,-25.3054 476.5046,-35.8473 483.3095,-34.2061"/>
 </g>
-<!-- Node97&#45;&gt;Node19 -->
+<!-- Node98&#45;&gt;Node19 -->
 <g id="edge175" class="edge">
-<title>Node97&#45;&gt;Node19</title>
+<title>Node98&#45;&gt;Node19</title>
 <path fill="none" stroke="#191970" d="M2511.1167,-593.3579C2570.049,-573.9322 2687,-524.1509 2687,-435 2687,-435 2687,-435 2687,-323 2687,-200.0237 2689.658,-126.4383 2582,-67 2541.7992,-44.805 2225.9114,-23.8195 2122.0051,-17.4561"/>
 <polygon fill="#191970" stroke="#191970" points="2121.9854,-13.9485 2111.7916,-16.836 2121.5611,-20.9357 2121.9854,-13.9485"/>
 </g>
-<!-- Node97&#45;&gt;Node21 -->
+<!-- Node98&#45;&gt;Node21 -->
 <g id="edge176" class="edge">
-<title>Node97&#45;&gt;Node21</title>
+<title>Node98&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M2537.7453,-594.8335C2621.3165,-579.6689 2763,-538.6718 2763,-435 2763,-435 2763,-435 2763,-379 2763,-276.3641 2794.6745,-254.6032 2815,-154 2823.4996,-111.9304 2831.6482,-62.2955 2835.9164,-35.3464"/>
 <polygon fill="#191970" stroke="#191970" points="2839.4043,-35.6958 2837.4977,-25.2739 2832.489,-34.61 2839.4043,-35.6958"/>
 </g>
-<!-- Node97&#45;&gt;Node27 -->
+<!-- Node98&#45;&gt;Node27 -->
 <g id="edge173" class="edge">
-<title>Node97&#45;&gt;Node27</title>
+<title>Node98&#45;&gt;Node27</title>
 <path fill="none" stroke="#191970" d="M2479.6018,-593.221C2488.4339,-538.5458 2530.1056,-270.2979 2511,-190 2502.4493,-154.0627 2480.5462,-116.5919 2466.5162,-95.101"/>
 <polygon fill="#191970" stroke="#191970" points="2469.3432,-93.0321 2460.8736,-86.6667 2463.5251,-96.9244 2469.3432,-93.0321"/>
 </g>
-<!-- Node97&#45;&gt;Node53 -->
+<!-- Node98&#45;&gt;Node53 -->
 <g id="edge171" class="edge">
-<title>Node97&#45;&gt;Node53</title>
+<title>Node98&#45;&gt;Node53</title>
 <path fill="none" stroke="#191970" d="M2418.3789,-597.1873C2348.295,-589.8933 2228.6439,-575.9976 2127,-557 2090.1257,-550.1081 2081.8042,-544.2568 2045,-537 1940.0398,-516.3046 1911.9602,-521.6954 1807,-501 1770.1958,-493.7432 1761.7637,-488.4594 1725,-481 1644.862,-464.7399 1551.0395,-450.4062 1494.0945,-442.2221"/>
 <polygon fill="#191970" stroke="#191970" points="1494.2968,-438.7155 1483.9023,-440.7664 1493.307,-445.6452 1494.2968,-438.7155"/>
 </g>
-<!-- Node97&#45;&gt;Node62 -->
+<!-- Node98&#45;&gt;Node62 -->
 <g id="edge172" class="edge">
-<title>Node97&#45;&gt;Node62</title>
+<title>Node98&#45;&gt;Node62</title>
 <path fill="none" stroke="#191970" d="M2418.4557,-600.3367C2203.1274,-590.7054 1467.3982,-557.7974 1273.2761,-549.1146"/>
 <polygon fill="#191970" stroke="#191970" points="1273.2644,-545.6107 1263.1179,-548.6602 1272.9515,-552.6037 1273.2644,-545.6107"/>
 </g>
-<!-- Node98&#45;&gt;Node17 -->
+<!-- Node99&#45;&gt;Node17 -->
 <g id="edge186" class="edge">
-<title>Node98&#45;&gt;Node17</title>
+<title>Node99&#45;&gt;Node17</title>
 <path fill="none" stroke="#191970" d="M2388.46,-897.4936C2147.6643,-888.2144 1259.6861,-839.6281 573,-613 462.7439,-576.612 342,-607.1055 342,-491 342,-491 342,-491 342,-138.5 342,-83.5567 403.9347,-46.3224 443.2752,-28.2135"/>
 <polygon fill="#191970" stroke="#191970" points="445.0126,-31.272 452.7388,-24.0224 442.178,-24.8716 445.0126,-31.272"/>
 </g>
-<!-- Node98&#45;&gt;Node21 -->
+<!-- Node99&#45;&gt;Node21 -->
 <g id="edge187" class="edge">
-<title>Node98&#45;&gt;Node21</title>
+<title>Node99&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M2490.0841,-884.3985C2512.1879,-875.1618 2539.9324,-862.3749 2563,-848 2636.4619,-802.2213 2648.3055,-781.3611 2714,-725 2768.3907,-678.3368 2782.3457,-666.5245 2830,-613 2969.3195,-456.5186 3067,-415.0145 3067,-205.5 3067,-205.5 3067,-205.5 3067,-138.5 3067,-51.3414 2936.3488,-25.656 2872.8493,-18.3195"/>
 <polygon fill="#191970" stroke="#191970" points="2872.8556,-14.8004 2862.5413,-17.2221 2872.1145,-21.7611 2872.8556,-14.8004"/>
 </g>
-<!-- Node98&#45;&gt;Node27 -->
+<!-- Node99&#45;&gt;Node27 -->
 <g id="edge184" class="edge">
-<title>Node98&#45;&gt;Node27</title>
+<title>Node99&#45;&gt;Node27</title>
 <path fill="none" stroke="#191970" d="M2473.6028,-884.4275C2561.6728,-824.89 2877,-603.8419 2877,-491 2877,-491 2877,-491 2877,-323 2877,-148.0864 2603.3805,-95.403 2496.2709,-81.4807"/>
 <polygon fill="#191970" stroke="#191970" points="2496.4976,-77.9819 2486.1409,-80.2157 2495.6302,-84.9279 2496.4976,-77.9819"/>
 </g>
-<!-- Node98&#45;&gt;Node48 -->
+<!-- Node99&#45;&gt;Node48 -->
 <g id="edge185" class="edge">
-<title>Node98&#45;&gt;Node48</title>
+<title>Node99&#45;&gt;Node48</title>
 <path fill="none" stroke="#191970" d="M2446.6693,-884.3745C2430.604,-827.4113 2374.3502,-620.1485 2355,-445 2345.8242,-361.945 2339.0178,-339.3822 2353,-257 2354.8855,-245.8909 2358.9059,-233.9848 2362.6698,-224.4217"/>
 <polygon fill="#191970" stroke="#191970" points="2365.9327,-225.6903 2366.5401,-215.1129 2359.4691,-223.0029 2365.9327,-225.6903"/>
 </g>
-<!-- Node98&#45;&gt;Node53 -->
+<!-- Node99&#45;&gt;Node53 -->
 <g id="edge183" class="edge">
-<title>Node98&#45;&gt;Node53</title>
+<title>Node99&#45;&gt;Node53</title>
 <path fill="none" stroke="#191970" d="M2437.4081,-884.3748C2411.4008,-856.2807 2351.6765,-795.7488 2290,-761 2005.6019,-600.7689 1919.981,-586.2948 1611,-481 1571.874,-467.6666 1526.6478,-455.5229 1492.8321,-447.0651"/>
 <polygon fill="#191970" stroke="#191970" points="1493.2715,-443.5682 1482.7234,-444.5613 1491.5886,-450.3629 1493.2715,-443.5682"/>
 </g>
-<!-- Node98&#45;&gt;Node84 -->
+<!-- Node99&#45;&gt;Node84 -->
 <g id="edge188" class="edge">
-<title>Node98&#45;&gt;Node84</title>
+<title>Node99&#45;&gt;Node84</title>
 <path fill="none" stroke="#191970" d="M2388.3004,-894.0479C2349.1407,-890.7839 2297.6596,-886.7488 2252,-884 1876.3399,-861.3843 1779.8198,-891.4828 1406,-848 1404.72,-847.8511 1403.43,-847.6935 1402.1321,-847.5281"/>
 <polygon fill="#191970" stroke="#191970" points="1402.4554,-844.0393 1392.0665,-846.1181 1401.4843,-850.9716 1402.4554,-844.0393"/>
 </g>
-<!-- Node99&#45;&gt;Node1 -->
+<!-- Node100&#45;&gt;Node1 -->
 <g id="edge190" class="edge">
-<title>Node99&#45;&gt;Node1</title>
+<title>Node100&#45;&gt;Node1</title>
 <path fill="none" stroke="#191970" d="M2848.7372,-822.5224C2833.5053,-820.369 2817.1895,-818.3393 2802,-817 2541.4557,-794.0271 1721.9518,-776.7746 1486.712,-772.1999"/>
 <polygon fill="#191970" stroke="#191970" points="1486.6777,-768.6987 1476.6118,-772.0044 1486.5421,-775.6974 1486.6777,-768.6987"/>
 </g>
-<!-- Node99&#45;&gt;Node21 -->
+<!-- Node100&#45;&gt;Node21 -->
 <g id="edge191" class="edge">
-<title>Node99&#45;&gt;Node21</title>
+<title>Node100&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M2931.3512,-817.3716C2943.2182,-808.0371 2957.4682,-795.1621 2967,-781 2984.4279,-755.1061 2991,-746.2125 2991,-715 2991,-715 2991,-715 2991,-603 2991,-522.1844 3047.5693,-518.5768 3081,-445 3115.654,-368.7309 3143,-350.7728 3143,-267 3143,-267 3143,-267 3143,-138.5 3143,-104.9798 3144.3236,-88.9618 3119,-67 3082.6561,-35.4808 2939.6848,-22.1182 2873.2601,-17.5244"/>
 <polygon fill="#191970" stroke="#191970" points="2873.1197,-14.0074 2862.9097,-16.8371 2872.6558,-20.9921 2873.1197,-14.0074"/>
 </g>
diff --git a/docs/reference/api/doxygen/reduction_8h__dep__incl.svg b/docs/reference/api/doxygen/reduction_8h__dep__incl.svg
index 1b973da768..4cb6f356c2 100644
--- a/docs/reference/api/doxygen/reduction_8h__dep__incl.svg
+++ b/docs/reference/api/doxygen/reduction_8h__dep__incl.svg
@@ -9,15 +9,15 @@
 <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 158)">
 <title>include/tvm/topi/reduction.h</title>
 <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-158 414.718,-158 414.718,4 -4,4"/>
-<!-- Node100 -->
+<!-- Node101 -->
 <g id="node1" class="node">
-<title>Node100</title>
+<title>Node101</title>
 <polygon fill="#bfbfbf" stroke="#000000" points="60.718,-134.5 60.718,-153.5 214.718,-153.5 214.718,-134.5 60.718,-134.5"/>
 <text text-anchor="middle" x="137.718" y="-141.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/reduction.h</text>
 </g>
-<!-- Node101 -->
+<!-- Node102 -->
 <g id="node2" class="node">
-<title>Node101</title>
+<title>Node102</title>
 <g id="a_node2"><a xlink:href="nn_2pooling_8h.html" target="_top" xlink:title="Pooling op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="40.218,-.5 40.218,-30.5 151.218,-30.5 151.218,-.5 40.218,-.5"/>
 <text text-anchor="start" x="48.218" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
@@ -25,15 +25,15 @@
 </a>
 </g>
 </g>
-<!-- Node100&#45;&gt;Node101 -->
+<!-- Node101&#45;&gt;Node102 -->
 <g id="edge1" class="edge">
-<title>Node100&#45;&gt;Node101</title>
+<title>Node101&#45;&gt;Node102</title>
 <path fill="none" stroke="#191970" d="M83.8307,-131.9252C51.655,-123.5022 15.2398,-111.4017 5.718,-98 -2.262,-86.7685 -1.1939,-78.9186 5.718,-67 15.3813,-50.3368 32.6895,-38.6061 49.5638,-30.5956"/>
 <polygon fill="#191970" stroke="#191970" points="83.0918,-135.3485 93.6457,-134.4185 84.8153,-128.564 83.0918,-135.3485"/>
 </g>
-<!-- Node102 -->
+<!-- Node103 -->
 <g id="node3" class="node">
-<title>Node102</title>
+<title>Node103</title>
 <g id="a_node3"><a xlink:href="nn_2softmax_8h.html" target="_top" xlink:title="Softmax op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="15.218,-67.5 15.218,-97.5 126.218,-97.5 126.218,-67.5 15.218,-67.5"/>
 <text text-anchor="start" x="23.218" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn</text>
@@ -41,30 +41,30 @@
 </a>
 </g>
 </g>
-<!-- Node100&#45;&gt;Node102 -->
+<!-- Node101&#45;&gt;Node103 -->
 <g id="edge2" class="edge">
-<title>Node100&#45;&gt;Node102</title>
+<title>Node101&#45;&gt;Node103</title>
 <path fill="none" stroke="#191970" d="M119.7288,-127.4875C109.5767,-118.1689 96.9859,-106.6116 87.108,-97.5446"/>
 <polygon fill="#191970" stroke="#191970" points="117.5154,-130.2068 127.2492,-134.3906 122.249,-125.0499 117.5154,-130.2068"/>
 </g>
-<!-- Node103 -->
+<!-- Node104 -->
 <g id="node4" class="node">
-<title>Node103</title>
+<title>Node104</title>
 <g id="a_node4"><a xlink:href="topi_2nn_8h.html" target="_top" xlink:title="NN op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="144.718,-73 144.718,-92 264.718,-92 264.718,-73 144.718,-73"/>
 <text text-anchor="middle" x="204.718" y="-80" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/nn.h</text>
 </a>
 </g>
 </g>
-<!-- Node100&#45;&gt;Node103 -->
+<!-- Node101&#45;&gt;Node104 -->
 <g id="edge3" class="edge">
-<title>Node100&#45;&gt;Node103</title>
+<title>Node101&#45;&gt;Node104</title>
 <path fill="none" stroke="#191970" d="M155.6411,-127.5482C167.9172,-116.2798 183.7834,-101.716 194.1016,-92.2449"/>
 <polygon fill="#191970" stroke="#191970" points="153.1869,-125.0499 148.1867,-134.3906 157.9205,-130.2068 153.1869,-125.0499"/>
 </g>
-<!-- Node104 -->
+<!-- Node105 -->
 <g id="node5" class="node">
-<title>Node104</title>
+<title>Node105</title>
 <g id="a_node5"><a xlink:href="reorg_8h.html" target="_top" xlink:title="Reorg op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="282.718,-67.5 282.718,-97.5 410.718,-97.5 410.718,-67.5 282.718,-67.5"/>
 <text text-anchor="start" x="290.718" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/topi/vision</text>
@@ -72,15 +72,15 @@
 </a>
 </g>
 </g>
-<!-- Node100&#45;&gt;Node104 -->
+<!-- Node101&#45;&gt;Node105 -->
 <g id="edge5" class="edge">
-<title>Node100&#45;&gt;Node104</title>
+<title>Node101&#45;&gt;Node105</title>
 <path fill="none" stroke="#191970" d="M180.2799,-131.4758C213.6489,-121.6567 260.2015,-107.9582 295.5908,-97.5446"/>
 <polygon fill="#191970" stroke="#191970" points="178.9795,-128.21 170.3742,-134.3906 180.9555,-134.9253 178.9795,-128.21"/>
 </g>
-<!-- Node103&#45;&gt;Node101 -->
+<!-- Node104&#45;&gt;Node102 -->
 <g id="edge4" class="edge">
-<title>Node103&#45;&gt;Node101</title>
+<title>Node104&#45;&gt;Node102</title>
 <path fill="none" stroke="#191970" d="M180.3651,-67.5308C162.4225,-56.5019 138.1192,-41.5631 120.1563,-30.5218"/>
 <polygon fill="#191970" stroke="#191970" points="178.7488,-70.6456 189.1009,-72.9005 182.4145,-64.6821 178.7488,-70.6456"/>
 </g>
diff --git a/docs/reference/api/doxygen/reduction_8h__incl.svg b/docs/reference/api/doxygen/reduction_8h__incl.svg
index 827a5afe78..fd466754a6 100644
--- a/docs/reference/api/doxygen/reduction_8h__incl.svg
+++ b/docs/reference/api/doxygen/reduction_8h__incl.svg
@@ -97,24 +97,24 @@
 <path fill="none" stroke="#191970" d="M2121.2093,-1018.3117C2117.8184,-1009.1257 2113.0732,-994.8565 2111,-982 2100.6355,-917.7283 2102.5033,-840.7063 2104.4331,-801.7412"/>
 <polygon fill="#191970" stroke="#191970" points="2107.9373,-801.757 2104.9819,-791.5828 2100.9475,-801.3794 2107.9373,-801.757"/>
 </g>
-<!-- Node91 -->
+<!-- Node92 -->
 <g id="node42" class="node">
-<title>Node91</title>
+<title>Node92</title>
 <g id="a_node42"><a xlink:href="tags_8h.html" target="_top" xlink:title="External function interface to rocBLAS libraries. ">
 <polygon fill="#ffffff" stroke="#000000" points="144.5,-834 144.5,-853 235.5,-853 235.5,-834 144.5,-834"/>
 <text text-anchor="middle" x="190" y="-841" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/tags.h</text>
 </a>
 </g>
 </g>
-<!-- Node0&#45;&gt;Node91 -->
+<!-- Node0&#45;&gt;Node92 -->
 <g id="edge173" class="edge">
-<title>Node0&#45;&gt;Node91</title>
+<title>Node0&#45;&gt;Node92</title>
 <path fill="none" stroke="#191970" d="M2047.5852,-1026.7186C1779.4479,-1022.1134 891.2866,-1005.4245 607,-982 421.5125,-966.7163 316.1779,-1064.6328 192,-926 176.8174,-909.05 179.7042,-881.3761 184.0832,-862.8095"/>
 <polygon fill="#191970" stroke="#191970" points="187.4825,-863.6465 186.7086,-853.08 180.7242,-861.8228 187.4825,-863.6465"/>
 </g>
-<!-- Node92 -->
+<!-- Node93 -->
 <g id="node43" class="node">
-<title>Node92</title>
+<title>Node93</title>
 <g id="a_node43"><a xlink:href="ravel__unravel_8h.html" target="_top" xlink:title="Index ravel and unraval operations. ">
 <polygon fill="#ffffff" stroke="#000000" points="3074,-895.5 3074,-925.5 3190,-925.5 3190,-895.5 3074,-895.5"/>
 <text text-anchor="start" x="3082" y="-913.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/detail/ravel</text>
@@ -122,51 +122,51 @@
 </a>
 </g>
 </g>
-<!-- Node0&#45;&gt;Node92 -->
+<!-- Node0&#45;&gt;Node93 -->
 <g id="edge161" class="edge">
-<title>Node0&#45;&gt;Node92</title>
+<title>Node0&#45;&gt;Node93</title>
 <path fill="none" stroke="#191970" d="M2202.0863,-1023.1168C2309.0866,-1015.9905 2508.593,-1001.4438 2678,-982 2848.4732,-962.4338 2891.1432,-956.5177 3060,-926 3061.274,-925.7698 3062.5598,-925.5335 3063.8546,-925.292"/>
 <polygon fill="#191970" stroke="#191970" points="3064.7509,-928.6837 3073.907,-923.3529 3063.425,-921.8104 3064.7509,-928.6837"/>
 </g>
-<!-- Node93 -->
+<!-- Node94 -->
 <g id="node44" class="node">
-<title>Node93</title>
+<title>Node94</title>
 <g id="a_node44"><a xlink:href="elemwise_8h.html" target="_top" xlink:title="Elementwise op constructions. ">
 <polygon fill="#ffffff" stroke="#000000" points="616.5,-962.5 616.5,-981.5 733.5,-981.5 733.5,-962.5 616.5,-962.5"/>
 <text text-anchor="middle" x="675" y="-969.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/elemwise.h</text>
 </a>
 </g>
 </g>
-<!-- Node0&#45;&gt;Node93 -->
+<!-- Node0&#45;&gt;Node94 -->
 <g id="edge164" class="edge">
-<title>Node0&#45;&gt;Node93</title>
+<title>Node0&#45;&gt;Node94</title>
 <path fill="none" stroke="#191970" d="M2047.8101,-1025.0189C1793.5448,-1015.199 984.764,-983.9633 743.5466,-974.6473"/>
 <polygon fill="#191970" stroke="#191970" points="743.6346,-971.1482 733.507,-974.2596 743.3644,-978.143 743.6346,-971.1482"/>
 </g>
-<!-- Node95 -->
+<!-- Node96 -->
 <g id="node46" class="node">
-<title>Node95</title>
+<title>Node96</title>
 <g id="a_node46"><a xlink:href="topi_2transform_8h.html" target="_top" xlink:title="Transform op constructors. ">
 <polygon fill="#ffffff" stroke="#000000" points="2551,-962.5 2551,-981.5 2669,-981.5 2669,-962.5 2551,-962.5"/>
 <text text-anchor="middle" x="2610" y="-969.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/transform.h</text>
 </a>
 </g>
 </g>
-<!-- Node0&#45;&gt;Node95 -->
+<!-- Node0&#45;&gt;Node96 -->
 <g id="edge174" class="edge">
-<title>Node0&#45;&gt;Node95</title>
+<title>Node0&#45;&gt;Node96</title>
 <path fill="none" stroke="#191970" d="M2202.1177,-1019.0957C2295.4082,-1008.324 2451.068,-990.3509 2540.7008,-980.0016"/>
 <polygon fill="#191970" stroke="#191970" points="2541.2299,-983.4638 2550.7624,-978.8398 2540.4269,-976.51 2541.2299,-983.4638"/>
 </g>
-<!-- Node99 -->
+<!-- Node100 -->
 <g id="node50" class="node">
-<title>Node99</title>
+<title>Node100</title>
 <polygon fill="#ffffff" stroke="#bfbfbf" points="3246.5,-901 3246.5,-920 3299.5,-920 3299.5,-901 3246.5,-901"/>
 <text text-anchor="middle" x="3273" y="-908" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">iterator</text>
 </g>
-<!-- Node0&#45;&gt;Node99 -->
+<!-- Node0&#45;&gt;Node100 -->
 <g id="edge206" class="edge">
-<title>Node0&#45;&gt;Node99</title>
+<title>Node0&#45;&gt;Node100</title>
 <path fill="none" stroke="#191970" d="M2202.2605,-1023.8842C2420.0452,-1011.8066 3035.7985,-974.2555 3232,-926 3235.0066,-925.2605 3238.0932,-924.3646 3241.1584,-923.3806"/>
 <polygon fill="#191970" stroke="#191970" points="3242.4707,-926.6295 3250.7521,-920.0212 3240.1573,-920.0228 3242.4707,-926.6295"/>
 </g>
@@ -1206,9 +1206,9 @@
 <path fill="none" stroke="#191970" d="M2007.6233,-900.8631C2024.2142,-879.8896 2063.9948,-829.6009 2087.575,-799.792"/>
 <polygon fill="#191970" stroke="#191970" points="2090.5874,-801.6253 2094.0465,-791.611 2085.0974,-797.2825 2090.5874,-801.6253"/>
 </g>
-<!-- Node81&#45;&gt;Node91 -->
+<!-- Node81&#45;&gt;Node92 -->
 <g id="edge156" class="edge">
-<title>Node81&#45;&gt;Node91</title>
+<title>Node81&#45;&gt;Node92</title>
 <path fill="none" stroke="#191970" d="M1940.7854,-909.3213C1682.2998,-904.0767 651.3521,-882.1009 328,-859 300.8407,-857.0597 270.786,-853.8146 245.6768,-850.7937"/>
 <polygon fill="#191970" stroke="#191970" points="245.9513,-847.3012 235.6005,-849.5618 245.1018,-854.2495 245.9513,-847.3012"/>
 </g>
@@ -1236,15 +1236,15 @@
 <path fill="none" stroke="#191970" d="M2274.7747,-833.9005C2244.8896,-823.7366 2196.9959,-807.4479 2159.849,-794.8141"/>
 <polygon fill="#191970" stroke="#191970" points="2160.7629,-791.4281 2150.1685,-791.5218 2158.5089,-798.0553 2160.7629,-791.4281"/>
 </g>
-<!-- Node90 -->
+<!-- Node91 -->
 <g id="node41" class="node">
-<title>Node90</title>
+<title>Node91</title>
 <polygon fill="#ffffff" stroke="#bfbfbf" points="2279.5,-767 2279.5,-786 2326.5,-786 2326.5,-767 2279.5,-767"/>
 <text text-anchor="middle" x="2303" y="-774" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">deque</text>
 </g>
-<!-- Node82&#45;&gt;Node90 -->
+<!-- Node82&#45;&gt;Node91 -->
 <g id="edge153" class="edge">
-<title>Node82&#45;&gt;Node90</title>
+<title>Node82&#45;&gt;Node91</title>
 <path fill="none" stroke="#191970" d="M2303,-833.9005C2303,-824.149 2303,-808.7597 2303,-796.3695"/>
 <polygon fill="#191970" stroke="#191970" points="2306.5001,-796.0816 2303,-786.0817 2299.5001,-796.0817 2306.5001,-796.0816"/>
 </g>
@@ -1392,171 +1392,171 @@
 <path fill="none" stroke="#191970" d="M1599.7537,-262.9782C1679.5353,-256.0209 1844.8948,-240.5854 1984,-221 1985.2592,-220.8227 1986.5331,-220.6385 1987.8174,-220.4485"/>
 <polygon fill="#191970" stroke="#191970" points="1988.4457,-223.893 1997.7887,-218.897 1987.3694,-216.9762 1988.4457,-223.893"/>
 </g>
-<!-- Node91&#45;&gt;Node17 -->
+<!-- Node92&#45;&gt;Node17 -->
 <g id="edge157" class="edge">
-<title>Node91&#45;&gt;Node17</title>
+<title>Node92&#45;&gt;Node17</title>
 <path fill="none" stroke="#191970" d="M171.6839,-833.8709C139.4588,-815.4561 76,-771.9617 76,-715 76,-715 76,-715 76,-149.5 76,-111.3133 71.1826,-92.0555 100,-67 124.3234,-45.8518 340.9433,-25.32 423.8865,-18.1643"/>
 <polygon fill="#191970" stroke="#191970" points="424.3293,-21.6393 433.9954,-17.3015 423.734,-14.6647 424.3293,-21.6393"/>
 </g>
-<!-- Node92&#45;&gt;Node1 -->
+<!-- Node93&#45;&gt;Node1 -->
 <g id="edge162" class="edge">
-<title>Node92&#45;&gt;Node1</title>
+<title>Node93&#45;&gt;Node1</title>
 <path fill="none" stroke="#191970" d="M3073.6892,-907.0304C2963.8138,-900.0319 2730.2214,-882.835 2653,-859 2564.8423,-831.7894 2557.268,-790.9414 2470,-761 2401.9701,-737.6592 2320.0345,-725.8249 2264.6054,-720.0581"/>
 <polygon fill="#191970" stroke="#191970" points="2264.8731,-716.5674 2254.5734,-719.0505 2264.1735,-723.5324 2264.8731,-716.5674"/>
 </g>
-<!-- Node92&#45;&gt;Node21 -->
+<!-- Node93&#45;&gt;Node21 -->
 <g id="edge163" class="edge">
-<title>Node92&#45;&gt;Node21</title>
+<title>Node93&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M3160.1347,-895.3409C3195.6031,-873.9447 3252,-831.3065 3252,-776.5 3252,-776.5 3252,-776.5 3252,-547 3252,-470.4471 3290,-455.5529 3290,-379 3290,-379 3290,-379 3290,-149.5 3290,-109.1307 3284.2758,-91.2477 3252,-67 3203.743,-30.7463 3020.3561,-19.6726 2943.8439,-16.5889"/>
 <polygon fill="#191970" stroke="#191970" points="2943.9083,-13.0889 2933.7821,-16.205 2943.6414,-20.0838 2943.9083,-13.0889"/>
 </g>
-<!-- Node93&#45;&gt;Node17 -->
+<!-- Node94&#45;&gt;Node17 -->
 <g id="edge171" class="edge">
-<title>Node93&#45;&gt;Node17</title>
+<title>Node94&#45;&gt;Node17</title>
 <path fill="none" stroke="#191970" d="M616.2107,-969.3061C505.4323,-963.7518 269.913,-949.5374 192,-926 117.6705,-903.5452 38,-921.1472 38,-843.5 38,-843.5 38,-843.5 38,-149.5 38,-109.3146 42.8102,-91.0558 75,-67 129.9431,-25.9403 342.2305,-17.6149 423.8112,-15.9281"/>
 <polygon fill="#191970" stroke="#191970" points="424.05,-19.4244 433.983,-15.7387 423.9196,-12.4256 424.05,-19.4244"/>
 </g>
-<!-- Node93&#45;&gt;Node27 -->
+<!-- Node94&#45;&gt;Node27 -->
 <g id="edge170" class="edge">
-<title>Node93&#45;&gt;Node27</title>
+<title>Node94&#45;&gt;Node27</title>
 <path fill="none" stroke="#191970" d="M733.585,-971.4531C1060.944,-968.0681 2656.1551,-947.1283 2855,-859 3067.3996,-764.8643 3062,-611.3254 3062,-379 3062,-379 3062,-379 3062,-267 3062,-146.2157 2987.0233,-118.9881 2878,-67 2815.2106,-37.0586 2734.6012,-24.2589 2686.2469,-18.9822"/>
 <polygon fill="#191970" stroke="#191970" points="2686.4505,-15.4847 2676.1437,-17.9384 2685.731,-22.4476 2686.4505,-15.4847"/>
 </g>
-<!-- Node93&#45;&gt;Node53 -->
+<!-- Node94&#45;&gt;Node53 -->
 <g id="edge168" class="edge">
-<title>Node93&#45;&gt;Node53</title>
+<title>Node94&#45;&gt;Node53</title>
 <path fill="none" stroke="#191970" d="M675,-962.3281C675,-940.9175 675,-887.895 675,-843.5 675,-843.5 675,-843.5 675,-776.5 675,-647.9916 737.346,-618.1388 837,-537 951.1422,-444.0648 1413.5756,-396.7697 1569.9975,-383.1986"/>
 <polygon fill="#191970" stroke="#191970" points="1570.6228,-386.6579 1580.2872,-382.3164 1570.0248,-379.6835 1570.6228,-386.6579"/>
 </g>
-<!-- Node93&#45;&gt;Node81 -->
+<!-- Node94&#45;&gt;Node81 -->
 <g id="edge172" class="edge">
-<title>Node93&#45;&gt;Node81</title>
+<title>Node94&#45;&gt;Node81</title>
 <path fill="none" stroke="#191970" d="M733.6432,-969.2781C949.4254,-959.2625 1698.7715,-924.4815 1930.9018,-913.7072"/>
 <polygon fill="#191970" stroke="#191970" points="1931.0812,-917.2027 1940.9081,-913.2428 1930.7565,-910.2103 1931.0812,-917.2027"/>
 </g>
-<!-- Node93&#45;&gt;Node91 -->
+<!-- Node94&#45;&gt;Node92 -->
 <g id="edge169" class="edge">
-<title>Node93&#45;&gt;Node91</title>
+<title>Node94&#45;&gt;Node92</title>
 <path fill="none" stroke="#191970" d="M616.343,-969.775C499.0844,-964.8094 244.3449,-951.2278 214,-926 195.6859,-910.7743 190.8807,-882.4494 189.8576,-863.2875"/>
 <polygon fill="#191970" stroke="#191970" points="193.3554,-863.1464 189.6188,-853.2324 186.3574,-863.3127 193.3554,-863.1464"/>
 </g>
-<!-- Node94 -->
+<!-- Node95 -->
 <g id="node45" class="node">
-<title>Node94</title>
+<title>Node95</title>
 <g id="a_node45"><a xlink:href="builtin_8h.html" target="_top" xlink:title="TIR builtin intrinsics. ">
 <polygon fill="#ffffff" stroke="#000000" points="745,-425.5 745,-444.5 837,-444.5 837,-425.5 745,-425.5"/>
 <text text-anchor="middle" x="791" y="-432.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/tir/builtin.h</text>
 </a>
 </g>
 </g>
-<!-- Node93&#45;&gt;Node94 -->
+<!-- Node94&#45;&gt;Node95 -->
 <g id="edge165" class="edge">
-<title>Node93&#45;&gt;Node94</title>
+<title>Node94&#45;&gt;Node95</title>
 <path fill="none" stroke="#191970" d="M616.288,-968.5276C514.6491,-962.0017 314.3875,-946.6796 290,-926 260.8748,-901.303 266,-881.6867 266,-843.5 266,-843.5 266,-843.5 266,-547 266,-499.4843 599.1743,-456.5962 734.6903,-441.117"/>
 <polygon fill="#191970" stroke="#191970" points="735.3276,-444.5673 744.8699,-439.9634 734.5393,-437.6118 735.3276,-444.5673"/>
 </g>
-<!-- Node94&#45;&gt;Node53 -->
+<!-- Node95&#45;&gt;Node53 -->
 <g id="edge167" class="edge">
-<title>Node94&#45;&gt;Node53</title>
+<title>Node95&#45;&gt;Node53</title>
 <path fill="none" stroke="#191970" d="M837.2601,-431.5143C863.5662,-429.5533 897.1241,-427.087 927,-425 1167.3895,-408.207 1454.6654,-389.6844 1570.1661,-382.3008"/>
 <polygon fill="#191970" stroke="#191970" points="1570.718,-385.7728 1580.4744,-381.6422 1570.2716,-378.787 1570.718,-385.7728"/>
 </g>
-<!-- Node94&#45;&gt;Node63 -->
+<!-- Node95&#45;&gt;Node63 -->
 <g id="edge166" class="edge">
-<title>Node94&#45;&gt;Node63</title>
+<title>Node95&#45;&gt;Node63</title>
 <path fill="none" stroke="#191970" d="M793.2766,-425.3969C799.085,-402.84 816.4926,-345.9017 851,-313 868.0462,-296.747 891.752,-285.7266 912.4237,-278.5578"/>
 <polygon fill="#191970" stroke="#191970" points="913.5598,-281.8688 921.9783,-275.4362 911.3858,-275.2149 913.5598,-281.8688"/>
 </g>
-<!-- Node95&#45;&gt;Node1 -->
+<!-- Node96&#45;&gt;Node1 -->
 <g id="edge175" class="edge">
-<title>Node95&#45;&gt;Node1</title>
+<title>Node96&#45;&gt;Node1</title>
 <path fill="none" stroke="#191970" d="M2599.9203,-962.1922C2565.6997,-929.3698 2449.7552,-822.0141 2336,-761 2308.7188,-746.3674 2275.8846,-735.0555 2249.4287,-727.3109"/>
 <polygon fill="#191970" stroke="#191970" points="2250.2027,-723.8922 2239.6264,-724.5179 2248.2845,-730.6243 2250.2027,-723.8922"/>
 </g>
-<!-- Node95&#45;&gt;Node17 -->
+<!-- Node96&#45;&gt;Node17 -->
 <g id="edge202" class="edge">
-<title>Node95&#45;&gt;Node17</title>
+<title>Node96&#45;&gt;Node17</title>
 <path fill="none" stroke="#191970" d="M2550.9721,-970.2971C2190.7564,-959.7171 304,-901.5308 304,-843.5 304,-843.5 304,-843.5 304,-776.5 304,-624.0807 114,-643.4193 114,-491 114,-491 114,-491 114,-149.5 114,-107.0788 127.1964,-91.2541 162,-67 204.07,-37.6821 356.8082,-22.915 423.9565,-17.7229"/>
 <polygon fill="#191970" stroke="#191970" points="424.2587,-21.2102 433.9685,-16.9711 423.7344,-14.2298 424.2587,-21.2102"/>
 </g>
-<!-- Node95&#45;&gt;Node21 -->
+<!-- Node96&#45;&gt;Node21 -->
 <g id="edge204" class="edge">
-<title>Node95&#45;&gt;Node21</title>
+<title>Node96&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M2669.2437,-968.8875C2814.2742,-961.065 3176.9558,-940.1947 3199,-926 3263.9837,-884.1558 3328,-680.2905 3328,-603 3328,-603 3328,-603 3328,-149.5 3328,-107.0788 3315.224,-90.6394 3280,-67 3225.3052,-30.2934 3024.4548,-19.4138 2943.8269,-16.4856"/>
 <polygon fill="#191970" stroke="#191970" points="2943.8324,-12.9838 2933.7179,-16.1377 2943.5916,-19.9797 2943.8324,-12.9838"/>
 </g>
-<!-- Node95&#45;&gt;Node27 -->
+<!-- Node96&#45;&gt;Node27 -->
 <g id="edge199" class="edge">
-<title>Node95&#45;&gt;Node27</title>
+<title>Node96&#45;&gt;Node27</title>
 <path fill="none" stroke="#191970" d="M2669.2631,-962.6454C2670.8582,-962.4241 2672.4394,-962.2085 2674,-962 2817.6099,-942.8117 2865.6022,-984.8457 2998,-926 3108.0478,-877.088 3168,-835.428 3168,-715 3168,-715 3168,-715 3168,-659 3168,-583.1522 3138,-566.8478 3138,-491 3138,-491 3138,-491 3138,-435 3138,-358.4471 3100,-343.5529 3100,-267 3100,-267 3100,-267 3100,-211 3100,-133.0814 3066.2748,-107.975 3000,-67 2948.234,-34.9952 2768.9217,-21.8715 2686.2946,-17.4307"/>
 <polygon fill="#191970" stroke="#191970" points="2686.4092,-13.932 2676.2408,-16.9077 2686.0455,-20.9225 2686.4092,-13.932"/>
 </g>
-<!-- Node95&#45;&gt;Node48 -->
+<!-- Node96&#45;&gt;Node48 -->
 <g id="edge201" class="edge">
-<title>Node95&#45;&gt;Node48</title>
+<title>Node96&#45;&gt;Node48</title>
 <path fill="none" stroke="#191970" d="M2668.4033,-962.4487C2766.0826,-944.8048 2952,-903.36 2952,-843.5 2952,-843.5 2952,-843.5 2952,-603 2952,-308.8592 2491.5514,-184.7551 2364.9027,-156.183"/>
 <polygon fill="#191970" stroke="#191970" points="2365.5506,-152.7418 2355.0305,-153.9982 2364.038,-159.5764 2365.5506,-152.7418"/>
 </g>
-<!-- Node95&#45;&gt;Node80 -->
+<!-- Node96&#45;&gt;Node80 -->
 <g id="edge183" class="edge">
-<title>Node95&#45;&gt;Node80</title>
+<title>Node96&#45;&gt;Node80</title>
 <path fill="none" stroke="#191970" d="M2550.877,-963.61C2485.9261,-954.3921 2378.5298,-939.1474 2286,-926 2189.1096,-912.233 2164.9164,-908.5828 2068,-895 1748.8238,-850.2675 637,-981.2955 637,-659 637,-659 637,-659 637,-603 637,-552.3608 966.8012,-386.3849 1067.2875,-336.9895"/>
 <polygon fill="#191970" stroke="#191970" points="1068.8398,-340.1265 1076.2773,-332.581 1065.7577,-333.8415 1068.8398,-340.1265"/>
 </g>
-<!-- Node95&#45;&gt;Node81 -->
+<!-- Node96&#45;&gt;Node81 -->
 <g id="edge184" class="edge">
-<title>Node95&#45;&gt;Node81</title>
+<title>Node96&#45;&gt;Node81</title>
 <path fill="none" stroke="#191970" d="M2550.6822,-966.1615C2462.901,-957.5031 2293.232,-940.6979 2149,-926 2122.9621,-923.3466 2094.3703,-920.3841 2069.2562,-917.7656"/>
 <polygon fill="#191970" stroke="#191970" points="2069.3988,-914.2615 2059.0895,-916.7045 2068.6721,-921.2237 2069.3988,-914.2615"/>
 </g>
-<!-- Node95&#45;&gt;Node82 -->
+<!-- Node96&#45;&gt;Node82 -->
 <g id="edge185" class="edge">
-<title>Node95&#45;&gt;Node82</title>
+<title>Node96&#45;&gt;Node82</title>
 <path fill="none" stroke="#191970" d="M2562.1485,-962.4969C2527.8939,-954.8367 2481.1096,-942.6022 2442,-926 2398.328,-907.461 2351.7147,-877.4266 2325.0925,-859.1661"/>
 <polygon fill="#191970" stroke="#191970" points="2326.7583,-856.0619 2316.5472,-853.2363 2322.7675,-861.8129 2326.7583,-856.0619"/>
 </g>
-<!-- Node95&#45;&gt;Node83 -->
+<!-- Node96&#45;&gt;Node83 -->
 <g id="edge186" class="edge">
-<title>Node95&#45;&gt;Node83</title>
+<title>Node96&#45;&gt;Node83</title>
 <path fill="none" stroke="#191970" d="M2550.5318,-963.6586C2503.1643,-956.1937 2435.4972,-943.6853 2378,-926 2305.0958,-903.5757 2286.4283,-896.4855 2220,-859 2188.1156,-841.0076 2154.4963,-815.8102 2132.0988,-798.0083"/>
 <polygon fill="#191970" stroke="#191970" points="2134.0799,-795.1103 2124.0912,-791.5781 2129.697,-800.5684 2134.0799,-795.1103"/>
 </g>
-<!-- Node95&#45;&gt;Node75 -->
+<!-- Node96&#45;&gt;Node75 -->
 <g id="edge203" class="edge">
-<title>Node95&#45;&gt;Node75</title>
+<title>Node96&#45;&gt;Node75</title>
 <path fill="none" stroke="#191970" d="M2646.7069,-962.4919C2719.6777,-942.5083 2876,-893.8252 2876,-843.5 2876,-843.5 2876,-843.5 2876,-715 2876,-348.1491 2277.9463,-241.2527 2094.1104,-217.0596"/>
 <polygon fill="#191970" stroke="#191970" points="2094.5001,-213.581 2084.1349,-215.775 2093.6059,-220.5236 2094.5001,-213.581"/>
 </g>
-<!-- Node95&#45;&gt;Node91 -->
+<!-- Node96&#45;&gt;Node92 -->
 <g id="edge198" class="edge">
-<title>Node95&#45;&gt;Node91</title>
+<title>Node96&#45;&gt;Node92</title>
 <path fill="none" stroke="#191970" d="M2550.7814,-971.9254C2199.3302,-971.3448 399.0497,-966.402 290,-926 254.1745,-912.7269 221.8915,-880.7824 204.2049,-860.7639"/>
 <polygon fill="#191970" stroke="#191970" points="206.7172,-858.3169 197.5508,-853.0039 201.4033,-862.8735 206.7172,-858.3169"/>
 </g>
-<!-- Node95&#45;&gt;Node92 -->
+<!-- Node96&#45;&gt;Node93 -->
 <g id="edge187" class="edge">
-<title>Node95&#45;&gt;Node92</title>
+<title>Node96&#45;&gt;Node93</title>
 <path fill="none" stroke="#191970" d="M2669.2367,-968.0348C2755.548,-961.7846 2920.7676,-948.0345 3060,-926 3061.2787,-925.7976 3062.5689,-925.587 3063.8676,-925.3692"/>
 <polygon fill="#191970" stroke="#191970" points="3064.7138,-928.7736 3073.9442,-923.5725 3063.485,-921.8823 3064.7138,-928.7736"/>
 </g>
-<!-- Node96 -->
+<!-- Node97 -->
 <g id="node47" class="node">
-<title>Node96</title>
+<title>Node97</title>
 <g id="a_node47"><a xlink:href="data__layout_8h.html" target="_top" xlink:title="Layout expression to describe the data organization of a tensor. And BijectiveLayout to mapping two d...">
 <polygon fill="#ffffff" stroke="#ff0000" points="2576.5,-537.5 2576.5,-556.5 2695.5,-556.5 2695.5,-537.5 2576.5,-537.5"/>
 <text text-anchor="middle" x="2636" y="-544.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/tir/data_layout.h</text>
 </a>
 </g>
 </g>
-<!-- Node95&#45;&gt;Node96 -->
+<!-- Node96&#45;&gt;Node97 -->
 <g id="edge176" class="edge">
-<title>Node95&#45;&gt;Node96</title>
+<title>Node96&#45;&gt;Node97</title>
 <path fill="none" stroke="#191970" d="M2610,-962.3281C2610,-940.9175 2610,-887.895 2610,-843.5 2610,-843.5 2610,-843.5 2610,-776.5 2610,-698.2309 2625.1152,-605.896 2632.296,-566.4468"/>
 <polygon fill="#191970" stroke="#191970" points="2635.7443,-567.047 2634.1301,-556.5759 2628.8621,-565.7681 2635.7443,-567.047"/>
 </g>
-<!-- Node97 -->
+<!-- Node98 -->
 <g id="node48" class="node">
-<title>Node97</title>
+<title>Node98</title>
 <g id="a_node48"><a xlink:href="strided__slice_8h.html" target="_top" xlink:title="Utility functions for strided_slice op. ">
 <polygon fill="#ffffff" stroke="#ff0000" points="2661.5,-828.5 2661.5,-858.5 2786.5,-858.5 2786.5,-828.5 2661.5,-828.5"/>
 <text text-anchor="start" x="2669.5" y="-846.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/detail/strided</text>
@@ -1564,15 +1564,15 @@
 </a>
 </g>
 </g>
-<!-- Node95&#45;&gt;Node97 -->
+<!-- Node96&#45;&gt;Node98 -->
 <g id="edge188" class="edge">
-<title>Node95&#45;&gt;Node97</title>
+<title>Node96&#45;&gt;Node98</title>
 <path fill="none" stroke="#191970" d="M2618.5685,-962.3416C2636.4776,-942.1547 2678.3006,-895.012 2703.5575,-866.5427"/>
 <polygon fill="#191970" stroke="#191970" points="2706.4955,-868.5049 2710.5137,-858.7016 2701.2591,-863.8594 2706.4955,-868.5049"/>
 </g>
-<!-- Node98 -->
+<!-- Node99 -->
 <g id="node49" class="node">
-<title>Node98</title>
+<title>Node99</title>
 <g id="a_node49"><a xlink:href="tensor__utils_8h.html" target="_top" xlink:title="Utility functions for handling tensor. ">
 <polygon fill="#ffffff" stroke="#000000" points="3018,-761.5 3018,-791.5 3140,-791.5 3140,-761.5 3018,-761.5"/>
 <text text-anchor="start" x="3026" y="-779.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/topi/detail/tensor</text>
@@ -1580,99 +1580,99 @@
 </a>
 </g>
 </g>
-<!-- Node95&#45;&gt;Node98 -->
+<!-- Node96&#45;&gt;Node99 -->
 <g id="edge195" class="edge">
-<title>Node95&#45;&gt;Node98</title>
+<title>Node96&#45;&gt;Node99</title>
 <path fill="none" stroke="#191970" d="M2669.2665,-962.6706C2670.8606,-962.4418 2672.4407,-962.2179 2674,-962 2803.5022,-943.9044 2852.7265,-991.3253 2966,-926 3016.5341,-896.8568 3051.713,-835.1996 3068.2564,-800.8698"/>
 <polygon fill="#191970" stroke="#191970" points="3071.5443,-802.1013 3072.6077,-791.5599 3065.2027,-799.1373 3071.5443,-802.1013"/>
 </g>
-<!-- Node95&#45;&gt;Node99 -->
+<!-- Node96&#45;&gt;Node100 -->
 <g id="edge200" class="edge">
-<title>Node95&#45;&gt;Node99</title>
+<title>Node96&#45;&gt;Node100</title>
 <path fill="none" stroke="#191970" d="M2669.0805,-969.7962C2802.8988,-964.4538 3125.7445,-949.4966 3232,-926 3235.195,-925.2935 3238.4734,-924.387 3241.7171,-923.3677"/>
 <polygon fill="#191970" stroke="#191970" points="3242.932,-926.6508 3251.2258,-920.0582 3240.631,-920.0398 3242.932,-926.6508"/>
 </g>
-<!-- Node96&#45;&gt;Node17 -->
+<!-- Node97&#45;&gt;Node17 -->
 <g id="edge180" class="edge">
-<title>Node96&#45;&gt;Node17</title>
+<title>Node97&#45;&gt;Node17</title>
 <path fill="none" stroke="#191970" d="M2576.2985,-546.1766C2378.4572,-543.2186 1726.316,-531.6322 1188,-501 1178.5112,-500.4601 514.3306,-449.5748 506,-445 473.7434,-427.2861 456,-415.8004 456,-379 456,-379 456,-379 456,-149.5 456,-108.9699 456,-61.5443 456,-35.3795"/>
 <polygon fill="#191970" stroke="#191970" points="459.5001,-35.2517 456,-25.2517 452.5001,-35.2518 459.5001,-35.2517"/>
 </g>
-<!-- Node96&#45;&gt;Node19 -->
+<!-- Node97&#45;&gt;Node19 -->
 <g id="edge181" class="edge">
-<title>Node96&#45;&gt;Node19</title>
+<title>Node97&#45;&gt;Node19</title>
 <path fill="none" stroke="#191970" d="M2638.3046,-537.2268C2647.7351,-495.235 2680.7732,-326.5727 2635,-201 2621.7088,-164.5374 2610.1527,-157.1448 2579,-134 2518.1017,-88.7557 2495.654,-85.1865 2422,-67 2305.7011,-38.2837 1931.9806,-21.331 1818.792,-16.7611"/>
 <polygon fill="#191970" stroke="#191970" points="1818.8004,-13.2587 1808.6689,-16.3575 1818.5215,-20.2532 1818.8004,-13.2587"/>
 </g>
-<!-- Node96&#45;&gt;Node21 -->
+<!-- Node97&#45;&gt;Node21 -->
 <g id="edge182" class="edge">
-<title>Node96&#45;&gt;Node21</title>
+<title>Node97&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M2674.7724,-537.4769C2737.9962,-519.7888 2862.4103,-475.5396 2924,-389 3007.5573,-271.5939 3017.4677,-200.0013 2962,-67 2956.0761,-52.7957 2944.1169,-40.5036 2933.1597,-31.4942"/>
 <polygon fill="#191970" stroke="#191970" points="2935.1472,-28.607 2925.0926,-25.2672 2930.87,-34.1482 2935.1472,-28.607"/>
 </g>
-<!-- Node96&#45;&gt;Node27 -->
+<!-- Node97&#45;&gt;Node27 -->
 <g id="edge179" class="edge">
-<title>Node96&#45;&gt;Node27</title>
+<title>Node97&#45;&gt;Node27</title>
 <path fill="none" stroke="#191970" d="M2643.0417,-537.3936C2655.5382,-519.8993 2681.6267,-481.2927 2696,-445 2736.4119,-342.9593 2764.8503,-306.3266 2734,-201 2718.7171,-148.8226 2690.491,-147.4995 2668,-98 2658.6863,-77.5019 2652.0123,-52.4958 2648.1029,-35.4233"/>
 <polygon fill="#191970" stroke="#191970" points="2651.4841,-34.5013 2645.9331,-25.477 2644.645,-35.9932 2651.4841,-34.5013"/>
 </g>
-<!-- Node96&#45;&gt;Node53 -->
+<!-- Node97&#45;&gt;Node53 -->
 <g id="edge177" class="edge">
-<title>Node96&#45;&gt;Node53</title>
+<title>Node97&#45;&gt;Node53</title>
 <path fill="none" stroke="#191970" d="M2629.8745,-537.3705C2613.8478,-513.1619 2567.9512,-449.9206 2511,-425 2434.4322,-391.4957 1852.5173,-381.7687 1673.945,-379.5581"/>
 <polygon fill="#191970" stroke="#191970" points="1673.8024,-376.0562 1663.7607,-379.4348 1673.7176,-383.0557 1673.8024,-376.0562"/>
 </g>
-<!-- Node96&#45;&gt;Node62 -->
+<!-- Node97&#45;&gt;Node62 -->
 <g id="edge178" class="edge">
-<title>Node96&#45;&gt;Node62</title>
+<title>Node97&#45;&gt;Node62</title>
 <path fill="none" stroke="#191970" d="M2576.3199,-545.9977C2374.2304,-542.3918 1715.7927,-528.8574 1482.1943,-500.8939"/>
 <polygon fill="#191970" stroke="#191970" points="1482.4155,-497.3946 1472.0607,-499.6372 1481.5539,-504.3414 1482.4155,-497.3946"/>
 </g>
-<!-- Node97&#45;&gt;Node17 -->
+<!-- Node98&#45;&gt;Node17 -->
 <g id="edge192" class="edge">
-<title>Node97&#45;&gt;Node17</title>
+<title>Node98&#45;&gt;Node17</title>
 <path fill="none" stroke="#191970" d="M2665.6153,-828.4704C2590.0718,-809.629 2454.1271,-777.7737 2336,-761 1777.8914,-681.7502 1625.0311,-772.9963 1071,-669 987.8213,-653.3867 969.8565,-637.9938 889,-613 621.4617,-530.3003 304,-659.0286 304,-379 304,-379 304,-379 304,-149.5 304,-85.8873 379.8363,-45.284 424.3757,-26.9227"/>
 <polygon fill="#191970" stroke="#191970" points="425.8977,-30.0841 433.8979,-23.138 423.3122,-23.5791 425.8977,-30.0841"/>
 </g>
-<!-- Node97&#45;&gt;Node21 -->
+<!-- Node98&#45;&gt;Node21 -->
 <g id="edge193" class="edge">
-<title>Node97&#45;&gt;Node21</title>
+<title>Node98&#45;&gt;Node21</title>
 <path fill="none" stroke="#191970" d="M2780.7884,-828.4444C2902.1863,-792.4485 3176,-689.282 3176,-491 3176,-491 3176,-491 3176,-267 3176,-162.178 3142.6857,-117.8073 3051,-67 3016.2337,-47.7344 2972.8458,-33.145 2943.4495,-24.4981"/>
 <polygon fill="#191970" stroke="#191970" points="2944.2499,-21.0865 2933.6722,-21.6895 2942.3172,-27.8144 2944.2499,-21.0865"/>
 </g>
-<!-- Node97&#45;&gt;Node27 -->
+<!-- Node98&#45;&gt;Node27 -->
 <g id="edge190" class="edge">
-<title>Node97&#45;&gt;Node27</title>
+<title>Node98&#45;&gt;Node27</title>
 <path fill="none" stroke="#191970" d="M2730.5669,-828.3253C2752.7072,-776.2295 2824.5374,-599.3699 2848,-445 2853.0756,-411.6057 2850.3019,-402.6993 2848,-369 2844.8816,-323.348 2834,-312.7584 2834,-267 2834,-267 2834,-267 2834,-211 2834,-120.8097 2728.7237,-56.5654 2674.9,-29.6135"/>
 <polygon fill="#191970" stroke="#191970" points="2676.3168,-26.4103 2665.7954,-25.1646 2673.2435,-32.6996 2676.3168,-26.4103"/>
 </g>
-<!-- Node97&#45;&gt;Node48 -->
+<!-- Node98&#45;&gt;Node48 -->
 <g id="edge191" class="edge">
-<title>Node97&#45;&gt;Node48</title>
+<title>Node98&#45;&gt;Node48</title>
 <path fill="none" stroke="#191970" d="M2724,-828.3054C2724,-790.323 2724,-688.1771 2724,-603 2724,-603 2724,-603 2724,-547 2724,-342.2521 2457.0821,-204.4864 2364.3169,-162.7897"/>
 <polygon fill="#191970" stroke="#191970" points="2365.7028,-159.5756 2355.1421,-158.725 2362.8674,-165.9756 2365.7028,-159.5756"/>
 </g>
-<!-- Node97&#45;&gt;Node53 -->
+<!-- Node98&#45;&gt;Node53 -->
 <g id="edge189" class="edge">
-<title>Node97&#45;&gt;Node53</title>
+<title>Node98&#45;&gt;Node53</title>
 <path fill="none" stroke="#191970" d="M2716.166,-828.2987C2705.9922,-809.9465 2686.5894,-779.2761 2662,-761 2334.1954,-517.3588 1834.6845,-415.0836 1673.8318,-387.2838"/>
 <polygon fill="#191970" stroke="#191970" points="1674.1783,-383.7923 1663.7319,-385.5596 1673.0003,-390.6925 1674.1783,-383.7923"/>
 </g>
-<!-- Node97&#45;&gt;Node83 -->
+<!-- Node98&#45;&gt;Node83 -->
 <g id="edge194" class="edge">
-<title>Node97&#45;&gt;Node83</title>
+<title>Node98&#45;&gt;Node83</title>
 <path fill="none" stroke="#191970" d="M2661.3419,-836.707C2547.475,-824.3622 2306.962,-798.2871 2183.1834,-784.8678"/>
 <polygon fill="#191970" stroke="#191970" points="2183.3454,-781.3649 2173.0264,-783.7666 2182.5909,-788.3241 2183.3454,-781.3649"/>
 </g>
-<!-- Node98&#45;&gt;Node1 -->
+<!-- Node99&#45;&gt;Node1 -->
 <g id="edge196" class="edge">
-<title>Node98&#45;&gt;Node1</title>
+<title>Node99&#45;&gt;Node1</title>
 <path fill="none" stroke="#191970" d="M3017.9578,-772.2243C2858.0387,-761.0226 2428.0028,-730.9005 2264.9076,-719.4764"/>
 <polygon fill="#191970" stroke="#191970" points="2264.7767,-715.9588 2254.5565,-718.7514 2264.2875,-722.9417 2264.7767,-715.9588"/>
 </g>
... 10181 lines suppressed ...