Posted to commits@tvm.apache.org by tq...@apache.org on 2023/01/20 01:05:19 UTC

[tvm-site] branch asf-site updated: deploying docs (apache/tvm@cfa65b26c1bd975daaef78c60b16989be0d23970)

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/tvm-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new d55536df72 deploying docs (apache/tvm@cfa65b26c1bd975daaef78c60b16989be0d23970)
d55536df72 is described below

commit d55536df72d8d8c73725e678b32729d2f1d5a308
Author: tvm-bot <95...@users.noreply.github.com>
AuthorDate: Fri Jan 20 01:05:13 2023 +0000

    deploying docs (apache/tvm@cfa65b26c1bd975daaef78c60b16989be0d23970)
---
 .../micro_mlperftiny.ipynb                         |  230 ++++
 .../micro_mlperftiny.py                            |  312 ++++++
 docs/_images/sphx_glr_micro_mlperftiny_thumb.png   |  Bin 0 -> 26794 bytes
 docs/_images/sphx_glr_micro_train_001.png          |  Bin 324216 -> 329141 bytes
 docs/_images/sphx_glr_micro_train_thumb.png        |  Bin 23634 -> 22792 bytes
 .../how_to/compile_models/from_darknet.rst.txt     |    2 +-
 .../how_to/compile_models/from_keras.rst.txt       |    2 +-
 .../how_to/compile_models/from_mxnet.rst.txt       |    2 +-
 .../how_to/compile_models/from_oneflow.rst.txt     |    2 +-
 .../how_to/compile_models/from_pytorch.rst.txt     |    2 +-
 .../how_to/compile_models/from_tensorflow.rst.txt  |    2 +-
 .../compile_models/sg_execution_times.rst.txt      |   22 +-
 .../deploy_models/deploy_model_on_adreno.rst.txt   |    2 +-
 .../deploy_models/deploy_model_on_android.rst.txt  |    2 +-
 .../deploy_object_detection_pytorch.rst.txt        |    4 +-
 .../deploy_models/deploy_prequantized.rst.txt      |    6 +-
 .../deploy_prequantized_tflite.rst.txt             |    4 +-
 .../how_to/deploy_models/deploy_quantized.rst.txt  |    2 +-
 .../deploy_models/deploy_ssd_gluoncv.rst.txt       |    4 +-
 .../deploy_models/sg_execution_times.rst.txt       |   20 +-
 .../extend_tvm/bring_your_own_datatypes.rst.txt    |    2 +-
 .../how_to/extend_tvm/sg_execution_times.rst.txt   |    8 +-
 .../how_to/extend_tvm/use_pass_instrument.rst.txt  |   16 +-
 .../optimize_operators/opt_conv_cuda.rst.txt       |    2 +-
 .../optimize_operators/opt_conv_tensorcore.rst.txt |    2 +-
 .../how_to/optimize_operators/opt_gemm.rst.txt     |   16 +-
 .../optimize_operators/sg_execution_times.rst.txt  |    8 +-
 .../sg_execution_times.rst.txt                     |   14 +-
 .../tune_conv2d_layer_cuda.rst.txt                 | 1165 +++++---------------
 .../tune_network_cuda.rst.txt                      |    4 +-
 .../tune_network_x86.rst.txt                       |    4 +-
 .../tune_sparse_x86.rst.txt                        |   78 +-
 .../tune_with_autotvm/sg_execution_times.rst.txt   |    6 +-
 .../tune_with_autotvm/tune_conv2d_cuda.rst.txt     | 1038 ++++++++++++++++-
 .../how_to/work_with_microtvm/index.rst.txt        |   18 +
 .../work_with_microtvm/micro_autotune.rst.txt      |   16 +-
 .../work_with_microtvm/micro_mlperftiny.rst.txt    |  389 +++++++
 .../work_with_microtvm/micro_pytorch.rst.txt       |    4 +-
 .../how_to/work_with_microtvm/micro_train.rst.txt  |   18 +-
 .../work_with_microtvm/sg_execution_times.rst.txt  |   14 +-
 .../work_with_relay/sg_execution_times.rst.txt     |    8 +-
 .../how_to/work_with_schedules/intrin_math.rst.txt |    2 +-
 .../work_with_schedules/sg_execution_times.rst.txt |   14 +-
 .../how_to/work_with_schedules/tensorize.rst.txt   |    2 +-
 .../tutorials/autotvm/sg_execution_times.rst.txt   |    4 +-
 .../frontend/deploy_classification.rst.txt         |    2 +-
 .../tutorials/frontend/deploy_detection.rst.txt    |    2 +-
 .../tutorials/frontend/sg_execution_times.rst.txt  |    6 +-
 .../tutorials/optimize/sg_execution_times.rst.txt  |    6 +-
 .../topic/vta/tutorials/sg_execution_times.rst.txt |    6 +-
 .../tutorial/auto_scheduler_matmul_x86.rst.txt     |    6 +-
 docs/_sources/tutorial/autotvm_matmul_x86.rst.txt  |   20 +-
 docs/_sources/tutorial/autotvm_relay_x86.rst.txt   |   59 +-
 .../tutorial/cross_compilation_and_rpc.rst.txt     |    2 +-
 docs/_sources/tutorial/intro_topi.rst.txt          |    2 +-
 docs/_sources/tutorial/sg_execution_times.rst.txt  |   22 +-
 .../tutorial/tensor_expr_get_started.rst.txt       |   47 +-
 docs/commit_hash                                   |    2 +-
 docs/how_to/compile_models/from_darknet.html       |    2 +-
 docs/how_to/compile_models/from_keras.html         |    2 +-
 docs/how_to/compile_models/from_mxnet.html         |    2 +-
 docs/how_to/compile_models/from_oneflow.html       |   15 +-
 docs/how_to/compile_models/from_pytorch.html       |   10 +-
 docs/how_to/compile_models/from_tensorflow.html    |    2 +-
 docs/how_to/compile_models/sg_execution_times.html |   26 +-
 .../deploy_models/deploy_model_on_adreno.html      |    2 +-
 .../deploy_models/deploy_model_on_android.html     |    2 +-
 .../deploy_object_detection_pytorch.html           |   38 +-
 docs/how_to/deploy_models/deploy_prequantized.html |    9 +-
 .../deploy_models/deploy_prequantized_tflite.html  |    4 +-
 docs/how_to/deploy_models/deploy_quantized.html    |    2 +-
 docs/how_to/deploy_models/deploy_ssd_gluoncv.html  |   35 +-
 docs/how_to/deploy_models/sg_execution_times.html  |   20 +-
 .../extend_tvm/bring_your_own_datatypes.html       |    2 +-
 docs/how_to/extend_tvm/sg_execution_times.html     |    8 +-
 docs/how_to/extend_tvm/use_pass_instrument.html    |   16 +-
 docs/how_to/optimize_operators/opt_conv_cuda.html  |    2 +-
 .../optimize_operators/opt_conv_tensorcore.html    |    2 +-
 docs/how_to/optimize_operators/opt_gemm.html       |   16 +-
 .../optimize_operators/sg_execution_times.html     |    8 +-
 .../sg_execution_times.html                        |   14 +-
 .../tune_conv2d_layer_cuda.html                    | 1165 +++++---------------
 .../tune_with_autoscheduler/tune_network_cuda.html |    4 +-
 .../tune_with_autoscheduler/tune_network_x86.html  |    4 +-
 .../tune_with_autoscheduler/tune_sparse_x86.html   |   78 +-
 .../tune_with_autotvm/sg_execution_times.html      |    6 +-
 .../how_to/tune_with_autotvm/tune_conv2d_cuda.html | 1038 ++++++++++++++++-
 docs/how_to/work_with_microtvm/index.html          |    4 +
 docs/how_to/work_with_microtvm/micro_aot.html      |    1 +
 docs/how_to/work_with_microtvm/micro_autotune.html |   17 +-
 docs/how_to/work_with_microtvm/micro_ethosu.html   |    5 +-
 .../work_with_microtvm/micro_mlperftiny.html       |  796 +++++++++++++
 docs/how_to/work_with_microtvm/micro_pytorch.html  |   10 +-
 .../work_with_microtvm/micro_reference_vm.html     |    1 +
 docs/how_to/work_with_microtvm/micro_tflite.html   |    1 +
 docs/how_to/work_with_microtvm/micro_train.html    |   17 +-
 docs/how_to/work_with_microtvm/micro_tvmc.html     |    1 +
 .../work_with_microtvm/sg_execution_times.html     |   16 +-
 .../how_to/work_with_relay/sg_execution_times.html |    8 +-
 docs/how_to/work_with_schedules/intrin_math.html   |    2 +-
 .../work_with_schedules/sg_execution_times.html    |   14 +-
 docs/how_to/work_with_schedules/tensorize.html     |    2 +-
 docs/install/nnpack.html                           |   12 +-
 docs/objects.inv                                   |  Bin 24181 -> 24304 bytes
 docs/reference/api/python/auto_scheduler.html      |    4 +-
 .../api/typedoc/classes/bytestreamreader.html      |   12 +-
 .../api/typedoc/classes/cachedcallstack.html       |   34 +-
 docs/reference/api/typedoc/classes/dldatatype.html |   12 +-
 docs/reference/api/typedoc/classes/dldevice.html   |   10 +-
 .../reference/api/typedoc/classes/environment.html |   12 +-
 docs/reference/api/typedoc/classes/ffilibrary.html |   20 +-
 .../api/typedoc/classes/graphexecutor.html         |   16 +-
 docs/reference/api/typedoc/classes/instance.html   |   40 +-
 docs/reference/api/typedoc/classes/memory.html     |   34 +-
 docs/reference/api/typedoc/classes/module.html     |   10 +-
 docs/reference/api/typedoc/classes/ndarray.html    |   22 +-
 .../api/typedoc/classes/packedfunccell.html        |    6 +-
 docs/reference/api/typedoc/classes/rpcserver.html  |   14 +-
 docs/reference/api/typedoc/classes/scalar.html     |    6 +-
 .../api/typedoc/classes/webgpucontext.html         |   12 +-
 docs/reference/api/typedoc/enums/argtypecode.html  |   30 +-
 .../api/typedoc/enums/aynccallbackcode.html        |    4 +-
 .../api/typedoc/enums/dldatatypecode.html          |    8 +-
 .../api/typedoc/enums/rpcserverstate.html          |   12 +-
 docs/reference/api/typedoc/enums/sizeof.html       |   18 +-
 docs/reference/api/typedoc/index.html              |  112 +-
 .../api/typedoc/interfaces/disposable.html         |    2 +-
 .../api/typedoc/interfaces/functioninfo.html       |    6 +-
 .../api/typedoc/interfaces/libraryprovider.html    |    4 +-
 docs/searchindex.js                                |    2 +-
 .../vta/tutorials/autotvm/sg_execution_times.html  |    4 +-
 .../tutorials/frontend/deploy_classification.html  |    2 +-
 .../vta/tutorials/frontend/deploy_detection.html   |    2 +-
 .../vta/tutorials/frontend/sg_execution_times.html |    6 +-
 .../vta/tutorials/optimize/sg_execution_times.html |    6 +-
 docs/topic/vta/tutorials/sg_execution_times.html   |    6 +-
 docs/tutorial/auto_scheduler_matmul_x86.html       |    5 +-
 docs/tutorial/autotvm_matmul_x86.html              |   20 +-
 docs/tutorial/autotvm_relay_x86.html               |  273 +++--
 docs/tutorial/cross_compilation_and_rpc.html       |    2 +-
 docs/tutorial/intro_topi.html                      |    2 +-
 docs/tutorial/sg_execution_times.html              |   28 +-
 docs/tutorial/tensor_expr_get_started.html         |   43 +-
 143 files changed, 5296 insertions(+), 2660 deletions(-)

diff --git a/docs/_downloads/1fc1683d67bee4f26703504a58d42578/micro_mlperftiny.ipynb b/docs/_downloads/1fc1683d67bee4f26703504a58d42578/micro_mlperftiny.ipynb
new file mode 100644
index 0000000000..e9dd7442c4
--- /dev/null
+++ b/docs/_downloads/1fc1683d67bee4f26703504a58d42578/micro_mlperftiny.ipynb
@@ -0,0 +1,230 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%%shell\n# Installs the latest dev build of TVM from PyPI. If you wish to build\n# from source, see https://tvm.apache.org/docs/install/from_source.html\npip install apache-tvm --pre"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n\n# Creating Your MLPerfTiny Submission with microTVM\n**Authors**:\n[Mehrdad Hessar](https://github.com/mehrdadh)\n\nThis tutorial is showcasing building an MLPerfTiny submission using microTVM. This\ntutorial shows the steps to import a TFLite model from MLPerfTiny benchmark models,\ncompile it with TVM and generate a Zephyr project which can be flashed to a Zephyr\nsupported board to benchmark the model using EEMBC runner.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Install microTVM Python dependencies\n\nTVM does not include a package for Python serial communication, so\nwe must install one before using microTVM. We will also need TFLite\nto load models.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%%shell\npip install pyserial==3.5 tflite==2.1"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import os\nimport pathlib\nimport tarfile\nimport tempfile\nimport shutil"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Install Zephyr\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%%shell\n# Install west and ninja\npython3 -m pip install west\napt-get install -y ninja-build\n\n# Install ZephyrProject\nZEPHYR_PROJECT_PATH=\"/content/zephyrproject\"\nexport ZEPHYR_BASE=${ZEPHYR_PROJECT_PATH}/zephyr\nwest init ${ZEPHYR_PROJECT_PATH}\ncd ${ZEPHYR_BASE}\ngit checkout v2.7-branch\ncd ..\nwest update\nwest zephyr-export\nchmod -R o+w ${ZEPHYR_PROJECT_PATH}\n\n# Install Zephyr SDK\nZEPHYR_SDK_VERSION=0.13.2\nZEPHYR_SDK_FILE=\"/content/zephyr-sdk-linux-setup.run\" [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "**Note:** Install CMSIS-NN only if you are interested to generate this submission\nusing CMSIS-NN code generator.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Install CMSIS-NN\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%%shell\nCMSIS_SHA=\"51263182d16c92649a48144ba56c0945f9fce60e\"\nCMSIS_URL=\"http://github.com/ARM-software/CMSIS_5/archive/${CMSIS_SHA}.tar.gz\"\nexport CMSIS_PATH=/content/cmsis\nDOWNLOAD_PATH=\"/content/${CMSIS_SHA}.tar.gz\"\nmkdir ${CMSIS_PATH}\nwget ${CMSIS_URL} -O \"${DOWNLOAD_PATH}\"\ntar -xf \"${DOWNLOAD_PATH}\" -C ${CMSIS_PATH} --strip-components=1\nrm ${DOWNLOAD_PATH}\n\nCMSIS_NN_TAG=\"v4.0.0\"\nCMSIS_NN_URL=\"https://github.com/ARM-software/CMSIS-NN.git\"\ngit clone $ [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Import Python dependencies\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import tensorflow as tf\nimport numpy as np\n\nimport tvm\nfrom tvm import relay\nfrom tvm.relay.backend import Executor, Runtime\nfrom tvm.contrib.download import download_testdata\nfrom tvm.micro import export_model_library_format\nfrom tvm.micro.model_library_format import generate_c_interface_header\nfrom tvm.micro.testing.utils import (\n    create_header_file,\n    mlf_extract_workspace_size_bytes,\n)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Import Visual Wake Word Model\n\nTo begin with, download and import the Visual Wake Word (VWW) TFLite model from MLPerfTiny.\nThis model is originally from [MLPerf Tiny repository](https://github.com/mlcommons/tiny).\nWe also capture metadata information from the TFLite model such as input/output name,\nquantization parameters, etc. which will be used in following steps.\n\nWe use indexing for various models to build the submission. The indices are defined as follows:\nTo bui [...]
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "MODEL_URL = \"https://github.com/mlcommons/tiny/raw/bceb91c5ad2e2deb295547d81505721d3a87d578/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite\"\nMODEL_PATH = download_testdata(MODEL_URL, \"vww_96_int8.tflite\", module=\"model\")\n\nMODEL_SHORT_NAME = \"VWW\"\nMODEL_INDEX = 2\n\nUSE_CMSIS = os.environ.get(\"TVM_USE_CMSIS\", False)\n\ntflite_model_buf = open(MODEL_PATH, \"rb\").read()\ntry:\n    import tflite\n\n    tflite_model = tflite.Model.GetRootAsModel( [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Defining Target, Runtime and Executor\n\nNow we need to define the target, runtime and executor to compile this model. In this tutorial,\nwe use Ahead-of-Time (AoT) compilation and we build a standalone project. This is different\nthan using AoT with host-driven mode where the target would communicate with host using host-driven\nAoT executor to run inference.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Use the C runtime (crt)\nRUNTIME = Runtime(\"crt\")\n\n# Use the AoT executor with `unpacked-api=True` and `interface-api=c`. `interface-api=c` forces\n# the compiler to generate C type function APIs and `unpacked-api=True` forces the compiler\n# to generate minimal unpacked format inputs which reduces the stack memory usage on calling\n# inference layers of the model.\nEXECUTOR = Executor(\n    \"aot\",\n    {\"unpacked-api\": True, \"interface-api\": \"c\", \"workspace-byte- [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Compile the model and export model library format\n\nNow, we compile the model for the target. Then, we generate model\nlibrary format for the compiled model. We also need to calculate the\nworkspace size that is required for the compiled model.\n\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "config = {\"tir.disable_vectorize\": True}\nif USE_CMSIS:\n    from tvm.relay.op.contrib import cmsisnn\n\n    config[\"relay.ext.cmsisnn.options\"] = {\"mcpu\": TARGET.mcpu}\n    relay_mod = cmsisnn.partition_for_cmsisnn(relay_mod, params, mcpu=TARGET.mcpu)\n\nwith tvm.transform.PassContext(opt_level=3, config=config):\n    module = tvm.relay.build(\n        relay_mod, target=TARGET, params=params, runtime=RUNTIME, executor=EXECUTOR\n    )\n\ntemp_dir = tvm.contrib.utils.tempdi [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Generate input/output header files\n\nTo create a microTVM standalone project with AoT, we need to generate\ninput and output header files. These header files are used to connect\nthe input and output API from generated code to the rest of the\nstandalone project. For this specific submission, we only need to generate\noutput header file since the input API call is handled differently.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "extra_tar_dir = tvm.contrib.utils.tempdir()\nextra_tar_file = extra_tar_dir / \"extra.tar\"\n\nwith tarfile.open(extra_tar_file, \"w:gz\") as tf:\n    with tempfile.TemporaryDirectory() as tar_temp_dir:\n        model_files_path = os.path.join(tar_temp_dir, \"include\")\n        os.mkdir(model_files_path)\n        header_path = generate_c_interface_header(\n            module.libmod_name, [input_name], [output_name], [], {}, [], 0, model_files_path, {}, {}\n        )\n        tf [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Create the project, build and prepare the project tar file\n\nNow that we have the compiled model as a model library format,\nwe can generate the full project using Zephyr template project. First,\nwe prepare the project options, then build the project. Finally, we\ncleanup the temporary files and move the submission project to the\ncurrent working directory which could be downloaded and used on\nyour development kit.\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "input_total_size = 1\nfor i in range(len(input_shape)):\n    input_total_size *= input_shape[i]\n\ntemplate_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"zephyr\"))\nproject_options = {\n    \"extra_files_tar\": str(extra_tar_file),\n    \"project_type\": \"mlperftiny\",\n    \"board\": BOARD,\n    \"compile_definitions\": [\n        f\"-DWORKSPACE_SIZE={workspace_size + 512}\",  # Memory workspace size, 512 is a temporary offset\n        # since the mem [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Use this project with your board\n\nNow that we have the generated project, you can use this project locally\nto flash your board and prepare it for EEMBC runner software.\nTo do this follow these steps:\n\n```bash\ntar -xf project.tar\ncd project\nmkdir build\ncmake ..\nmake -j2\nwest flash\n```\nNow you can connect your board to EEMBC runner using this\n[instructions](https://github.com/eembc/energyrunner)\nand benchmark this model on your board.\n\n\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.7.5"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/docs/_downloads/4577847d31fa9ec38d0a6dda3e1d178d/micro_mlperftiny.py b/docs/_downloads/4577847d31fa9ec38d0a6dda3e1d178d/micro_mlperftiny.py
new file mode 100644
index 0000000000..79308e0723
--- /dev/null
+++ b/docs/_downloads/4577847d31fa9ec38d0a6dda3e1d178d/micro_mlperftiny.py
@@ -0,0 +1,312 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+.. _tutorial-micro-MLPerfTiny:
+
+Creating Your MLPerfTiny Submission with microTVM
+=================================================
+**Authors**:
+`Mehrdad Hessar <https://github.com/mehrdadh>`_
+
+This tutorial showcases how to build an MLPerfTiny submission using microTVM. It
+shows the steps to import a TFLite model from the MLPerfTiny benchmark models,
+compile it with TVM, and generate a Zephyr project that can be flashed to a
+Zephyr-supported board to benchmark the model with the EEMBC runner.
+"""
+
+######################################################################
+#
+#     .. include:: ../../../../gallery/how_to/work_with_microtvm/install_dependencies.rst
+#
+
+import os
+import pathlib
+import tarfile
+import tempfile
+import shutil
+
+######################################################################
+#
+#     .. include:: ../../../../gallery/how_to/work_with_microtvm/install_zephyr.rst
+#
+
+
+######################################################################
+#
+# **Note:** Install CMSIS-NN only if you are interested in generating this submission
+# using the CMSIS-NN code generator.
+#
+
+######################################################################
+#
+#     .. include:: ../../../../gallery/how_to/work_with_microtvm/install_cmsis.rst
+#
+
+######################################################################
+# Import Python dependencies
+# -------------------------------
+#
+import tensorflow as tf
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm.relay.backend import Executor, Runtime
+from tvm.contrib.download import download_testdata
+from tvm.micro import export_model_library_format
+from tvm.micro.model_library_format import generate_c_interface_header
+from tvm.micro.testing.utils import (
+    create_header_file,
+    mlf_extract_workspace_size_bytes,
+)
+
+######################################################################
+# Import Visual Wake Word Model
+# --------------------------------------------------------------------
+#
+# To begin with, download and import the Visual Wake Word (VWW) TFLite model from MLPerfTiny.
+# This model is originally from the `MLPerf Tiny repository <https://github.com/mlcommons/tiny>`_.
+# We also capture metadata from the TFLite model, such as the input/output names and
+# quantization parameters, which will be used in the following steps.
+#
+# We use an index to identify each model when building the submission. The indices are
+# defined below; to build another model, update the model URL, the short name, and the
+# index number accordingly:
+#
+#   * Keyword Spotting (KWS): 1
+#   * Visual Wake Word (VWW): 2
+#   * Anomaly Detection (AD): 3
+#   * Image Classification (IC): 4
+#
+# If you would like to build the submission with CMSIS-NN, set the ``TVM_USE_CMSIS``
+# environment variable:
+#
+#   .. code-block:: bash
+#
+#     export TVM_USE_CMSIS=1
+#
+
+MODEL_URL = "https://github.com/mlcommons/tiny/raw/bceb91c5ad2e2deb295547d81505721d3a87d578/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite"
+MODEL_PATH = download_testdata(MODEL_URL, "vww_96_int8.tflite", module="model")
+
+MODEL_SHORT_NAME = "VWW"
+MODEL_INDEX = 2
+
+USE_CMSIS = os.environ.get("TVM_USE_CMSIS", False)
+
+with open(MODEL_PATH, "rb") as model_file:
+    tflite_model_buf = model_file.read()
+try:
+    import tflite
+
+    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
+except AttributeError:
+    import tflite.Model
+
+    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
+
+interpreter = tf.lite.Interpreter(model_path=str(MODEL_PATH))
+interpreter.allocate_tensors()
+input_details = interpreter.get_input_details()
+output_details = interpreter.get_output_details()
+
+input_name = input_details[0]["name"]
+input_shape = tuple(input_details[0]["shape"])
+input_dtype = np.dtype(input_details[0]["dtype"]).name
+output_name = output_details[0]["name"]
+output_shape = tuple(output_details[0]["shape"])
+output_dtype = np.dtype(output_details[0]["dtype"]).name
+
+# We extract quantization information from the TFLite model.
+# This is required for all models except Anomaly Detection,
+# because for the other models we send quantized data to the
+# interpreter from the host, whereas for the AD model we send
+# floating-point data and quantization happens on the microcontroller.
+if MODEL_SHORT_NAME != "AD":
+    quant_output_scale = output_details[0]["quantization_parameters"]["scales"][0]
+    quant_output_zero_point = output_details[0]["quantization_parameters"]["zero_points"][0]
+
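+# As a quick aside (a hedged sketch, not part of the original tutorial flow):
+# TFLite uses affine quantization, so a quantized output value ``q`` maps back
+# to a real value as ``real = scale * (q - zero_point)``. The generated project
+# receives these two parameters through the ``OUT_QUANT_SCALE`` and
+# ``OUT_QUANT_ZERO`` compile definitions set later in this tutorial.
+if MODEL_SHORT_NAME != "AD":
+    example_q = -30  # an arbitrary example int8 output value
+    example_real = quant_output_scale * (example_q - int(quant_output_zero_point))
+    print(f"Example dequantized output: {example_real}")
+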
+relay_mod, params = relay.frontend.from_tflite(
+    tflite_model, shape_dict={input_name: input_shape}, dtype_dict={input_name: input_dtype}
+)
+
+######################################################################
+# Defining Target, Runtime and Executor
+# --------------------------------------------------------------------
+#
+# Now we need to define the target, runtime and executor to compile this model. In this tutorial,
+# we use Ahead-of-Time (AoT) compilation and build a standalone project. This is different
+# from using AoT in host-driven mode, where the target communicates with the host using the
+# host-driven AoT executor to run inference.
+#
+
+# Use the C runtime (crt)
+RUNTIME = Runtime("crt")
+
+# Use the AoT executor with `unpacked-api=True` and `interface-api=c`. `interface-api=c` forces
+# the compiler to generate C-style function APIs, and `unpacked-api=True` forces the compiler
+# to generate minimal, unpacked-format inputs, which reduces stack memory usage when calling
+# the inference layers of the model.
+EXECUTOR = Executor(
+    "aot",
+    {"unpacked-api": True, "interface-api": "c", "workspace-byte-alignment": 8},
+)
+
+# Select a Zephyr board
+BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_l4r5zi")
+
+# Get the full target description using the BOARD
+TARGET = tvm.micro.testing.get_target("zephyr", BOARD)
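+
+# For orientation (an optional aside, not required by the tutorial): print the
+# resolved target to see the C compiler options it encodes, e.g. the ``-mcpu``
+# value that the CMSIS-NN partitioning below reads as ``TARGET.mcpu``.
+print(TARGET)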
+
+######################################################################
+# Compile the model and export model library format
+# --------------------------------------------------------------------
+#
+# Now we compile the model for the target. Then we export the compiled
+# model in Model Library Format. We also calculate the workspace size
+# required by the compiled model.
+#
+#
+
+config = {"tir.disable_vectorize": True}
+if USE_CMSIS:
+    from tvm.relay.op.contrib import cmsisnn
+
+    config["relay.ext.cmsisnn.options"] = {"mcpu": TARGET.mcpu}
+    relay_mod = cmsisnn.partition_for_cmsisnn(relay_mod, params, mcpu=TARGET.mcpu)
+
+with tvm.transform.PassContext(opt_level=3, config=config):
+    module = tvm.relay.build(
+        relay_mod, target=TARGET, params=params, runtime=RUNTIME, executor=EXECUTOR
+    )
+
+temp_dir = tvm.contrib.utils.tempdir()
+model_tar_path = temp_dir / "model.tar"
+export_model_library_format(module, model_tar_path)
+workspace_size = mlf_extract_workspace_size_bytes(model_tar_path)
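+
+# Optional sanity check (a hedged sketch, not part of the original flow): the
+# exported archive follows Model Library Format, bundling the generated sources
+# and metadata. Listing a few members shows what it contains; exact member
+# names may vary across TVM versions.
+with tarfile.open(str(model_tar_path)) as mlf_tar:
+    for member_name in mlf_tar.getnames()[:10]:
+        print(member_name)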
+
+######################################################################
+# Generate input/output header files
+# --------------------------------------------------------------------
+#
+# To create a microTVM standalone project with AoT, we need to generate
+# input and output header files. These header files connect the input and
+# output APIs of the generated code to the rest of the standalone project.
+# For this specific submission, we only need to generate the output header
+# file, since the input API call is handled differently.
+#
+
+extra_tar_dir = tvm.contrib.utils.tempdir()
+extra_tar_file = extra_tar_dir / "extra.tar"
+
+with tarfile.open(extra_tar_file, "w:gz") as tar_file:  # "tar_file" avoids shadowing the tensorflow "tf" import
+    with tempfile.TemporaryDirectory() as tar_temp_dir:
+        model_files_path = os.path.join(tar_temp_dir, "include")
+        os.mkdir(model_files_path)
+        header_path = generate_c_interface_header(
+            module.libmod_name, [input_name], [output_name], [], {}, [], 0, model_files_path, {}, {}
+        )
+        tar_file.add(header_path, arcname=os.path.relpath(header_path, tar_temp_dir))
+
+    create_header_file(
+        "output_data",
+        np.zeros(
+            shape=output_shape,
+            dtype=output_dtype,
+        ),
+        "include",
+        tar_file,
+    )
+
+######################################################################
+# Create the project, build and prepare the project tar file
+# --------------------------------------------------------------------
+#
+# Now that we have the compiled model in Model Library Format, we can
+# generate the full project using the Zephyr template project. First,
+# we prepare the project options, then build the project. Finally, we
+# clean up the temporary files and move the submission project to the
+# current working directory, where it can be downloaded and used on
+# your development kit.
+#
+
+input_total_size = 1
+for dim in input_shape:
+    input_total_size *= dim
+
+template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr"))
+project_options = {
+    "extra_files_tar": str(extra_tar_file),
+    "project_type": "mlperftiny",
+    "board": BOARD,
+    "compile_definitions": [
+        f"-DWORKSPACE_SIZE={workspace_size + 512}",  # Memory workspace size, 512 is a temporary offset
+        # since the memory calculation is not accurate.
+        f"-DTARGET_MODEL={MODEL_INDEX}",  # Sets the model index for project compilation.
+        f"-DTH_MODEL_VERSION=EE_MODEL_VERSION_{MODEL_SHORT_NAME}01",  # Sets model version. This is required by MLPerfTiny API.
+        f"-DMAX_DB_INPUT_SIZE={input_total_size}",  # Max size of the input data array.
+    ],
+}
+
+if MODEL_SHORT_NAME != "AD":
+    project_options["compile_definitions"].append(f"-DOUT_QUANT_SCALE={quant_output_scale}")
+    project_options["compile_definitions"].append(f"-DOUT_QUANT_ZERO={quant_output_zero_point}")
+
+if USE_CMSIS:
+    project_options["compile_definitions"].append(f"-DCOMPILE_WITH_CMSISNN=1")
+
+# Note: You might need to adjust this based on the board that you are using.
+project_options["config_main_stack_size"] = 4000
+
+if USE_CMSIS:
+    project_options["cmsis_path"] = os.environ.get("CMSIS_PATH", "/content/cmsis")
+
+generated_project_dir = temp_dir / "project"
+
+project = tvm.micro.project.generate_project_from_mlf(
+    template_project_path, generated_project_dir, model_tar_path, project_options
+)
+project.build()
+
+# Cleanup the build directory and extra artifacts
+shutil.rmtree(generated_project_dir / "build")
+(generated_project_dir / "model.tar").unlink()
+
+project_tar_path = pathlib.Path(os.getcwd()) / "project.tar"
+with tarfile.open(project_tar_path, "w:tar") as tar:
+    tar.add(generated_project_dir, arcname=os.path.basename("project"))
+
+print(f"The generated project is located here: {project_tar_path}")
+
+######################################################################
+# Use this project with your board
+# --------------------------------------------------------------------
+#
+# Now that we have the generated project, you can use it locally
+# to flash your board and prepare it for the EEMBC runner software.
+# To do this, follow these steps:
+#
+#   .. code-block:: bash
+#
+#     tar -xf project.tar
+#     cd project
+#     mkdir build
+#     cd build
+#     cmake ..
+#     make -j2
+#     west flash
+#
+# Now you can connect your board to the EEMBC runner following these
+# `instructions <https://github.com/eembc/energyrunner>`_
+# and benchmark this model on your board.
+#
diff --git a/docs/_images/sphx_glr_micro_mlperftiny_thumb.png b/docs/_images/sphx_glr_micro_mlperftiny_thumb.png
new file mode 100644
index 0000000000..8a5fed589d
Binary files /dev/null and b/docs/_images/sphx_glr_micro_mlperftiny_thumb.png differ
diff --git a/docs/_images/sphx_glr_micro_train_001.png b/docs/_images/sphx_glr_micro_train_001.png
index 58230570fb..9c86215278 100644
Binary files a/docs/_images/sphx_glr_micro_train_001.png and b/docs/_images/sphx_glr_micro_train_001.png differ
diff --git a/docs/_images/sphx_glr_micro_train_thumb.png b/docs/_images/sphx_glr_micro_train_thumb.png
index c7f45c5bc4..839a4f5975 100644
Binary files a/docs/_images/sphx_glr_micro_train_thumb.png and b/docs/_images/sphx_glr_micro_train_thumb.png differ
diff --git a/docs/_sources/how_to/compile_models/from_darknet.rst.txt b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
index 9590b9888c..520dc78408 100644
--- a/docs/_sources/how_to/compile_models/from_darknet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
@@ -318,7 +318,7 @@ The process is no different from other examples.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  15.692 seconds)
+   **Total running time of the script:** ( 1 minutes  16.257 seconds)
 
 
 .. _sphx_glr_download_how_to_compile_models_from_darknet.py:
diff --git a/docs/_sources/how_to/compile_models/from_keras.rst.txt b/docs/_sources/how_to/compile_models/from_keras.rst.txt
index 5a84dca18c..afdcda9a9c 100644
--- a/docs/_sources/how_to/compile_models/from_keras.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_keras.rst.txt
@@ -232,7 +232,7 @@ Look up prediction top 1 index in 1000 class synset.
  .. code-block:: none
 
     Relay top-1 id: 285, class name: Egyptian cat
-
    1/1 [==============================] - ETA: 0s
    1/1 [==============================] - 1s 917ms/step
+
    1/1 [==============================] - ETA: 0s
    1/1 [==============================] - 1s 958ms/step
     Keras top-1 id: 285, class name: Egyptian cat
 
 
diff --git a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
index 19e02d1340..6a0580eee4 100644
--- a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
@@ -116,7 +116,7 @@ In this section, we download a pretrained imagenet model and classify an image.
 
  .. code-block:: none
 
-    Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip66409e78-3c6c-484b-aaff-ffc414c3d71d from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+    Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip74b370c8-6b03-402b-8791-7ba7f0d5f123 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
     x (1, 3, 224, 224)
 
 
diff --git a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
index 1aa206f637..96c7e6a257 100644
--- a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
@@ -121,7 +121,7 @@ Load a pretrained OneFlow model and save model
  .. code-block:: none
 
     Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
-
      0%|          | 0.00/41.5M [00:00<?, ?B/s]
     15%|#5        | 6.33M/41.5M [00:00<00:00, 53.8MB/s]
     28%|##7       | 11.5M/41.5M [00:00<00:00, 46.0MB/s]
     38%|###8      | 15.9M/41.5M [00:00<00:00, 43.9MB/s]
     48%|####8     | 20.1M/41.5M [00:00<00:00, 37.7MB/s]
     58%|#####7    | 24.0M/41.5M [00:00<00:00, 30.2MB/s]
     77%|#######7  | 32.0M/41.5M [00:00<00:00, 38.9MB/s]
     92%|#########2| 38.3M/41.5M [00:00<00:00, 44.0MB/s]
    100%|##########| 41.5M/41.5M [00:01<00:00, 42.3MB/s]
+
      0%|          | 0.00/41.5M [00:00<?, ?B/s]
     19%|#9        | 7.99M/41.5M [00:00<00:00, 51.7MB/s]
     39%|###8      | 16.0M/41.5M [00:00<00:00, 48.4MB/s]
     58%|#####7    | 24.0M/41.5M [00:00<00:00, 45.0MB/s]
     77%|#######7  | 32.0M/41.5M [00:00<00:00, 47.3MB/s]
     90%|########9 | 37.2M/41.5M [00:00<00:00, 44.8MB/s]
    100%|#########9| 41.4M/41.5M [00:00<00:00, 43.5MB/s]
    100%|##########| 41.5M/41.5M [00:00<00:00, 45.3MB/s]
 
 
 
diff --git a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
index 503af40b7b..686e7a5c10 100644
--- a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
@@ -101,7 +101,7 @@ Load a pretrained PyTorch model
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=ResNet18_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet18_Weights.DEFAULT` to get the most up-to-date weights.
       warnings.warn(msg)
     Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
-
      0%|          | 0.00/44.7M [00:00<?, ?B/s]
     18%|#7        | 7.99M/44.7M [00:00<00:00, 59.1MB/s]
     36%|###5      | 16.0M/44.7M [00:00<00:00, 48.4MB/s]
     54%|#####3    | 24.0M/44.7M [00:00<00:00, 59.5MB/s]
     72%|#######1  | 32.0M/44.7M [00:00<00:00, 60.7MB/s]
     90%|########9 | 40.0M/44.7M [00:00<00:00, 58.3MB/s]
    100%|##########| 44.7M/44.7M [00:00<00:00, 62.7MB/s]
+
      0%|          | 0.00/44.7M [00:00<?, ?B/s]
     24%|##3       | 10.6M/44.7M [00:00<00:00, 112MB/s]
     51%|#####1    | 22.9M/44.7M [00:00<00:00, 122MB/s]
     77%|#######7  | 34.5M/44.7M [00:00<00:00, 108MB/s]
    100%|##########| 44.7M/44.7M [00:00<00:00, 108MB/s]
 
 
 
diff --git a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
index 141dd1c518..8769ede750 100644
--- a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
@@ -424,7 +424,7 @@ Run the corresponding model on tensorflow
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  18.253 seconds)
+   **Total running time of the script:** ( 1 minutes  20.531 seconds)
 
 
 .. _sphx_glr_download_how_to_compile_models_from_tensorflow.py:
diff --git a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
index df3a295ebc..e16f971ac7 100644
--- a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
@@ -5,26 +5,26 @@
 
 Computation times
 =================
-**06:10.782** total execution time for **how_to_compile_models** files:
+**06:19.340** total execution time for **how_to_compile_models** files:
 
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:18.253 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:20.531 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``)       | 01:15.692 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``)       | 01:16.257 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``)         | 00:50.434 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``)         | 00:52.501 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``)       | 00:34.219 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``)       | 00:35.465 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``)         | 00:29.567 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``)           | 00:30.357 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``)           | 00:29.050 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``)         | 00:30.149 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``)         | 00:26.740 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``)         | 00:26.165 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``)       | 00:24.063 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``)       | 00:24.732 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``)           | 00:20.186 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``)           | 00:20.531 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``)             | 00:02.577 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``)             | 00:02.651 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt
index 7aa79bbba8..32d6a1720c 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt
@@ -727,7 +727,7 @@ well as provides information about the model's performance
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-     2545.4545    2545.3663    2547.9816    2543.9309      1.2446   
+     2550.8269    2549.1817    2566.5008    2545.9899      5.4542   
                
 
 
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
index da16fb4a54..8aae35985b 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
@@ -437,7 +437,7 @@ Execute on TVM
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      15.6575      15.6385      15.8746      15.5296       0.0853   
+      16.0102      15.8558      16.7513      15.6779       0.3372   
                
 
 
diff --git a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
index b975e74f1b..0d9e58e5d8 100644
--- a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
@@ -130,7 +130,7 @@ Load pre-trained maskrcnn from torchvision and do tracing
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=MaskRCNN_ResNet50_FPN_Weights.COCO_V1`. You can also use `weights=MaskRCNN_ResNet50_FPN_Weights.DEFAULT` to get the most up-to-date weights.
       warnings.warn(msg)
     Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
-
      0%|          | 0.00/170M [00:00<?, ?B/s]
      5%|4         | 7.99M/170M [00:00<00:02, 82.1MB/s]
      9%|9         | 16.1M/170M [00:00<00:01, 82.3MB/s]
     15%|#5        | 26.1M/170M [00:00<00:01, 92.4MB/s]
     21%|##        | 35.0M/170M [00:00<00:02, 59.3MB/s]
     28%|##8       | 48.0M/170M [00:00<00:01, 71.4MB/s]
     36%|###5      | 61.1M/170M [00:00<00:01, 88.1MB/s]
     42%|####1     | 70.6M/170M [00:00<00:01, 85.6MB/s]
     47%|####6     | 79.5M/170M [00:01<00:01, 87.2MB/s]
     52%|#####1    | 88.3M/170M [00:01<00:01, 81.7MB/s]
     60%|######    | 102M/170M [00:01<00:00, 99.3MB/s] 
     66%|######6   | 112M/170M [00:01<00:00, 97.2MB/s]
     72%|#######1  | 122M/170M [00:01<00:00, 96.3MB/s]
     77%|#######7  | 132M/170M [00:01<00:00, 96.7MB/s]
     83%|########3 | 141M/170M [00:01<00:00, 98.5MB/s]
     89%|########8 | 151M/170M [00:01<00:00, 88.0MB/s]
     94%|#########4| 160M/170M [00:01<00:00, 76.6MB/s]
     99%|#########8| 168M/170M [00:02<00:00, 73.4MB/s]
    100%|##########| 170M/170M [00:02<00:00, 83.2MB/s]
+
      0%|          | 0.00/170M [00:00<?, ?B/s]
      5%|4         | 7.99M/170M [00:00<00:02, 74.9MB/s]
      9%|8         | 15.1M/170M [00:00<00:02, 59.8MB/s]
     13%|#3        | 22.9M/170M [00:00<00:02, 68.1MB/s]
     18%|#7        | 30.3M/170M [00:00<00:02, 53.7MB/s]
     21%|##1       | 35.9M/170M [00:00<00:03, 46.6MB/s]
     24%|##3       | 40.7M/170M [00:00<00:02, 47.0MB/s]
     33%|###2      | 56.0M/170M [00:00<00:01, 74.1MB/s]
     38%|###7      | 64.0M/170M [00:01<00:01, 74.4MB/s]
     43%|####3     | 73.4M/170M [00:01<00:01, 80.8MB/s]
     52%|#####1    | 88.0M/170M [00:01<00:00, 96.1MB/s]
     61%|######1   | 104M/170M [00:01<00:00, 99.5MB/s] 
     67%|######6   | 114M/170M [00:01<00:00, 88.9MB/s]
     72%|#######1  | 122M/170M [00:01<00:00, 87.8MB/s]
     77%|#######6  | 131M/170M [00:01<00:00, 70.9MB/s]
     81%|########1 | 138M/170M [00:01<00:00, 72.0MB/s]
     86%|########5 | 145M/170M [00:02<00:00, 70.0MB/s]
     94%|#########4| 160M/170M [00:02<00:00, 84.1MB/s]
    100%|##########| 170M/170M [00:02<00:00, 78.1MB/s]
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/nn/functional.py:3897: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
       for i in range(dim)
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/detection/anchor_utils.py:124: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -299,7 +299,7 @@ Get boxes with score larger than 0.9
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 3 minutes  22.431 seconds)
+   **Total running time of the script:** ( 3 minutes  28.344 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_object_detection_pytorch.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
index c6b1220425..448c51b8f9 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
@@ -227,7 +227,7 @@ training. Other models require a full post training calibration.
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=MobileNet_V2_Weights.IMAGENET1K_V1`. You can also use `weights=MobileNet_V2_Weights.DEFAULT` to get the most up-to-date weights.
       warnings.warn(msg)
     Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
-
      0%|          | 0.00/13.6M [00:00<?, ?B/s]
     59%|#####8    | 7.99M/13.6M [00:00<00:00, 61.9MB/s]
    100%|##########| 13.6M/13.6M [00:00<00:00, 64.4MB/s]
+
      0%|          | 0.00/13.6M [00:00<?, ?B/s]
     47%|####6     | 6.30M/13.6M [00:00<00:00, 53.4MB/s]
     84%|########4 | 11.4M/13.6M [00:00<00:00, 50.5MB/s]
    100%|##########| 13.6M/13.6M [00:00<00:00, 39.7MB/s]
 
 
 
@@ -409,7 +409,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      90.5913      90.3806      99.0003      90.0003       1.0226   
+      90.2068      90.1677      90.8147      90.0469       0.1458   
                
 
 
@@ -458,7 +458,7 @@ TODO
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  11.835 seconds)
+   **Total running time of the script:** ( 1 minutes  12.919 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_prequantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
index 7bbee640ae..29b3b11e05 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
@@ -423,7 +423,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      119.7466     119.7560     124.1398     117.9933      0.6389   
+      119.6592     119.5643     126.8230     119.0097      0.8249   
                
 
 
@@ -460,7 +460,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 2 minutes  40.818 seconds)
+   **Total running time of the script:** ( 2 minutes  32.610 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_prequantized_tflite.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
index b20122ac57..405d588589 100644
--- a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
@@ -257,7 +257,7 @@ We create a Relay VM to build and execute the model.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  30.392 seconds)
+   **Total running time of the script:** ( 1 minutes  40.483 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_quantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
index a157b2ce3a..96a3e40b6b 100644
--- a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
@@ -170,7 +170,7 @@ Convert and compile model for CPU.
             data: None
       input_sym_arg_type = in_param.infer_type()[0]
     Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
-
      0%|          | 0/132723 [00:00<?, ?KB/s]
      5%|4         | 6457/132723 [00:00<00:01, 64555.75KB/s]
     11%|#         | 14541/132723 [00:00<00:01, 74125.57KB/s]
     17%|#7        | 22584/132723 [00:00<00:01, 77002.52KB/s]
     23%|##3       | 30612/132723 [00:00<00:01, 78294.66KB/s]
     29%|##9       | 38649/132723 [00:00<00:01, 79039.49KB/s]
     35%|###5      | 46553/132723 [00:00<00:01, 78975.48KB/s]
     41%|####1     | 54619/132723 [00:00<00:00, 79521.24KB/s]
     47%|####7     | 62721/132723 [00:00<00:00, 79996.31KB/s]
     53%|#####3    | 70817/132723 [00:00<00:00, 80293.85KB/s]
     59%|#####9    | 78847/132723 [00:01<00:00, 80269.78KB/s]
     66%|######5   | 86990/132723 [00:01<00:00, 80619.81KB/s]
     72%|#######1  | 95071/132723 [00:01<00:00, 80675.10KB/s]
     78%|#######7  | 103249/132723 [00:01<00:00, 81005.19KB/s]
     84%|########3 | 111350/132723 [00:01<00:00, 80907.17KB/s]
     90%|######### | 119477/132723 [00:01<00:00, 81012.32KB/s]
     96%|#########6| 127579/132723 [00:01<00:00, 80864.64KB/s]
    100%|##########| 132723/132723 [00:01<00:00, 79718.08KB/s]
+
      0%|          | 0/132723 [00:00<?, ?KB/s]
      5%|5         | 6645/132723 [00:00<00:01, 66444.68KB/s]
     12%|#1        | 15500/132723 [00:00<00:01, 79443.08KB/s]
     18%|#8        | 24276/132723 [00:00<00:01, 83238.71KB/s]
     25%|##4       | 33050/132723 [00:00<00:01, 85011.97KB/s]
     31%|###1      | 41552/132723 [00:00<00:01, 84301.38KB/s]
     38%|###7      | 49984/132723 [00:00<00:01, 81095.71KB/s]
     44%|####4     | 58717/132723 [00:00<00:00, 83069.29KB/s]
     51%|#####     | 67526/132723 [00:00<00:00, 84632.76KB/s]
     57%|#####7    | 76271/132723 [00:00<00:00, 85499.51KB/s]
     64%|######4   | 85023/132723 [00:01<00:00, 86114.52KB/s]
     71%|#######   | 93817/132723 [00:01<00:00, 86665.19KB/s]
     77%|#######7  | 102637/132723 [00:01<00:00, 87119.68KB/s]
     84%|########3 | 111464/132723 [00:01<00:00, 87464.64KB/s]
     91%|######### | 120251/132723 [00:01<00:00, 87584.54KB/s]
     97%|#########7| 129109/132723 [00:01<00:00, 87880.14KB/s]
    100%|##########| 132723/132723 [00:01<00:00, 85230.00KB/s]
 
 
 
@@ -246,7 +246,7 @@ Display result
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 3 minutes  27.363 seconds)
+   **Total running time of the script:** ( 3 minutes  29.026 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_ssd_gluoncv.py:
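The download log above fetches a pretrained GluonCV SSD model; the subsequent "Convert and compile model for CPU" step imports it into Relay. Roughly, under the tutorial's 512x512 input assumption:

.. code-block:: python

    from gluoncv import model_zoo
    from tvm import relay

    # Fetch the same pretrained SSD model whose download progress is logged above.
    block = model_zoo.get_model("ssd_512_resnet50_v1_voc", pretrained=True)

    # Import the Gluon block into Relay with the tutorial's input shape.
    dshape = (1, 3, 512, 512)
    mod, params = relay.frontend.from_mxnet(block, {"data": dshape})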
diff --git a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
index a720927d70..0e3033bcb8 100644
--- a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
@@ -5,26 +5,26 @@
 
 Computation times
 =================
-**14:39.203** total execution time for **how_to_deploy_models** files:
+**14:51.021** total execution time for **how_to_deploy_models** files:
 
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)                           | 03:27.363 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)                           | 03:29.026 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 03:22.431 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 03:28.344 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)           | 02:40.818 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)           | 02:32.610 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``)                               | 01:30.392 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``)                               | 01:40.483 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``)                         | 01:11.835 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``)                         | 01:12.919 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_adreno.py` (``deploy_model_on_adreno.py``)                   | 00:53.054 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_adreno.py` (``deploy_model_on_adreno.py``)                   | 00:53.319 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``)                 | 00:39.334 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``)                 | 00:39.824 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``)                       | 00:27.186 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``)                       | 00:27.427 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)                       | 00:26.783 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)                       | 00:27.063 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``)                                     | 00:00.006 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
index df3ccb5fa5..5b1d109e8d 100644
--- a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
@@ -463,7 +463,7 @@ First let us define two helper functions to get the mobilenet model and a cat im
 
  .. code-block:: none
 
-    Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zipa1643410-c026-4394-a685-fa334720df8f from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+    Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zipcb3154e2-9390-47b4-8140-e0c63d32f07a from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
 
 
 
diff --git a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
index c49548f90e..760ff0207a 100644
--- a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
 
 Computation times
 =================
-**00:51.247** total execution time for **how_to_extend_tvm** files:
+**00:52.015** total execution time for **how_to_extend_tvm** files:
 
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:47.558 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:48.296 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``)           | 00:02.625 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``)           | 00:02.653 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``)                     | 00:01.057 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``)                     | 00:01.060 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``)       | 00:00.007 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
index faab3c5044..a67020cf5a 100644
--- a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
@@ -220,10 +220,10 @@ profile the execution time of each pass.
  .. code-block:: none
 
     Printing results of timing profile...
-    InferType: 18341us [18341us] (48.70%; 48.70%)
-    FoldScaleAxis: 19321us [7us] (51.30%; 51.30%)
-            FoldConstant: 19314us [1666us] (51.28%; 99.97%)
-                    InferType: 17648us [17648us] (46.86%; 91.37%)
+    InferType: 18158us [18158us] (48.55%; 48.55%)
+    FoldScaleAxis: 19244us [7us] (51.45%; 51.45%)
+            FoldConstant: 19237us [1733us] (51.43%; 99.96%)
+                    InferType: 17505us [17505us] (46.80%; 90.99%)
 
 
 
@@ -262,10 +262,10 @@ Refer to the following sections and :py:func:`tvm.instrument.pass_instrument` for th
  .. code-block:: none
 
     Printing results of timing profile...
-    InferType: 18065us [18065us] (48.66%; 48.66%)
-    FoldScaleAxis: 19058us [5us] (51.34%; 51.34%)
-            FoldConstant: 19053us [1658us] (51.32%; 99.97%)
-                    InferType: 17395us [17395us] (46.86%; 91.30%)
+    InferType: 17363us [17363us] (47.85%; 47.85%)
+    FoldScaleAxis: 18921us [5us] (52.15%; 52.15%)
+            FoldConstant: 18916us [1709us] (52.13%; 99.97%)
+                    InferType: 17208us [17208us] (47.42%; 90.97%)
 
 
 
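The timing profiles above are produced by TVM's pass instrumentation. A minimal sketch of generating such a profile with ``PassTimingInstrument`` (the small ReLU module is only a placeholder workload):

.. code-block:: python

    import tvm
    from tvm import relay
    from tvm.ir.instrument import PassTimingInstrument

    # Placeholder Relay module to run passes over.
    x = relay.var("x", shape=(1, 16), dtype="float32")
    mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))

    timing_inst = PassTimingInstrument()
    with tvm.transform.PassContext(instruments=[timing_inst]):
        mod = relay.transform.InferType()(mod)
        mod = relay.transform.FoldScaleAxis()(mod)
        # render() must be called while the PassContext is still active.
        profile = timing_inst.render()

    print("Printing results of timing profile...")
    print(profile)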
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
index 53e28d9051..027d52359e 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
@@ -331,7 +331,7 @@ latency of convolution.
 
  .. code-block:: none
 
-    Convolution: 54.193183 ms
+    Convolution: 44.062721 ms
 
 
 
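Latency numbers like the convolution time above are gathered with TVM's time evaluator, which runs a built function repeatedly and reports the mean. A self-contained sketch on CPU with a trivial elementwise kernel (the tutorial applies the same pattern to its CUDA convolution):

.. code-block:: python

    import numpy as np
    import tvm
    from tvm import te

    # Trivial kernel, B[i] = A[i] * 2, as a stand-in workload.
    n = 1 << 20
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] * 2.0, name="B")
    s = te.create_schedule(B.op)
    func = tvm.build(s, [A, B], target="llvm")

    dev = tvm.cpu(0)
    a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), dev)
    b = tvm.nd.array(np.zeros(n, dtype="float32"), dev)

    # Average over 10 runs, reported in milliseconds.
    evaluator = func.time_evaluator(func.entry_name, dev, number=10)
    print("kernel: %f ms" % (evaluator(a, b).mean * 1e3))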
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
index db479b2ed2..9701ef674d 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
@@ -608,7 +608,7 @@ be able to run on our build server
 
  .. code-block:: none
 
-    conv2d with tensor core: 8.941158 ms
+    conv2d with tensor core: 10.772263 ms
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
index 968ad5df78..df0ed7c798 100644
--- a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
@@ -134,8 +134,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
 
  .. code-block:: none
 
-    Numpy running time: 0.018154
-    Baseline: 3.297522
+    Numpy running time: 0.018628
+    Baseline: 3.397229
 
 
 
@@ -227,7 +227,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
 
  .. code-block:: none
 
-    Opt1: 0.298674
+    Opt1: 0.295933
 
 
 
@@ -318,7 +318,7 @@ In this tutorial, we chose to vectorize the inner loop row data since it is cach
 
  .. code-block:: none
 
-    Opt2: 0.336746
+    Opt2: 0.332504
 
 
 
@@ -406,7 +406,7 @@ the access pattern for the A matrix is more cache friendly.
 
  .. code-block:: none
 
-    Opt3: 0.115573
+    Opt3: 0.116739
 
 
 
@@ -523,7 +523,7 @@ flattening.
 
  .. code-block:: none
 
-    Opt4: 0.110064
+    Opt4: 0.109848
 
 
 
@@ -635,7 +635,7 @@ write to C when all the block results are ready.
 
  .. code-block:: none
 
-    Opt5: 0.111653
+    Opt5: 0.112016
 
 
 
@@ -748,7 +748,7 @@ Furthermore, we can also utilize multi-core processors to do the thread-level pa
 
  .. code-block:: none
 
-    Opt6: 0.148195
+    Opt6: 0.147469
 
 
 
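The Opt1-Opt6 figures above come from applying successive schedule primitives (blocking, vectorization, loop permutation, packing, write caching, parallelization) to the same 1024x1024 matmul. A condensed sketch combining several of these primitives; the block and split factors below are illustrative choices, not the tutorial's exact schedule:

.. code-block:: python

    import tvm
    from tvm import te

    M = K = N = 1024
    bn = 32  # 32*32 floats = 4KB per block, sized to fit in L1

    A = te.placeholder((M, K), name="A")
    B = te.placeholder((K, N), name="B")
    k = te.reduce_axis((0, K), name="k")
    C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C")

    s = te.create_schedule(C.op)
    # Blocking and reordering for locality, a vectorized inner loop,
    # and thread-level parallelism over the outer blocks.
    mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
    ko, ki = s[C].split(k, factor=4)
    s[C].reorder(mo, no, ko, ki, mi, ni)
    s[C].vectorize(ni)
    s[C].parallel(mo)

    func = tvm.build(s, [A, B, C], target="llvm")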
diff --git a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
index e7f42e87eb..151dc4eced 100644
--- a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
 
 Computation times
 =================
-**00:34.691** total execution time for **how_to_optimize_operators** files:
+**00:34.839** total execution time for **how_to_optimize_operators** files:
 
 +-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``)                       | 00:31.987 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``)                       | 00:32.242 | 0.0 MB |
 +-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.528 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.513 | 0.0 MB |
 +-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``)             | 00:01.176 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``)             | 00:01.085 | 0.0 MB |
 +-----------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
index e44a395a2d..19f750c4c4 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
@@ -5,18 +5,18 @@
 
 Computation times
 =================
-**09:26.723** total execution time for **how_to_tune_with_autoscheduler** files:
+**09:28.448** total execution time for **how_to_tune_with_autoscheduler** files:
 
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 05:37.090 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 05:46.341 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``)             | 01:37.813 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``)             | 01:38.610 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``)           | 01:05.122 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``)           | 01:05.810 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``)               | 00:40.466 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``)               | 00:31.182 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``)             | 00:13.629 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``)             | 00:13.841 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``)           | 00:12.603 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``)           | 00:12.664 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
index dc8772bcda..ee25ed6dc0 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
@@ -245,12 +245,12 @@ cooperative fetching, unrolling and operator fusion.
             bias_1 = T.match_buffer(bias, (1, 512, 1, 1))
             compute_1 = T.match_buffer(compute, (1, 512, 7, 7))
             blockIdx_x = T.env_thread("blockIdx.x")
-            T.launch_thread(blockIdx_x, 28)
+            T.launch_thread(blockIdx_x, 16)
             conv2d_nchw = T.allocate([14], "float32", "local")
-            pad_temp_shared = T.allocate([72], "float32", "shared")
-            kernel_shared = T.allocate([3072], "float32", "shared")
+            pad_temp_shared = T.allocate([2592], "float32", "shared")
+            kernel_shared = T.allocate([9216], "float32", "shared")
             threadIdx_x = T.env_thread("threadIdx.x")
-            T.launch_thread(threadIdx_x, 64)
+            T.launch_thread(threadIdx_x, 112)
             conv2d_nchw_1 = T.buffer_decl((14,), data=conv2d_nchw, scope="local", align=32)
             conv2d_nchw_1[0] = T.float32(0)
             conv2d_nchw_1[1] = T.float32(0)
@@ -266,460 +266,146 @@ cooperative fetching, unrolling and operator fusion.
             conv2d_nchw_1[11] = T.float32(0)
             conv2d_nchw_1[12] = T.float32(0)
             conv2d_nchw_1[13] = T.float32(0)
-            for rc_outer_outer, ry_outer_outer in T.grid(64, 3):
-                cse_var_2: T.int32 = rc_outer_outer * 72
-                cse_var_1: T.int32 = ry_outer_outer * 3
+            for rc_outer_outer in range(16):
+                cse_var_1: T.int32 = rc_outer_outer * 1568
                 threadIdx_x_1 = T.env_thread("threadIdx.x")
-                pad_temp_shared_1 = T.buffer_decl((72,), data=pad_temp_shared, scope="shared")
-                with T.launch_thread(threadIdx_x_1, 64):
-                    data_2 = T.buffer_decl((25088,), data=data_1.data)
-                    if T.likely(threadIdx_x_1 < 18):
-                        pad_temp_shared_1[threadIdx_x_1 * 4] = T.if_then_else(1 <= ry_outer_outer + blockIdx_x % 7 and ry_outer_outer + blockIdx_x % 7 < 8 and 1 <= threadIdx_x_1 * 4 % 9 and threadIdx_x_1 * 4 % 9 < 8, data_2[rc_outer_outer * 392 + threadIdx_x_1 * 4 // 9 * 49 + ry_outer_outer * 7 + blockIdx_x % 7 * 7 + threadIdx_x_1 * 4 % 9 - 8], T.float32(0))
-                    if T.likely(threadIdx_x_1 < 18):
-                        pad_temp_shared_1[threadIdx_x_1 * 4 + 1] = T.if_then_else(1 <= ry_outer_outer + blockIdx_x % 7 and ry_outer_outer + blockIdx_x % 7 < 8 and 1 <= (threadIdx_x_1 * 4 + 1) % 9 and (threadIdx_x_1 * 4 + 1) % 9 < 8, data_2[rc_outer_outer * 392 + (threadIdx_x_1 * 4 + 1) // 9 * 49 + ry_outer_outer * 7 + blockIdx_x % 7 * 7 + (threadIdx_x_1 * 4 + 1) % 9 - 8], T.float32(0))
-                    if T.likely(threadIdx_x_1 < 18):
-                        pad_temp_shared_1[threadIdx_x_1 * 4 + 2] = T.if_then_else(1 <= ry_outer_outer + blockIdx_x % 7 and ry_outer_outer + blockIdx_x % 7 < 8 and 1 <= (threadIdx_x_1 * 4 + 2) % 9 and (threadIdx_x_1 * 4 + 2) % 9 < 8, data_2[rc_outer_outer * 392 + (threadIdx_x_1 * 4 + 2) // 9 * 49 + ry_outer_outer * 7 + blockIdx_x % 7 * 7 + (threadIdx_x_1 * 4 + 2) % 9 - 8], T.float32(0))
-                    if T.likely(threadIdx_x_1 < 18):
-                        pad_temp_shared_1[threadIdx_x_1 * 4 + 3] = T.if_then_else(1 <= ry_outer_outer + blockIdx_x % 7 and ry_outer_outer + blockIdx_x % 7 < 8 and 1 <= (threadIdx_x_1 * 4 + 3) % 9 and (threadIdx_x_1 * 4 + 3) % 9 < 8, data_2[rc_outer_outer * 392 + (threadIdx_x_1 * 4 + 3) // 9 * 49 + ry_outer_outer * 7 + blockIdx_x % 7 * 7 + (threadIdx_x_1 * 4 + 3) % 9 - 8], T.float32(0))
-                threadIdx_x_2 = T.env_thread("threadIdx.x")
-                kernel_shared_1 = T.buffer_decl((3072,), data=kernel_shared, scope="shared")
-                kernel_2 = T.buffer_decl((2359296,), data=kernel_1.data)
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 64] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 64) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 128] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 128) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 192] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 36864]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 256] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 256) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 320] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 320) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 384] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 73728]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 448] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 448) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 512] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 512) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 576] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 110592]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 640] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 640) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 704] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 704) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 768] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 147456]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 832] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 832) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 896] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 896) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 960] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 184320]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 1024] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1024) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 1088] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1088) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 1152] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 221184]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 1216] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1216) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 1280] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1280) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 1344] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 258048]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 1408] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1408) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 1472] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1472) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 1536] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 294912]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 1600] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1600) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 1664] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1664) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 1728] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 331776]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 1792] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1792) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 1856] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1856) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 1920] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 368640]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 1984] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1984) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 2048] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2048) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 2112] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 405504]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 2176] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2176) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 2240] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2240) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 2304] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 442368]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 2368] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2368) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 2432] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2432) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 2496] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 479232]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 2560] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2560) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 2624] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2624) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 2688] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 516096]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 2752] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2752) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 2816] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2816) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 2880] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 552960]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 2944] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2944) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 64):
-                    kernel_shared_1[threadIdx_x_2 + 3008] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 3008) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[0] * kernel_shared_1[threadIdx_x * 48]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[9] * kernel_shared_1[threadIdx_x * 48 + 3]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[1] * kernel_shared_1[threadIdx_x * 48]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[10] * kernel_shared_1[threadIdx_x * 48 + 3]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 3]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 3]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 3]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 3]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 3]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[0] * kernel_shared_1[threadIdx_x * 48 + 24]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[9] * kernel_shared_1[threadIdx_x * 48 + 27]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[1] * kernel_shared_1[threadIdx_x * 48 + 24]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[10] * kernel_shared_1[threadIdx_x * 48 + 27]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 24]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 27]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 24]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 27]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 24]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 27]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 24]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 27]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 24]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 27]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[1] * kernel_shared_1[threadIdx_x * 48 + 1]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[10] * kernel_shared_1[threadIdx_x * 48 + 4]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 1]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 4]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 1]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 4]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 1]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 4]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 1]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 4]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 1]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 4]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[7] * kernel_shared_1[threadIdx_x * 48 + 1]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[16] * kernel_shared_1[threadIdx_x * 48 + 4]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[1] * kernel_shared_1[threadIdx_x * 48 + 25]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[10] * kernel_shared_1[threadIdx_x * 48 + 28]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 25]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 28]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 25]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 28]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 25]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 28]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 25]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 28]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 25]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 28]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[7] * kernel_shared_1[threadIdx_x * 48 + 25]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[16] * kernel_shared_1[threadIdx_x * 48 + 28]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 2]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 5]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 2]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 5]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 2]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 5]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 2]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 5]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 2]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 5]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[7] * kernel_shared_1[threadIdx_x * 48 + 2]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[16] * kernel_shared_1[threadIdx_x * 48 + 5]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[8] * kernel_shared_1[threadIdx_x * 48 + 2]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[17] * kernel_shared_1[threadIdx_x * 48 + 5]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 26]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 29]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 26]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 29]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 26]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 29]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 26]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 29]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 26]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 29]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[7] * kernel_shared_1[threadIdx_x * 48 + 26]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[16] * kernel_shared_1[threadIdx_x * 48 + 29]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[8] * kernel_shared_1[threadIdx_x * 48 + 26]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[17] * kernel_shared_1[threadIdx_x * 48 + 29]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[18] * kernel_shared_1[threadIdx_x * 48 + 6]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[27] * kernel_shared_1[threadIdx_x * 48 + 9]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[19] * kernel_shared_1[threadIdx_x * 48 + 6]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[28] * kernel_shared_1[threadIdx_x * 48 + 9]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 6]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 9]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 6]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 9]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 6]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 9]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 6]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 9]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 6]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 9]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[18] * kernel_shared_1[threadIdx_x * 48 + 30]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[27] * kernel_shared_1[threadIdx_x * 48 + 33]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[19] * kernel_shared_1[threadIdx_x * 48 + 30]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[28] * kernel_shared_1[threadIdx_x * 48 + 33]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 30]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 33]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 30]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 33]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 30]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 33]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 30]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 33]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 30]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 33]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[19] * kernel_shared_1[threadIdx_x * 48 + 7]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[28] * kernel_shared_1[threadIdx_x * 48 + 10]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 7]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 10]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 7]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 10]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 7]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 10]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 7]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 10]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 7]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 10]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[25] * kernel_shared_1[threadIdx_x * 48 + 7]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[34] * kernel_shared_1[threadIdx_x * 48 + 10]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[19] * kernel_shared_1[threadIdx_x * 48 + 31]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[28] * kernel_shared_1[threadIdx_x * 48 + 34]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 31]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 34]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 31]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 34]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 31]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 34]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 31]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 34]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 31]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 34]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[25] * kernel_shared_1[threadIdx_x * 48 + 31]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[34] * kernel_shared_1[threadIdx_x * 48 + 34]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 8]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 11]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 8]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 11]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 8]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 11]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 8]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 11]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 8]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 11]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[25] * kernel_shared_1[threadIdx_x * 48 + 8]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[34] * kernel_shared_1[threadIdx_x * 48 + 11]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[26] * kernel_shared_1[threadIdx_x * 48 + 8]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[35] * kernel_shared_1[threadIdx_x * 48 + 11]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 32]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 35]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 32]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 35]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 32]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 35]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 32]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 35]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 32]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 35]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[25] * kernel_shared_1[threadIdx_x * 48 + 32]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[34] * kernel_shared_1[threadIdx_x * 48 + 35]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[26] * kernel_shared_1[threadIdx_x * 48 + 32]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[35] * kernel_shared_1[threadIdx_x * 48 + 35]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[36] * kernel_shared_1[threadIdx_x * 48 + 12]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[45] * kernel_shared_1[threadIdx_x * 48 + 15]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[37] * kernel_shared_1[threadIdx_x * 48 + 12]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[46] * kernel_shared_1[threadIdx_x * 48 + 15]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 12]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 15]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 12]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 15]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 12]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 15]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 12]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 15]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 12]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 15]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[36] * kernel_shared_1[threadIdx_x * 48 + 36]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[45] * kernel_shared_1[threadIdx_x * 48 + 39]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[37] * kernel_shared_1[threadIdx_x * 48 + 36]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[46] * kernel_shared_1[threadIdx_x * 48 + 39]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 36]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 39]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 36]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 39]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 36]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 39]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 36]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 39]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 36]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 39]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[37] * kernel_shared_1[threadIdx_x * 48 + 13]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[46] * kernel_shared_1[threadIdx_x * 48 + 16]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 13]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 16]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 13]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 16]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 13]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 16]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 13]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 16]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 13]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 16]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[43] * kernel_shared_1[threadIdx_x * 48 + 13]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[52] * kernel_shared_1[threadIdx_x * 48 + 16]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[37] * kernel_shared_1[threadIdx_x * 48 + 37]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[46] * kernel_shared_1[threadIdx_x * 48 + 40]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 37]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 40]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 37]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 40]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 37]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 40]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 37]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 40]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 37]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 40]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[43] * kernel_shared_1[threadIdx_x * 48 + 37]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[52] * kernel_shared_1[threadIdx_x * 48 + 40]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 14]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 17]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 14]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 17]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 14]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 17]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 14]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 17]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 14]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 17]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[43] * kernel_shared_1[threadIdx_x * 48 + 14]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[52] * kernel_shared_1[threadIdx_x * 48 + 17]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[44] * kernel_shared_1[threadIdx_x * 48 + 14]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[53] * kernel_shared_1[threadIdx_x * 48 + 17]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 38]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 41]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 38]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 41]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 38]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 41]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 38]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 41]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 38]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 41]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[43] * kernel_shared_1[threadIdx_x * 48 + 38]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[52] * kernel_shared_1[threadIdx_x * 48 + 41]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[44] * kernel_shared_1[threadIdx_x * 48 + 38]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[53] * kernel_shared_1[threadIdx_x * 48 + 41]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[54] * kernel_shared_1[threadIdx_x * 48 + 18]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[63] * kernel_shared_1[threadIdx_x * 48 + 21]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[55] * kernel_shared_1[threadIdx_x * 48 + 18]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[64] * kernel_shared_1[threadIdx_x * 48 + 21]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 18]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 21]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 18]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 21]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 18]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 21]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 18]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 21]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 18]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 21]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[54] * kernel_shared_1[threadIdx_x * 48 + 42]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[63] * kernel_shared_1[threadIdx_x * 48 + 45]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[55] * kernel_shared_1[threadIdx_x * 48 + 42]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[64] * kernel_shared_1[threadIdx_x * 48 + 45]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 42]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 45]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 42]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 45]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 42]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 45]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 42]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 45]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 42]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 45]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[55] * kernel_shared_1[threadIdx_x * 48 + 19]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[64] * kernel_shared_1[threadIdx_x * 48 + 22]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 19]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 22]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 19]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 22]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 19]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 22]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 19]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 22]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 19]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 22]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[61] * kernel_shared_1[threadIdx_x * 48 + 19]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[70] * kernel_shared_1[threadIdx_x * 48 + 22]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[55] * kernel_shared_1[threadIdx_x * 48 + 43]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[64] * kernel_shared_1[threadIdx_x * 48 + 46]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 43]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 46]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 43]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 46]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 43]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 46]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 43]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 46]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 43]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 46]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[61] * kernel_shared_1[threadIdx_x * 48 + 43]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[70] * kernel_shared_1[threadIdx_x * 48 + 46]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 20]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 23]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 20]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 23]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 20]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 23]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 20]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 23]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 20]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 23]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[61] * kernel_shared_1[threadIdx_x * 48 + 20]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[70] * kernel_shared_1[threadIdx_x * 48 + 23]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[62] * kernel_shared_1[threadIdx_x * 48 + 20]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[71] * kernel_shared_1[threadIdx_x * 48 + 23]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 44]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 47]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 44]
-                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 47]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 44]
-                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 47]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 44]
-                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 47]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 44]
-                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 47]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[61] * kernel_shared_1[threadIdx_x * 48 + 44]
-                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[70] * kernel_shared_1[threadIdx_x * 48 + 47]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[62] * kernel_shared_1[threadIdx_x * 48 + 44]
-                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[71] * kernel_shared_1[threadIdx_x * 48 + 47]
-            for i1_inner, i3_inner in T.grid(2, 7):
+                pad_temp_shared_1 = T.buffer_decl((2592,), data=pad_temp_shared, scope="shared")
+                data_2 = T.buffer_decl((25088,), data=data_1.data)
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1] = T.if_then_else(9 <= threadIdx_x_1 % 81 and threadIdx_x_1 % 81 < 72 and 1 <= threadIdx_x_1 % 9 and threadIdx_x_1 % 9 < 8, data_2[cse_var_1 + threadIdx_x_1 // 81 * 49 + threadIdx_x_1 % 81 // 9 * 7 + threadIdx_x_1 % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 112] = T.if_then_else(9 <= (threadIdx_x_1 + 31) % 81 and (threadIdx_x_1 + 31) % 81 < 72 and 1 <= (threadIdx_x_1 + 4) % 9 and (threadIdx_x_1 + 4) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 112) // 81 * 49 + (threadIdx_x_1 + 31) % 81 // 9 * 7 + (threadIdx_x_1 + 4) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 224] = T.if_then_else(9 <= (threadIdx_x_1 + 62) % 81 and (threadIdx_x_1 + 62) % 81 < 72 and 1 <= (threadIdx_x_1 + 8) % 9 and (threadIdx_x_1 + 8) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 224) // 81 * 49 + (threadIdx_x_1 + 62) % 81 // 9 * 7 + (threadIdx_x_1 + 8) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 336] = T.if_then_else(9 <= (threadIdx_x_1 + 12) % 81 and (threadIdx_x_1 + 12) % 81 < 72 and 1 <= (threadIdx_x_1 + 3) % 9 and (threadIdx_x_1 + 3) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 336) // 81 * 49 + (threadIdx_x_1 + 12) % 81 // 9 * 7 + (threadIdx_x_1 + 3) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 448] = T.if_then_else(9 <= (threadIdx_x_1 + 43) % 81 and (threadIdx_x_1 + 43) % 81 < 72 and 1 <= (threadIdx_x_1 + 7) % 9 and (threadIdx_x_1 + 7) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 448) // 81 * 49 + (threadIdx_x_1 + 43) % 81 // 9 * 7 + (threadIdx_x_1 + 7) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 560] = T.if_then_else(9 <= (threadIdx_x_1 + 74) % 81 and (threadIdx_x_1 + 74) % 81 < 72 and 1 <= (threadIdx_x_1 + 2) % 9 and (threadIdx_x_1 + 2) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 560) // 81 * 49 + (threadIdx_x_1 + 74) % 81 // 9 * 7 + (threadIdx_x_1 + 2) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 672] = T.if_then_else(9 <= (threadIdx_x_1 + 24) % 81 and (threadIdx_x_1 + 24) % 81 < 72 and 1 <= (threadIdx_x_1 + 6) % 9 and (threadIdx_x_1 + 6) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 672) // 81 * 49 + (threadIdx_x_1 + 24) % 81 // 9 * 7 + (threadIdx_x_1 + 6) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(9 <= (threadIdx_x_1 + 55) % 81 and (threadIdx_x_1 + 55) % 81 < 72 and 1 <= (threadIdx_x_1 + 1) % 9 and (threadIdx_x_1 + 1) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 784) // 81 * 49 + (threadIdx_x_1 + 55) % 81 // 9 * 7 + (threadIdx_x_1 + 1) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 896] = T.if_then_else(9 <= (threadIdx_x_1 + 5) % 81 and (threadIdx_x_1 + 5) % 81 < 72 and 1 <= (threadIdx_x_1 + 5) % 9 and (threadIdx_x_1 + 5) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 896) // 81 * 49 + (threadIdx_x_1 + 5) % 81 // 9 * 7 + (threadIdx_x_1 + 5) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 1008] = T.if_then_else(1 <= (threadIdx_x_1 // 9 + 4) % 9 and (threadIdx_x_1 + 36) % 81 < 72 and 1 <= threadIdx_x_1 % 9 and threadIdx_x_1 % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 1008) // 81 * 49 + (threadIdx_x_1 // 9 + 4) % 9 * 7 + threadIdx_x_1 % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 1120] = T.if_then_else(9 <= (threadIdx_x_1 + 67) % 81 and (threadIdx_x_1 + 67) % 81 < 72 and 1 <= (threadIdx_x_1 + 4) % 9 and (threadIdx_x_1 + 4) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 1120) // 81 * 49 + (threadIdx_x_1 + 67) % 81 // 9 * 7 + (threadIdx_x_1 + 4) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 1232] = T.if_then_else(9 <= (threadIdx_x_1 + 17) % 81 and (threadIdx_x_1 + 17) % 81 < 72 and 1 <= (threadIdx_x_1 + 8) % 9 and (threadIdx_x_1 + 8) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 1232) // 81 * 49 + (threadIdx_x_1 + 17) % 81 // 9 * 7 + (threadIdx_x_1 + 8) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 1344] = T.if_then_else(9 <= (threadIdx_x_1 + 48) % 81 and (threadIdx_x_1 + 48) % 81 < 72 and 1 <= (threadIdx_x_1 + 3) % 9 and (threadIdx_x_1 + 3) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 1344) // 81 * 49 + (threadIdx_x_1 + 48) % 81 // 9 * 7 + (threadIdx_x_1 + 3) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 1456] = T.if_then_else(9 <= (threadIdx_x_1 + 79) % 81 and (threadIdx_x_1 + 79) % 81 < 72 and 1 <= (threadIdx_x_1 + 7) % 9 and (threadIdx_x_1 + 7) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 1456) // 81 * 49 + (threadIdx_x_1 + 79) % 81 // 9 * 7 + (threadIdx_x_1 + 7) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(9 <= (threadIdx_x_1 + 29) % 81 and (threadIdx_x_1 + 29) % 81 < 72 and 1 <= (threadIdx_x_1 + 2) % 9 and (threadIdx_x_1 + 2) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 1568) // 81 * 49 + (threadIdx_x_1 + 29) % 81 // 9 * 7 + (threadIdx_x_1 + 2) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 1680] = T.if_then_else(9 <= (threadIdx_x_1 + 60) % 81 and (threadIdx_x_1 + 60) % 81 < 72 and 1 <= (threadIdx_x_1 + 6) % 9 and (threadIdx_x_1 + 6) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 1680) // 81 * 49 + (threadIdx_x_1 + 60) % 81 // 9 * 7 + (threadIdx_x_1 + 6) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 1792] = T.if_then_else(9 <= (threadIdx_x_1 + 10) % 81 and (threadIdx_x_1 + 10) % 81 < 72 and 1 <= (threadIdx_x_1 + 1) % 9 and (threadIdx_x_1 + 1) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 1792) // 81 * 49 + (threadIdx_x_1 + 10) % 81 // 9 * 7 + (threadIdx_x_1 + 1) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 1904] = T.if_then_else(9 <= (threadIdx_x_1 + 41) % 81 and (threadIdx_x_1 + 41) % 81 < 72 and 1 <= (threadIdx_x_1 + 5) % 9 and (threadIdx_x_1 + 5) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 1904) // 81 * 49 + (threadIdx_x_1 + 41) % 81 // 9 * 7 + (threadIdx_x_1 + 5) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 2016] = T.if_then_else(1 <= (threadIdx_x_1 // 9 + 8) % 9 and (threadIdx_x_1 + 72) % 81 < 72 and 1 <= threadIdx_x_1 % 9 and threadIdx_x_1 % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 2016) // 81 * 49 + (threadIdx_x_1 // 9 + 8) % 9 * 7 + threadIdx_x_1 % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 2128] = T.if_then_else(9 <= (threadIdx_x_1 + 22) % 81 and (threadIdx_x_1 + 22) % 81 < 72 and 1 <= (threadIdx_x_1 + 4) % 9 and (threadIdx_x_1 + 4) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 2128) // 81 * 49 + (threadIdx_x_1 + 22) % 81 // 9 * 7 + (threadIdx_x_1 + 4) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 2240] = T.if_then_else(9 <= (threadIdx_x_1 + 53) % 81 and (threadIdx_x_1 + 53) % 81 < 72 and 1 <= (threadIdx_x_1 + 8) % 9 and (threadIdx_x_1 + 8) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 2240) // 81 * 49 + (threadIdx_x_1 + 53) % 81 // 9 * 7 + (threadIdx_x_1 + 8) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(9 <= (threadIdx_x_1 + 3) % 81 and (threadIdx_x_1 + 3) % 81 < 72 and 1 <= (threadIdx_x_1 + 3) % 9 and (threadIdx_x_1 + 3) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 2352) // 81 * 49 + (threadIdx_x_1 + 3) % 81 // 9 * 7 + (threadIdx_x_1 + 3) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    pad_temp_shared_1[threadIdx_x_1 + 2464] = T.if_then_else(9 <= (threadIdx_x_1 + 34) % 81 and (threadIdx_x_1 + 34) % 81 < 72 and 1 <= (threadIdx_x_1 + 7) % 9 and (threadIdx_x_1 + 7) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 2464) // 81 * 49 + (threadIdx_x_1 + 34) % 81 // 9 * 7 + (threadIdx_x_1 + 7) % 9 - 8], T.float32(0))
+                with T.launch_thread(threadIdx_x_1, 112):
+                    if T.likely(threadIdx_x_1 < 16):
+                        pad_temp_shared_1[threadIdx_x_1 + 2576] = T.if_then_else(threadIdx_x_1 < 7 and 1 <= (threadIdx_x_1 + 2) % 9 and (threadIdx_x_1 + 2) % 9 < 8, data_2[cse_var_1 + (threadIdx_x_1 + 2576) // 81 * 49 + (threadIdx_x_1 + 65) % 81 // 9 * 7 + (threadIdx_x_1 + 2) - 8], T.float32(0))
+                kernel_shared_1 = T.buffer_decl((9216,), data=kernel_shared, scope="shared")
+                for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(4):
+                    threadIdx_x_2 = T.env_thread("threadIdx.x")
+                    T.launch_thread(threadIdx_x_2, 112)
+                    kernel_2 = T.buffer_decl((2359296,), data=kernel_1.data)
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 32 + threadIdx_x_2 * 8) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 1] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 32 + threadIdx_x_2 * 8) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3 + 1]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 2] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 32 + threadIdx_x_2 * 8) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3 + 2]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 3] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 1) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 4] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 1) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3 + 1]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 5] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 1) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3 + 2]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 6] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 2) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 2) % 3 * 3]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 7] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 2) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 2) % 3 * 3 + 1]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 8] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 2) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 2) % 3 * 3 + 2]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 9] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + ((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8) // 3 + 1) % 32 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 10] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + ((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8) // 3 + 1) % 32 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3 + 1]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 11] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + ((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8) // 3 + 1) % 32 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3 + 2]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 12] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 4) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 13] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 4) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3 + 1]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 14] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 4) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3 + 2]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 15] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 5) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 2) % 3 * 3]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 16] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 5) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 2) % 3 * 3 + 1]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 17] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 5) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 2) % 3 * 3 + 2]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 18] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + ((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8) // 3 + 2) % 32 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 19] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + ((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8) // 3 + 2) % 32 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3 + 1]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 20] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + ((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8) // 3 + 2) % 32 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3 + 2]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 21] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 7) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 22] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 7) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3 + 1]
+                    if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 < 24):
+                        kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 23] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 7) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3 + 2]
+                for rc_outer_inner, rx_outer_inner, ff_outer_inner, rc_inner in T.grid(2, 3, 2, 16):
+                    cse_var_8: T.int32 = ff_outer_inner * 7
+                    cse_var_7: T.int32 = cse_var_8 + 6
+                    cse_var_6: T.int32 = cse_var_8 + 5
+                    cse_var_5: T.int32 = cse_var_8 + 4
+                    cse_var_4: T.int32 = cse_var_8 + 3
+                    cse_var_3: T.int32 = cse_var_8 + 2
+                    cse_var_2: T.int32 = cse_var_8 + 1
+                    conv2d_nchw_1[cse_var_8] = conv2d_nchw_1[cse_var_8] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner]
+                    conv2d_nchw_1[cse_var_2] = conv2d_nchw_1[cse_var_2] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 9] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner]
+                    conv2d_nchw_1[cse_var_3] = conv2d_nchw_1[cse_var_3] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 18] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner]
+                    conv2d_nchw_1[cse_var_4] = conv2d_nchw_1[cse_var_4] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 27] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner]
+                    conv2d_nchw_1[cse_var_5] = conv2d_nchw_1[cse_var_5] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 36] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner]
+                    conv2d_nchw_1[cse_var_6] = conv2d_nchw_1[cse_var_6] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 45] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner]
+                    conv2d_nchw_1[cse_var_7] = conv2d_nchw_1[cse_var_7] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 54] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner]
+                    conv2d_nchw_1[cse_var_8] = conv2d_nchw_1[cse_var_8] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 9] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 3]
+                    conv2d_nchw_1[cse_var_2] = conv2d_nchw_1[cse_var_2] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 18] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 3]
+                    conv2d_nchw_1[cse_var_3] = conv2d_nchw_1[cse_var_3] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 27] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 3]
+                    conv2d_nchw_1[cse_var_4] = conv2d_nchw_1[cse_var_4] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 36] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 3]
+                    conv2d_nchw_1[cse_var_5] = conv2d_nchw_1[cse_var_5] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 45] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 3]
+                    conv2d_nchw_1[cse_var_6] = conv2d_nchw_1[cse_var_6] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 54] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 3]
+                    conv2d_nchw_1[cse_var_7] = conv2d_nchw_1[cse_var_7] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 63] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 3]
+                    conv2d_nchw_1[cse_var_8] = conv2d_nchw_1[cse_var_8] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 18] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 6]
+                    conv2d_nchw_1[cse_var_2] = conv2d_nchw_1[cse_var_2] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 27] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 6]
+                    conv2d_nchw_1[cse_var_3] = conv2d_nchw_1[cse_var_3] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 36] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 6]
+                    conv2d_nchw_1[cse_var_4] = conv2d_nchw_1[cse_var_4] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 45] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 6]
+                    conv2d_nchw_1[cse_var_5] = conv2d_nchw_1[cse_var_5] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 54] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 6]
+                    conv2d_nchw_1[cse_var_6] = conv2d_nchw_1[cse_var_6] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 63] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 6]
+                    conv2d_nchw_1[cse_var_7] = conv2d_nchw_1[cse_var_7] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 72] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 6]
+            for i1_inner, i2_inner in T.grid(2, 7):
                 compute_2 = T.buffer_decl((25088,), data=compute_1.data)
                 bias_2 = T.buffer_decl((512,), data=bias_1.data)
-                compute_2[blockIdx_x // 7 * 6272 + threadIdx_x * 98 + i1_inner * 49 + blockIdx_x % 7 * 7 + i3_inner] = T.max(conv2d_nchw_1[i1_inner * 7 + i3_inner] + bias_2[blockIdx_x // 7 * 128 + threadIdx_x * 2 + i1_inner], T.float32(0))
+                compute_2[blockIdx_x * 1568 + threadIdx_x // 7 * 98 + i1_inner * 49 + i2_inner * 7 + threadIdx_x % 7] = T.max(conv2d_nchw_1[i1_inner * 7 + i2_inner] + bias_2[blockIdx_x * 32 + threadIdx_x // 7 * 2 + i1_inner], T.float32(0))
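
Reading the new lowered TIR: each of the 16 thread blocks stages a padded 32-channel input tile (32 * 9 * 9 = 2592 floats) and the matching 32 * 32 * 3 * 3 kernel slice (9216 floats) into shared memory, then each of the 112 threads accumulates 2 output filters * 7 output rows = 14 values in registers. A quick sanity check of those sizes, using only the constants visible in the generated code (the variable names below are illustrative, not part of the tutorial):

.. code-block:: python

    # Tile sizes implied by the lowered TIR above.
    in_ch, out_ch, H, W, k = 512, 512, 7, 7, 3    # the tutorial's conv2d workload
    rc_tile = in_ch // 16                         # rc_outer_outer runs 16 times -> 32 channels per tile
    padded_hw = (H + 2) * (W + 2)                 # 9 x 9 padded spatial tile
    assert rc_tile * padded_hw == 2592            # pad_temp_shared size
    ff_tile = out_ch // 16                        # 16 thread blocks -> 32 output filters per block
    assert ff_tile * rc_tile * k * k == 9216      # kernel_shared size
    assert 112 * 2 * 7 == ff_tile * H * W         # 112 threads x 14 register outputs per block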
 
 
 
@@ -769,7 +455,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 0.356 ms
+    Execution time of this operator: 0.325 ms
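
This figure comes from TVM's standard timing helper. A minimal sketch of how such a number is obtained (assuming ``sch`` and ``args`` were recovered from the tuning log with ``task.apply_best``, as earlier in the tutorial; the buffer setup here is illustrative):

.. code-block:: python

    import numpy as np
    import tvm

    func = tvm.build(sch, args, target="cuda")
    dev = tvm.cuda(0)
    # Random inputs shaped like the workload's tensors.
    bufs = [
        tvm.nd.array(np.random.uniform(size=[int(d) for d in t.shape]).astype(t.dtype), dev)
        for t in args
    ]
    # min_repeat_ms keeps the GPU busy long enough for a stable median.
    evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
    print("Execution time: %.3f ms" % (np.median(evaluator(*bufs).results) * 1000))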
 
 
 
@@ -819,19 +505,19 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
     conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
     conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
-    conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=64)
+    conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=16)
     conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
-    conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
+    conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=7)
     conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
     conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
     conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
     conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
-    conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
-    conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
+    conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
+    conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
     conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
-    conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
-    conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=4)
-    conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
+    conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=16)
+    conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=2)
+    conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=3)
     conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
     conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
     conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=3)
@@ -840,13 +526,13 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
     compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
     compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
-    compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
+    compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=16)
     compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
-    compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
+    compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=7)
     compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
     compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-    compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
-    compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
+    compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
+    compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
     compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
     s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
     s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -864,16 +550,16 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused = s[compute].fuse(compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i)
     s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread_axis("threadIdx.x"))
     kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
-    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
+    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=24)
     s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=112)
     s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
     pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
-    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
+    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
     s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=112)
     s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
-    s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 512)
+    s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 64)
     s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
 
     CUDA source code:
@@ -891,10 +577,10 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
       #define int64_t long long
       #define uint64_t unsigned long long
     #endif
-    extern "C" __global__ void __launch_bounds__(64) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+    extern "C" __global__ void __launch_bounds__(112) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
       float conv2d_nchw[14];
-      __shared__ float pad_temp_shared[72];
-      __shared__ float kernel_shared[3072];
+      __shared__ float pad_temp_shared[2592];
+      __shared__ float kernel_shared[9216];
       conv2d_nchw[0] = 0.000000e+00f;
       conv2d_nchw[1] = 0.000000e+00f;
       conv2d_nchw[2] = 0.000000e+00f;
@@ -909,411 +595,142 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
       conv2d_nchw[11] = 0.000000e+00f;
       conv2d_nchw[12] = 0.000000e+00f;
       conv2d_nchw[13] = 0.000000e+00f;
-      for (int rc_outer_outer = 0; rc_outer_outer < 64; ++rc_outer_outer) {
-        for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
-          __syncthreads();
-          if (((int)threadIdx.x) < 18) {
-            pad_temp_shared[(((int)threadIdx.x) * 4)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) * 4) % 9))) && (((((int)threadIdx.x) * 4) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) * 4) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) * 4) % 9)) - 8)] : 0.000000e+00f);
+      for (int rc_outer_outer = 0; rc_outer_outer < 16; ++rc_outer_outer) {
+        __syncthreads();
+        pad_temp_shared[((int)threadIdx.x)] = (((((9 <= (((int)threadIdx.x) % 81)) && ((((int)threadIdx.x) % 81) < 72)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + ((((int)threadIdx.x) / 81) * 49)) + (((((int)threadIdx.x) % 81) / 9) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 112)] = (((((9 <= ((((int)threadIdx.x) + 31) % 81)) && (((((int)threadIdx.x) + 31) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 112) / 81) * 49)) + ((((((int)threadIdx.x) + 31) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 224)] = (((((9 <= ((((int)threadIdx.x) + 62) % 81)) && (((((int)threadIdx.x) + 62) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 224) / 81) * 49)) + ((((((int)threadIdx.x) + 62) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 336)] = (((((9 <= ((((int)threadIdx.x) + 12) % 81)) && (((((int)threadIdx.x) + 12) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 3) % 9))) && (((((int)threadIdx.x) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 336) / 81) * 49)) + ((((((int)threadIdx.x) + 12) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 448)] = (((((9 <= ((((int)threadIdx.x) + 43) % 81)) && (((((int)threadIdx.x) + 43) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 448) / 81) * 49)) + ((((((int)threadIdx.x) + 43) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 560)] = (((((9 <= ((((int)threadIdx.x) + 74) % 81)) && (((((int)threadIdx.x) + 74) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 2) % 9))) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 560) / 81) * 49)) + ((((((int)threadIdx.x) + 74) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 672)] = (((((9 <= ((((int)threadIdx.x) + 24) % 81)) && (((((int)threadIdx.x) + 24) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 672) / 81) * 49)) + ((((((int)threadIdx.x) + 24) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 784)] = (((((9 <= ((((int)threadIdx.x) + 55) % 81)) && (((((int)threadIdx.x) + 55) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 1) % 9))) && (((((int)threadIdx.x) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 784) / 81) * 49)) + ((((((int)threadIdx.x) + 55) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 896)] = (((((9 <= ((((int)threadIdx.x) + 5) % 81)) && (((((int)threadIdx.x) + 5) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 896) / 81) * 49)) + ((((((int)threadIdx.x) + 5) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 1008)] = (((((1 <= (((((int)threadIdx.x) / 9) + 4) % 9)) && (((((int)threadIdx.x) + 36) % 81) < 72)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1008) / 81) * 49)) + ((((((int)threadIdx.x) / 9) + 4) % 9) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 1120)] = (((((9 <= ((((int)threadIdx.x) + 67) % 81)) && (((((int)threadIdx.x) + 67) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1120) / 81) * 49)) + ((((((int)threadIdx.x) + 67) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 1232)] = (((((9 <= ((((int)threadIdx.x) + 17) % 81)) && (((((int)threadIdx.x) + 17) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1232) / 81) * 49)) + ((((((int)threadIdx.x) + 17) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 1344)] = (((((9 <= ((((int)threadIdx.x) + 48) % 81)) && (((((int)threadIdx.x) + 48) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 3) % 9))) && (((((int)threadIdx.x) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1344) / 81) * 49)) + ((((((int)threadIdx.x) + 48) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 1456)] = (((((9 <= ((((int)threadIdx.x) + 79) % 81)) && (((((int)threadIdx.x) + 79) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1456) / 81) * 49)) + ((((((int)threadIdx.x) + 79) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 1568)] = (((((9 <= ((((int)threadIdx.x) + 29) % 81)) && (((((int)threadIdx.x) + 29) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 2) % 9))) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1568) / 81) * 49)) + ((((((int)threadIdx.x) + 29) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 1680)] = (((((9 <= ((((int)threadIdx.x) + 60) % 81)) && (((((int)threadIdx.x) + 60) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1680) / 81) * 49)) + ((((((int)threadIdx.x) + 60) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 1792)] = (((((9 <= ((((int)threadIdx.x) + 10) % 81)) && (((((int)threadIdx.x) + 10) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 1) % 9))) && (((((int)threadIdx.x) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1792) / 81) * 49)) + ((((((int)threadIdx.x) + 10) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 1904)] = (((((9 <= ((((int)threadIdx.x) + 41) % 81)) && (((((int)threadIdx.x) + 41) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1904) / 81) * 49)) + ((((((int)threadIdx.x) + 41) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 2016)] = (((((1 <= (((((int)threadIdx.x) / 9) + 8) % 9)) && (((((int)threadIdx.x) + 72) % 81) < 72)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 2016) / 81) * 49)) + ((((((int)threadIdx.x) / 9) + 8) % 9) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 2128)] = (((((9 <= ((((int)threadIdx.x) + 22) % 81)) && (((((int)threadIdx.x) + 22) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 2128) / 81) * 49)) + ((((((int)threadIdx.x) + 22) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 2240)] = (((((9 <= ((((int)threadIdx.x) + 53) % 81)) && (((((int)threadIdx.x) + 53) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 2240) / 81) * 49)) + ((((((int)threadIdx.x) + 53) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 2352)] = (((((9 <= ((((int)threadIdx.x) + 3) % 81)) && (((((int)threadIdx.x) + 3) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 3) % 9))) && (((((int)threadIdx.x) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 2352) / 81) * 49)) + ((((((int)threadIdx.x) + 3) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8)] : 0.000000e+00f);
+        pad_temp_shared[(((int)threadIdx.x) + 2464)] = (((((9 <= ((((int)threadIdx.x) + 34) % 81)) && (((((int)threadIdx.x) + 34) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 2464) / 81) * 49)) + ((((((int)threadIdx.x) + 34) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
+        if (((int)threadIdx.x) < 16) {
+          pad_temp_shared[(((int)threadIdx.x) + 2576)] = ((((((int)threadIdx.x) < 7) && (1 <= ((((int)threadIdx.x) + 2) % 9))) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 2576) / 81) * 49)) + (((((int)threadIdx.x) + 65) / 9) * 7)) + ((int)threadIdx.x)) - 6)] : 0.000000e+00f);
+        }
+        for (int ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = 0; ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer < 4; ++ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer) {
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 32) + (((int)threadIdx.x) * 8)) % 96) / 3) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) % 3) * 3))];
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 1)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 32) + (((int)threadIdx.x) * 8)) % 96) / 3) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) % 3) * 3)) + 1)];
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 2)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 32) + (((int)threadIdx.x) * 8)) % 96) / 3) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) % 3) * 3)) + 2)];
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 3)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 1) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 1) % [...]
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 4)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 1) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 1)  [...]
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 5)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 1) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 1)  [...]
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 6)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 2) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 2) % [...]
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 7)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 2) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 2)  [...]
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 8)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 2) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 2)  [...]
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 9)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) / 3) + 1) & 31) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) % 3) * 3))];
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 10)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) / 3) + 1) & 31) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) % 3) *  [...]
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 11)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) / 3) + 1) & 31) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) % 3) *  [...]
           }
-          if (((int)threadIdx.x) < 18) {
-            pad_temp_shared[((((int)threadIdx.x) * 4) + 1)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 1) % 9))) && ((((((int)threadIdx.x) * 4) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 1) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 1) % 9)) - 8)] : 0.000000e+00f);
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 12)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 4) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 1)  [...]
           }
-          if (((int)threadIdx.x) < 18) {
-            pad_temp_shared[((((int)threadIdx.x) * 4) + 2)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 2) % 9))) && ((((((int)threadIdx.x) * 4) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 2) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 2) % 9)) - 8)] : 0.000000e+00f);
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 13)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 4) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 1) [...]
           }
-          if (((int)threadIdx.x) < 18) {
-            pad_temp_shared[((((int)threadIdx.x) * 4) + 3)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 3) % 9))) && ((((((int)threadIdx.x) * 4) + 3) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 3) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 3) % 9)) - 8)] : 0.000000e+00f);
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 14)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 4) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 1) [...]
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 15)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 5) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 2)  [...]
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 16)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 5) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 2) [...]
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 17)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 5) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 2) [...]
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 18)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) / 3) + 2) & 31) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) % 3) * 3))];
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 19)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) / 3) + 2) & 31) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) % 3) *  [...]
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 20)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) / 3) + 2) & 31) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) % 3) *  [...]
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 21)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 7) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 1)  [...]
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 22)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 7) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 1) [...]
+          }
+          if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+            kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 23)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 7) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 1) [...]
+          }
+        }
+        __syncthreads();
+        for (int rc_outer_inner = 0; rc_outer_inner < 2; ++rc_outer_inner) {
+          for (int rx_outer_inner = 0; rx_outer_inner < 3; ++rx_outer_inner) {
+            for (int ff_outer_inner = 0; ff_outer_inner < 2; ++ff_outer_inner) {
+              for (int rc_inner = 0; rc_inner < 16; ++rc_inner) {
+                conv2d_nchw[(ff_outer_inner * 7)] = (conv2d_nchw[(ff_outer_inner * 7)] + (pad_temp_shared[((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7))] * kernel_shared[((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 1)] = (conv2d_nchw[((ff_outer_inner * 7) + 1)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 9)] * kernel_shared[((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 2)] = (conv2d_nchw[((ff_outer_inner * 7) + 2)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 3)] = (conv2d_nchw[((ff_outer_inner * 7) + 3)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 4)] = (conv2d_nchw[((ff_outer_inner * 7) + 4)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 5)] = (conv2d_nchw[((ff_outer_inner * 7) + 5)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 6)] = (conv2d_nchw[((ff_outer_inner * 7) + 6)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner)]));
+                conv2d_nchw[(ff_outer_inner * 7)] = (conv2d_nchw[(ff_outer_inner * 7)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 9)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 3)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 1)] = (conv2d_nchw[((ff_outer_inner * 7) + 1)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 3)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 2)] = (conv2d_nchw[((ff_outer_inner * 7) + 2)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 3)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 3)] = (conv2d_nchw[((ff_outer_inner * 7) + 3)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 3)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 4)] = (conv2d_nchw[((ff_outer_inner * 7) + 4)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 3)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 5)] = (conv2d_nchw[((ff_outer_inner * 7) + 5)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 3)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 6)] = (conv2d_nchw[((ff_outer_inner * 7) + 6)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 3)]));
+                conv2d_nchw[(ff_outer_inner * 7)] = (conv2d_nchw[(ff_outer_inner * 7)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 6)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 1)] = (conv2d_nchw[((ff_outer_inner * 7) + 1)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 6)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 2)] = (conv2d_nchw[((ff_outer_inner * 7) + 2)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 6)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 3)] = (conv2d_nchw[((ff_outer_inner * 7) + 3)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 6)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 4)] = (conv2d_nchw[((ff_outer_inner * 7) + 4)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 6)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 5)] = (conv2d_nchw[((ff_outer_inner * 7) + 5)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 6)]));
+                conv2d_nchw[((ff_outer_inner * 7) + 6)] = (conv2d_nchw[((ff_outer_inner * 7) + 6)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 72)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 6)]));
+              }
+            }
           }
-          kernel_shared[((int)threadIdx.x)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 64)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 64) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 128)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 128) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 192)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 36864)];
-          kernel_shared[(((int)threadIdx.x) + 256)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 256) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 320)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 320) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 384)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 73728)];
-          kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 512)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 512) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 576)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 110592)];
-          kernel_shared[(((int)threadIdx.x) + 640)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 640) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 704)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 704) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 768)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 147456)];
-          kernel_shared[(((int)threadIdx.x) + 832)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 832) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 960)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 184320)];
-          kernel_shared[(((int)threadIdx.x) + 1024)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1024) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1088)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1088) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1152)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 221184)];
-          kernel_shared[(((int)threadIdx.x) + 1216)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1216) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1280)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1280) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 258048)];
-          kernel_shared[(((int)threadIdx.x) + 1408)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1408) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1472)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1472) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1536)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 294912)];
-          kernel_shared[(((int)threadIdx.x) + 1600)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1600) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1664)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1664) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1728)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 331776)];
-          kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1856)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1856) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 1920)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 368640)];
-          kernel_shared[(((int)threadIdx.x) + 1984)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1984) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2048)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2048) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2112)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 405504)];
-          kernel_shared[(((int)threadIdx.x) + 2176)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2176) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2304)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 442368)];
-          kernel_shared[(((int)threadIdx.x) + 2368)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2368) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2432)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2432) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2496)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 479232)];
-          kernel_shared[(((int)threadIdx.x) + 2560)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2560) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2624)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2624) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 516096)];
-          kernel_shared[(((int)threadIdx.x) + 2752)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2752) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2816)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2816) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 2880)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 552960)];
-          kernel_shared[(((int)threadIdx.x) + 2944)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2944) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-          kernel_shared[(((int)threadIdx.x) + 3008)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3008) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-          __syncthreads();
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[0] * kernel_shared[(((int)threadIdx.x) * 48)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[1] * kernel_shared[(((int)threadIdx.x) * 48)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[2] * kernel_shared[(((int)threadIdx.x) * 48)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[3] * kernel_shared[(((int)threadIdx.x) * 48)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[4] * kernel_shared[(((int)threadIdx.x) * 48)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[5] * kernel_shared[(((int)threadIdx.x) * 48)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[6] * kernel_shared[(((int)threadIdx.x) * 48)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[0] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-          conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-          conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-          conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-          conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-          conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-          conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-          conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-          conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-          conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-          conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-          conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-          conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-          conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-          conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
         }
       }
       for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
-        for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
-          compute[((((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 98)) + (i1_inner * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) * 2)) + i1_inner)]), 0.000000e+00f);
+        for (int i2_inner = 0; i2_inner < 7; ++i2_inner) {
+          compute[(((((((int)blockIdx.x) * 1568) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + (i2_inner * 7)) + (((int)threadIdx.x) % 7))] = max((conv2d_nchw[((i1_inner * 7) + i2_inner)] + bias[(((((int)blockIdx.x) * 32) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
         }
       }
     }
@@ -1376,7 +793,7 @@ In the example below we resume the status and do 5 more trials.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 5 minutes  37.090 seconds)
+   **Total running time of the script:** ( 5 minutes  46.341 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py:
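
The resumed search referenced above ("resume the status and do 5 more trials") works by reloading
previously measured states from the log file before continuing the search. A minimal sketch,
assuming the tutorial's ``task`` and ``log_file`` objects:

.. code-block:: python

    from tvm import auto_scheduler

    # Warm-start the cost model and the search policy from the existing log.
    cost_model = auto_scheduler.XGBModel()
    cost_model.update_from_file(log_file)
    search_policy = auto_scheduler.SketchPolicy(
        task,
        cost_model,
        init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)],
    )

    # Run five more measurement trials, appending the new records to the same log.
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=5,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    task.tune(tune_option, search_policy=search_policy)
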
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
index 930ad86995..ec56826e0b 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
@@ -647,7 +647,7 @@ so we can read the log file and load the best schedules.
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-       7.8484       7.8462       7.8573       7.8418       0.0065   
+       7.8969       7.8932       7.9050       7.8925       0.0057   
                
 
 
@@ -675,7 +675,7 @@ Other Tips
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  5.122 seconds)
+   **Total running time of the script:** ( 1 minutes  5.810 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_cuda.py:
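
The "read the log file and load the best schedules" step in the hunks above (and likewise in the
x86 network tutorial that follows) amounts to compiling the model under ``ApplyHistoryBest``. A
minimal sketch, assuming the tutorial's ``mod``, ``params``, ``target``, and ``log_file``:

.. code-block:: python

    import tvm
    from tvm import auto_scheduler, relay

    # Replay the best schedule found for each task while building the network.
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
        ):
            lib = relay.build(mod, target=target, params=params)
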
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
index 300b0a165c..3b43fde243 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
@@ -666,7 +666,7 @@ so we can read the log file and load the best schedules.
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      747.1292     746.5459     748.3151     746.5267      0.8386   
+      751.3712     751.1382     753.5222     749.4532      1.6693   
                
 
 
@@ -694,7 +694,7 @@ Other Tips
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  37.813 seconds)
+   **Total running time of the script:** ( 1 minutes  38.610 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_x86.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
index 75de9df91d..6e2dc06608 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
@@ -392,26 +392,74 @@ layout transformation, parallelization, vectorization, unrolling, and operator fusion
             placeholder_8 = T.match_buffer(placeholder_3, (33,), "int32")
             placeholder_9 = T.match_buffer(placeholder_4, (128, 512))
             compute_1 = T.match_buffer(compute, (128, 512))
-            for i0_outer_i1_outer_fused in T.parallel(32):
-                compute_2 = T.allocate([2048], "float32", "global")
-                compute_3 = T.buffer_decl((2048,), data=compute_2)
-                for i_outer_inner, nb_j_inner in T.grid(8, 2):
-                    for i_inner_init, j_init in T.grid(8, 16):
-                        compute_3[i_outer_inner * 256 + i_inner_init * 32 + nb_j_inner * 16 + j_init] = T.float32(0)
-                    for elem_idx, i_inner, j in T.grid(T.let(cse_var_1, i0_outer_i1_outer_fused % 16 * 2 + nb_j_inner, placeholder_10[cse_var_1 + 1] - placeholder_10[cse_var_1]), 8, 16):
-                        cse_var_1 = T.var("int32")
+            for i0_outer_i1_outer_fused in T.parallel(128):
+                compute_2 = T.allocate([512], "float32", "global")
+                compute_3 = T.buffer_decl((512,), data=compute_2)
+                for i_outer_inner, nb_j_inner in T.grid(2, 2):
+                    for i_inner_init in range(8):
+                        cse_var_1: T.int32 = i_outer_inner * 256 + i_inner_init * 32 + nb_j_inner * 16
+                        compute_3[cse_var_1] = T.float32(0)
+                        compute_3[cse_var_1 + 1] = T.float32(0)
+                        compute_3[cse_var_1 + 2] = T.float32(0)
+                        compute_3[cse_var_1 + 3] = T.float32(0)
+                        compute_3[cse_var_1 + 4] = T.float32(0)
+                        compute_3[cse_var_1 + 5] = T.float32(0)
+                        compute_3[cse_var_1 + 6] = T.float32(0)
+                        compute_3[cse_var_1 + 7] = T.float32(0)
+                        compute_3[cse_var_1 + 8] = T.float32(0)
+                        compute_3[cse_var_1 + 9] = T.float32(0)
+                        compute_3[cse_var_1 + 10] = T.float32(0)
+                        compute_3[cse_var_1 + 11] = T.float32(0)
+                        compute_3[cse_var_1 + 12] = T.float32(0)
+                        compute_3[cse_var_1 + 13] = T.float32(0)
+                        compute_3[cse_var_1 + 14] = T.float32(0)
+                        compute_3[cse_var_1 + 15] = T.float32(0)
+                    for elem_idx, i_inner in T.grid(T.let(cse_var_2, i0_outer_i1_outer_fused % 16 * 2 + nb_j_inner, placeholder_10[cse_var_2 + 1] - placeholder_10[cse_var_2]), 8):
+                        cse_var_2 = T.var("int32")
                         placeholder_10 = T.buffer_decl((33,), "int32", data=placeholder_8.data)
-                        cse_var_3: T.int32 = i0_outer_i1_outer_fused % 16 * 2 + nb_j_inner
-                        cse_var_2: T.int32 = i_outer_inner * 256 + i_inner * 32 + nb_j_inner * 16 + j
+                        cse_var_21: T.int32 = elem_idx * 16
+                        cse_var_20: T.int32 = i0_outer_i1_outer_fused % 16 * 2 + nb_j_inner
+                        cse_var_19: T.int32 = i_outer_inner * 256 + i_inner * 32 + nb_j_inner * 16
+                        cse_var_18: T.int32 = i0_outer_i1_outer_fused // 16 * 4096 + i_outer_inner * 2048 + i_inner * 256
+                        cse_var_17: T.int32 = cse_var_19 + 9
+                        cse_var_16: T.int32 = cse_var_19 + 8
+                        cse_var_15: T.int32 = cse_var_19 + 7
+                        cse_var_14: T.int32 = cse_var_19 + 6
+                        cse_var_13: T.int32 = cse_var_19 + 5
+                        cse_var_12: T.int32 = cse_var_19 + 4
+                        cse_var_11: T.int32 = cse_var_19 + 3
+                        cse_var_10: T.int32 = cse_var_19 + 2
+                        cse_var_9: T.int32 = cse_var_19 + 15
+                        cse_var_8: T.int32 = cse_var_19 + 14
+                        cse_var_7: T.int32 = cse_var_19 + 13
+                        cse_var_6: T.int32 = cse_var_19 + 12
+                        cse_var_5: T.int32 = cse_var_19 + 11
+                        cse_var_4: T.int32 = cse_var_19 + 10
+                        cse_var_3: T.int32 = cse_var_19 + 1
                         placeholder_11 = T.buffer_decl((78656,), data=placeholder_6.data)
                         placeholder_12 = T.buffer_decl((32768,), data=placeholder_5.data)
                         placeholder_13 = T.buffer_decl((4916,), "int32", data=placeholder_7.data)
-                        compute_3[cse_var_2] = compute_3[cse_var_2] + placeholder_11[placeholder_10[cse_var_3] * 16 + elem_idx * 16 + j] * T.max(placeholder_12[i0_outer_i1_outer_fused // 16 * 16384 + i_outer_inner * 2048 + i_inner * 256 + placeholder_13[placeholder_10[cse_var_3] + elem_idx]], T.float32(0))
-                for i0_inner in range(64):
-                    cse_var_4: T.int32 = i0_outer_i1_outer_fused // 16 * 32768 + i0_inner * 512 + i0_outer_i1_outer_fused % 16 * 32
+                        compute_3[cse_var_19] = compute_3[cse_var_19] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_3[cse_var_3] = compute_3[cse_var_3] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 1] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_3[cse_var_10] = compute_3[cse_var_10] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 2] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_3[cse_var_11] = compute_3[cse_var_11] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 3] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_3[cse_var_12] = compute_3[cse_var_12] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 4] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_3[cse_var_13] = compute_3[cse_var_13] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 5] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_3[cse_var_14] = compute_3[cse_var_14] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 6] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_3[cse_var_15] = compute_3[cse_var_15] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 7] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_3[cse_var_16] = compute_3[cse_var_16] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 8] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_3[cse_var_17] = compute_3[cse_var_17] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 9] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_3[cse_var_4] = compute_3[cse_var_4] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 10] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_3[cse_var_5] = compute_3[cse_var_5] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 11] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_3[cse_var_6] = compute_3[cse_var_6] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 12] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_3[cse_var_7] = compute_3[cse_var_7] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 13] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_3[cse_var_8] = compute_3[cse_var_8] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 14] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_3[cse_var_9] = compute_3[cse_var_9] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 15] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                for i0_inner in range(16):
+                    cse_var_22: T.int32 = i0_outer_i1_outer_fused // 16 * 8192 + i0_inner * 512 + i0_outer_i1_outer_fused % 16 * 32
                     compute_4 = T.buffer_decl((65536,), data=compute_1.data)
                     placeholder_10 = T.buffer_decl((65536,), data=placeholder_9.data)
-                    compute_4[cse_var_4:cse_var_4 + 32] = T.max(compute_3[i0_inner * 32:i0_inner * 32 + 32] + placeholder_10[cse_var_4:cse_var_4 + 32], T.Broadcast(T.float32(0), 32))
+                    compute_4[cse_var_22:cse_var_22 + 32] = T.max(compute_3[i0_inner * 32:i0_inner * 32 + 32] + placeholder_10[cse_var_22:cse_var_22 + 32], T.Broadcast(T.float32(0), 32))
 
 
 
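The lowered TIR shown in this hunk is what you get by applying the best record from the log and
lowering the resulting schedule. A minimal sketch of how such a dump is produced, assuming the
tutorial's ``task`` and ``log_file``:

.. code-block:: python

    import tvm

    # Apply the best schedule found so far and print the lowered TIR.
    sch, args = task.apply_best(log_file)
    print(tvm.lower(sch, args, simple_mode=True))
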
@@ -461,7 +509,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 1.552 ms
+    Execution time of this operator: 1.894 ms
 
 
 
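The "Execution time of this operator" figure above comes from building the tuned schedule and
timing it with TVM's time evaluator. A minimal sketch, assuming the tutorial's ``sch``, ``args``,
and ``target``; the argument tuple ``tvm_args`` below is a stand-in for the tutorial's actual
input and output buffers:

.. code-block:: python

    import numpy as np
    import tvm

    # Build the tuned kernel and time it on the CPU.
    func = tvm.build(sch, args, target)
    dev = tvm.cpu()
    evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
    costs = evaluator(*tvm_args).results  # tvm_args: the operator's tvm.nd.array buffers
    print("Execution time of this operator: %.3f ms" % (np.median(costs) * 1000))
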
diff --git a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
index a1c616955d..51a55380d3 100644
--- a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
 
 Computation times
 =================
-**00:40.547** total execution time for **how_to_tune_with_autotvm** files:
+**00:24.192** total execution time for **how_to_tune_with_autotvm** files:
 
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)           | 00:40.515 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)           | 00:24.157 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)               | 00:00.019 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)               | 00:00.021 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``)             | 00:00.005 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
index a846bb8320..60fd7976b5 100644
--- a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
@@ -268,7 +268,9 @@ for this template
     waiting for device...
     device available
     Get devices for measurement successfully!
-    No: 1   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+    No: 1   GFLOPS: 50.48/50.48     result: MeasureResult(costs=(0.004586107448275862,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.907130241394043, timestamp=1674175038.7889755)        [('tile_f', [-1, 2, 1, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9371037
+    No: 2   GFLOPS: 93.65/93.65     result: MeasureResult(costs=(0.002471886048780488,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.077481985092163, timestamp=1674175039.5116773)        [('tile_f', [-1, 1, 16, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 64, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5284705
+    No: 3   GFLOPS: 0.00/93.65      result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -390,8 +392,9 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 32, 4, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 256]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2119117
-    No: 2   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 8, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 64, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1666969
+    No: 4   GFLOPS: 213.99/213.99   result: MeasureResult(costs=(0.0010818329677419354,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.1306028366088867, timestamp=1674175042.3393323)      [('tile_f', [-1, 4, 8, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2782589
+    No: 5   GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -513,10 +516,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 128, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 32, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9380629
-    No: 3   GFLOPS: 85.37/85.37     result: MeasureResult(costs=(0.00271172485,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.7610549926757812, timestamp=1674164482.4717371)      [('tile_f', [-1, 16, 8, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8200091
-    No: 4   GFLOPS: 20.14/85.37     result: MeasureResult(costs=(0.011493578555555556,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.423952102661133, timestamp=1674164484.2182357)        [('tile_f', [-1, 2, 1, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9754717
-    No: 5   GFLOPS: 0.00/85.37      result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 64, 2, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 64]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7526534
+    No: 6   GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -638,8 +639,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 2, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 128]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3857892
-    No: 6   GFLOPS: 0.00/85.37      result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 2, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9404753
+    No: 7   GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -761,10 +762,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 8, 8]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 32, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2654456
-    No: 7   GFLOPS: 1.25/85.37      result: MeasureResult(costs=(0.18517686449999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=6.310837507247925, timestamp=1674164491.677897)  [('tile_f', [-1, 16, 2, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3488114
-    No: 8   GFLOPS: 10.05/85.37     result: MeasureResult(costs=(0.02303889716666667,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.039496898651123, timestamp=1674164492.631526)  [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7507959
-    No: 9   GFLOPS: 0.00/85.37      result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 2, 64]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1011105
+    No: 8   GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -886,10 +885,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 16, 4, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,8822813
-    No: 10  GFLOPS: 89.86/89.86     result: MeasureResult(costs=(0.0025763257049180328,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.712925434112549, timestamp=1674164495.5032163)       [('tile_f', [-1, 2, 4, 2]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8366893
-    No: 11  GFLOPS: 28.31/89.86     result: MeasureResult(costs=(0.008176561153846153,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.5758357048034668, timestamp=1674164496.28428) [('tile_f', [-1, 8, 2, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,785731
-    No: 12  GFLOPS: 0.00/89.86      result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 64, 1, 2]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 256, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5484881
+    No: 9   GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1011,8 +1008,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 2, 256]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1103518
-    No: 13  GFLOPS: 0.00/89.86      result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 32, 2]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8375713
+    No: 10  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1134,9 +1131,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 16, 4, 8]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5936633
-    No: 14  GFLOPS: 7.37/89.86      result: MeasureResult(costs=(0.03141268725,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4329447746276855, timestamp=1674164497.9928796)      [('tile_f', [-1, 2, 1, 2]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,687996
-    No: 15  GFLOPS: 0.00/89.86      result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 16, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 64, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9187286
+    No: 11  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1258,9 +1254,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 1, 64]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 8, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8650602
-    No: 16  GFLOPS: 25.55/89.86     result: MeasureResult(costs=(0.0090591335,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4694478511810303, timestamp=1674164498.7831192)       [('tile_f', [-1, 4, 2, 8]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1360185
-    No: 17  GFLOPS: 0.00/89.86      result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 32, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 8, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2262743
+    No: 12  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1382,10 +1377,991 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 32, 4, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2104640
-    No: 18  GFLOPS: 154.49/154.49   result: MeasureResult(costs=(0.00149846352238806,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.5423030853271484, timestamp=1674164500.5038612)        [('tile_f', [-1, 2, 2, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3520989
-    No: 19  GFLOPS: 193.05/193.05   result: MeasureResult(costs=(0.0011991573452380954,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.339512586593628, timestamp=1674164501.2104666)       [('tile_f', [-1, 2, 8, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,426922
-    No: 20  GFLOPS: 274.72/274.72   result: MeasureResult(costs=(0.0008426869495798319,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3525335788726807, timestamp=1674164501.93028)        [('tile_f', [-1, 8, 4, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,1973035
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 16, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1107565
+    No: 13  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+        func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+        func = build(s, args, target_host=task.target_host, runtime=runtime)
+      File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+        input_mod = lower(inputs, args, name=name, binds=binds)
+      File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+        return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+      File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+    tvm._ffi.base.TVMError: Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 8, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 512, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,420837
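(Note on the entries above: a trial is reported with GFLOPS 0.00 when autotvm's build-time verification rejects the sampled schedule before it is ever measured, typically because the generated kernel exceeds a GPU hardware limit such as threads per block. A minimal sketch of that skip path follows; it assumes only the InstantiationError class named in the tracebacks, while the limit value and the check_config helper are hypothetical illustrations, not TVM's actual verify_pass code.)

    # Hypothetical sketch of why a tuning trial logs "GFLOPS: 0.00" together with
    # "InstantiationError: Skipped because of invalid gpu kernel".
    from tvm.autotvm.task.space import InstantiationError

    MAX_THREADS_PER_BLOCK = 1024  # assumed device limit, for illustration only

    def check_config(num_threads):
        # Stand-in for the hardware-limit verification performed at build time.
        if num_threads > MAX_THREADS_PER_BLOCK:
            raise InstantiationError("Skipped because of invalid gpu kernel")

    try:
        check_config(num_threads=2048)  # an oversized candidate schedule
    except InstantiationError:
        pass  # the tuner records 0.00 GFLOPS and moves on to the next config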
+    No: 14  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+        func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+        func = build(s, args, target_host=task.target_host, runtime=runtime)
+      File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+        input_mod = lower(inputs, args, name=name, binds=binds)
+      File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+        return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+      File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+    tvm._ffi.base.TVMError: Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 32, 8, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,5185872
+    No: 15  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+        func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+        func = build(s, args, target_host=task.target_host, runtime=runtime)
+      File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+        input_mod = lower(inputs, args, name=name, binds=binds)
+      File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+        return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+      File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+    tvm._ffi.base.TVMError: Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3232479
+    No: 16  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+        func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+        func = build(s, args, target_host=task.target_host, runtime=runtime)
+      File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+        input_mod = lower(inputs, args, name=name, binds=binds)
+      File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+        return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+      File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+    tvm._ffi.base.TVMError: Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 2, 32]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 64, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2606310
+    No: 17  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+        func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+        func = build(s, args, target_host=task.target_host, runtime=runtime)
+      File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+        input_mod = lower(inputs, args, name=name, binds=binds)
+      File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+        return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+      File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+    tvm._ffi.base.TVMError: Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+    Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 16, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6485563
+    No: 18  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+        func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+        func = build(s, args, target_host=task.target_host, runtime=runtime)
+      File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+        input_mod = lower(inputs, args, name=name, binds=binds)
+      File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+        return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+      File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+    tvm._ffi.base.TVMError: Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+    Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 128, 1, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5571827
+    No: 19  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+        func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+        func = build(s, args, target_host=task.target_host, runtime=runtime)
+      File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+        input_mod = lower(inputs, args, name=name, binds=binds)
+      File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+        return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+      File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+    tvm._ffi.base.TVMError: Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+    Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 4, 8]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9441890
+    No: 20  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+        func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+        func = build(s, args, target_host=task.target_host, runtime=runtime)
+      File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+        input_mod = lower(inputs, args, name=name, binds=binds)
+      File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+        return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+      File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+    tvm._ffi.base.TVMError: Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+    Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 1, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4341582
 
 
 
@@ -1440,9 +2416,9 @@ and measure running time.
     Finish loading 20 records
 
     Best config:
-    [('tile_f', [-1, 8, 4, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,1973035
+    [('tile_f', [-1, 4, 8, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2782589
     Finish loading 20 records
-    Time cost of this operator: 0.001221
+    Time cost of this operator: 0.001132
 
 
 
diff --git a/docs/_sources/how_to/work_with_microtvm/index.rst.txt b/docs/_sources/how_to/work_with_microtvm/index.rst.txt
index 261ed68864..d02b385ef8 100644
--- a/docs/_sources/how_to/work_with_microtvm/index.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/index.rst.txt
@@ -66,6 +66,23 @@ demonstrate how to tune and deploy models with microTVM.
     </div>
 
 
+.. raw:: html
+
+    <div class="sphx-glr-thumbcontainer" tooltip="This tutorial shows how to build an MLPerfTiny submission using microTVM. It walks through imp...">
+
+.. only:: html
+
+  .. image:: /how_to/work_with_microtvm/images/thumb/sphx_glr_micro_mlperftiny_thumb.png
+    :alt: Creating Your MLPerfTiny Submission with microTVM
+
+  :ref:`sphx_glr_how_to_work_with_microtvm_micro_mlperftiny.py`
+
+.. raw:: html
+
+      <div class="sphx-glr-thumbnail-title">Creating Your MLPerfTiny Submission with microTVM</div>
+    </div>
+
+
 .. raw:: html
 
     <div class="sphx-glr-thumbcontainer" tooltip="This tutorial is showcasing microTVM host-driven AoT compilation with a PyTorch model. This tut...">
@@ -162,6 +179,7 @@ demonstrate how to tune and deploy models with microTVM.
    /how_to/work_with_microtvm/micro_aot
    /how_to/work_with_microtvm/micro_autotune
    /how_to/work_with_microtvm/micro_ethosu
+   /how_to/work_with_microtvm/micro_mlperftiny
    /how_to/work_with_microtvm/micro_pytorch
    /how_to/work_with_microtvm/micro_reference_vm
    /how_to/work_with_microtvm/micro_tflite
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
index e7dae45644..bff02587b5 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
@@ -363,10 +363,10 @@ Timing the untuned program
     ########## Build without Autotuning ##########
     Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  Measurements(us)  
     ---------                                     ---                                           --------  -------  -----              ------  -------  ----------------  
-    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  311.6     98.727   (1, 2, 10, 10, 3)  2       1        [311.6]           
-    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.045     0.965    (1, 6, 10, 10)     1       1        [3.045]           
-    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.973     0.308    (1, 1, 10, 10, 3)  1       1        [0.973]           
-    Total_time                                    -                                             315.618   -        -                  -       -        -                 
+    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  310.8     98.686   (1, 2, 10, 10, 3)  2       1        [310.8]           
+    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.186     1.012    (1, 6, 10, 10)     1       1        [3.186]           
+    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.953     0.303    (1, 1, 10, 10, 3)  1       1        [0.953]           
+    Total_time                                    -                                             314.939   -        -                  -       -        -                 
 
 
 
@@ -431,10 +431,10 @@ Timing the tuned program
     ########## Build with Autotuning ##########
     Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  Measurements(us)  
     ---------                                     ---                                           --------  -------  -----              ------  -------  ----------------  
-    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  134.8     97.951   (1, 6, 10, 10, 1)  2       1        [134.8]           
-    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.852     1.346    (1, 6, 10, 10)     1       1        [1.852]           
-    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.968     0.703    (1, 1, 10, 10, 3)  1       1        [0.968]           
-    Total_time                                    -                                             137.62    -        -                  -       -        -                 
+    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  102.8     97.511   (1, 6, 10, 10, 1)  2       1        [102.8]           
+    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.773     1.682    (1, 6, 10, 10)     1       1        [1.773]           
+    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.851     0.808    (1, 3, 10, 10, 1)  1       1        [0.851]           
+    Total_time                                    -                                             105.424   -        -                  -       -        -                 
 
 
 
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_mlperftiny.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_mlperftiny.rst.txt
new file mode 100644
index 0000000000..a94af8f3bf
--- /dev/null
+++ b/docs/_sources/how_to/work_with_microtvm/micro_mlperftiny.rst.txt
@@ -0,0 +1,389 @@
+
+.. DO NOT EDIT. THIS FILE WAS AUTOMATICALLY GENERATED BY
+.. TVM'S MONKEY-PATCHED VERSION OF SPHINX-GALLERY. TO MAKE
+.. CHANGES, EDIT THE SOURCE PYTHON FILE:
+.. "how_to/work_with_microtvm/micro_mlperftiny.py"
+
+.. only:: html
+
+    .. note::
+        :class: sphx-glr-download-link-note
+
+        This tutorial can be used interactively with Google Colab! You can also click
+        :ref:`here <sphx_glr_download_how_to_work_with_microtvm_micro_mlperftiny.py>` to run the Jupyter notebook locally.
+
+        .. image:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/utilities/colab_button.svg
+            :align: center
+            :target: https://colab.research.google.com/github/apache/tvm-site/blob/asf-site/docs/_downloads/1fc1683d67bee4f26703504a58d42578/micro_mlperftiny.ipynb
+            :width: 300px
+
+.. rst-class:: sphx-glr-example-title
+
+.. _sphx_glr_how_to_work_with_microtvm_micro_mlperftiny.py:
+
+
+.. _tutorial-micro-MLPerfTiny:
+
+Creating Your MLPerfTiny Submission with microTVM
+=================================================
+**Authors**:
+`Mehrdad Hessar <https://github.com/mehrdadh>`_
+
+This tutorial shows how to build an MLPerfTiny submission using microTVM. It walks
+through importing a TFLite model from the MLPerfTiny benchmark models, compiling it
+with TVM, and generating a Zephyr project that can be flashed to a Zephyr-supported
+board to benchmark the model with the EEMBC runner.
+
+.. GENERATED FROM PYTHON SOURCE LINES 32-34
+
+.. include:: ../../../../gallery/how_to/work_with_microtvm/install_dependencies.rst
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 35-42
+
+.. code-block:: default
+
+
+    import os
+    import pathlib
+    import tarfile
+    import tempfile
+    import shutil
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 43-45
+
+.. include:: ../../../../gallery/how_to/work_with_microtvm/install_zephyr.rst
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 49-52
+
+**Note:** Install CMSIS-NN only if you are interested in generating this submission
+with the CMSIS-NN code generator.
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 55-57
+
+.. include:: ../../../../gallery/how_to/work_with_microtvm/install_cmsis.rst
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 60-63
+
+Import Python dependencies
+-------------------------------
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 63-77
+
+.. code-block:: default
+
+    import tensorflow as tf
+    import numpy as np
+
+    import tvm
+    from tvm import relay
+    from tvm.relay.backend import Executor, Runtime
+    from tvm.contrib.download import download_testdata
+    from tvm.micro import export_model_library_format
+    from tvm.micro.model_library_format import generate_c_interface_header
+    from tvm.micro.testing.utils import (
+        create_header_file,
+        mlf_extract_workspace_size_bytes,
+    )
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 78-100
+
+Import Visual Wake Word Model
+--------------------------------------------------------------------
+
+To begin with, download and import the Visual Wake Word (VWW) TFLite model from MLPerfTiny.
+This model originally comes from the `MLPerf Tiny repository <https://github.com/mlcommons/tiny>`_.
+We also capture metadata from the TFLite model, such as the input/output names and
+quantization parameters, which will be used in the following steps.
+
+Each benchmark model is identified by an index when building the submission. To build
+another model, update the model URL, the short name, and the index number (a lookup-table
+sketch follows the bash snippet below). The indices are defined as follows:
+
+  * Keyword Spotting (KWS): 1
+  * Visual Wake Word (VWW): 2
+  * Anomaly Detection (AD): 3
+  * Image Classification (IC): 4
+
+If you would like to build the submission with CMSIS-NN, set the ``TVM_USE_CMSIS`` environment variable.
+
+  .. code-block:: bash
+
+    export TVM_USE_CMSIS=1
+
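+As a convenience, the per-model settings can be grouped into a single lookup table so
+that switching benchmarks only requires changing one key. The following is an
+illustrative sketch, not part of the generated tutorial; only the VWW URL comes from
+this tutorial, and the remaining URLs are placeholders to be filled in from the
+MLPerf Tiny repository.
+
+.. code-block:: default
+
+    # Hypothetical helper table mapping each MLPerfTiny benchmark to its
+    # index and model URL. Replace the placeholder URLs with real ones from
+    # https://github.com/mlcommons/tiny before using them.
+    MODEL_REGISTRY = {
+        "KWS": (1, "<keyword-spotting-model-url>"),
+        "VWW": (2, "https://github.com/mlcommons/tiny/raw/bceb91c5ad2e2deb295547d81505721d3a87d578/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite"),
+        "AD": (3, "<anomaly-detection-model-url>"),
+        "IC": (4, "<image-classification-model-url>"),
+    }
+
+    MODEL_SHORT_NAME = "VWW"
+    MODEL_INDEX, MODEL_URL = MODEL_REGISTRY[MODEL_SHORT_NAME]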
+
+.. GENERATED FROM PYTHON SOURCE LINES 100-144
+
+.. code-block:: default
+
+
+    MODEL_URL = "https://github.com/mlcommons/tiny/raw/bceb91c5ad2e2deb295547d81505721d3a87d578/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite"
+    MODEL_PATH = download_testdata(MODEL_URL, "vww_96_int8.tflite", module="model")
+
+    MODEL_SHORT_NAME = "VWW"
+    MODEL_INDEX = 2
+
+    USE_CMSIS = os.environ.get("TVM_USE_CMSIS", False)
+
+    with open(MODEL_PATH, "rb") as model_file:
+        tflite_model_buf = model_file.read()
+    try:
+        import tflite
+
+        tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
+    except AttributeError:
+        import tflite.Model
+
+        tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
+
+    interpreter = tf.lite.Interpreter(model_path=str(MODEL_PATH))
+    interpreter.allocate_tensors()
+    input_details = interpreter.get_input_details()
+    output_details = interpreter.get_output_details()
+
+    input_name = input_details[0]["name"]
+    input_shape = tuple(input_details[0]["shape"])
+    input_dtype = np.dtype(input_details[0]["dtype"]).name
+    output_name = output_details[0]["name"]
+    output_shape = tuple(output_details[0]["shape"])
+    output_dtype = np.dtype(output_details[0]["dtype"]).name
+
+    # We extract quantization information from the TFLite model. This is
+    # required for all models except Anomaly Detection, because for the other
+    # models we send already-quantized data to the interpreter from the host,
+    # whereas for the AD model we send floating-point data and quantization
+    # happens on the microcontroller.
+    if MODEL_SHORT_NAME != "AD":
+        quant_output_scale = output_details[0]["quantization_parameters"]["scales"][0]
+        quant_output_zero_point = output_details[0]["quantization_parameters"]["zero_points"][0]
+
+    relay_mod, params = relay.frontend.from_tflite(
+        tflite_model, shape_dict={input_name: input_shape}, dtype_dict={input_name: input_dtype}
+    )
+
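+It can help to confirm the captured metadata against the model card before moving on.
+A minimal, illustrative check using only the variables defined above:
+
+.. code-block:: default
+
+    # Sanity check (illustrative): print the captured I/O metadata.
+    print(f"input:  {input_name} shape={input_shape} dtype={input_dtype}")
+    print(f"output: {output_name} shape={output_shape} dtype={output_dtype}")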
+
+.. GENERATED FROM PYTHON SOURCE LINES 145-153
+
+Defining Target, Runtime and Executor
+--------------------------------------------------------------------
+
+Now we define the target, runtime, and executor used to compile this model. In this
+tutorial we use Ahead-of-Time (AoT) compilation and build a standalone project. This
+differs from using AoT in host-driven mode, where the target communicates with the
+host through the host-driven AoT executor to run inference.
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 153-172
+
+.. code-block:: default
+
+
+    # Use the C runtime (crt)
+    RUNTIME = Runtime("crt")
+
+    # Use the AoT executor with `unpacked-api=True` and `interface-api=c`. `interface-api=c` forces
+    # the compiler to generate C type function APIs and `unpacked-api=True` forces the compiler
+    # to generate minimal unpacked format inputs which reduces the stack memory usage on calling
+    # inference layers of the model.
+    EXECUTOR = Executor(
+        "aot",
+        {"unpacked-api": True, "interface-api": "c", "workspace-byte-alignment": 8},
+    )
+
+    # Select a Zephyr board
+    BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_l4r5zi")
+
+    # Get the full target description using the BOARD
+    TARGET = tvm.micro.testing.get_target("zephyr", BOARD)
+
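+The resulting target encodes the MCU-specific compilation options. An illustrative
+peek at what was selected, using the variables defined above:
+
+.. code-block:: default
+
+    # Illustrative: show the Zephyr board and the resolved TVM target.
+    print(f"Board: {BOARD}")
+    print(f"Target: {TARGET}")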
+
+.. GENERATED FROM PYTHON SOURCE LINES 173-181
+
+Compile the model and export model library format
+--------------------------------------------------------------------
+
+Now we compile the model for the target and export the compiled model in Model
+Library Format. We also calculate the workspace size that the compiled model requires.
+
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 181-199
+
+.. code-block:: default
+
+
+    config = {"tir.disable_vectorize": True}
+    if USE_CMSIS:
+        from tvm.relay.op.contrib import cmsisnn
+
+        config["relay.ext.cmsisnn.options"] = {"mcpu": TARGET.mcpu}
+        relay_mod = cmsisnn.partition_for_cmsisnn(relay_mod, params, mcpu=TARGET.mcpu)
+
+    with tvm.transform.PassContext(opt_level=3, config=config):
+        module = tvm.relay.build(
+            relay_mod, target=TARGET, params=params, runtime=RUNTIME, executor=EXECUTOR
+        )
+
+    temp_dir = tvm.contrib.utils.tempdir()
+    model_tar_path = temp_dir / "model.tar"
+    export_model_library_format(module, model_tar_path)
+    workspace_size = mlf_extract_workspace_size_bytes(model_tar_path)
+
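+The extracted workspace size feeds into the ``-DWORKSPACE_SIZE`` compile definition
+later in this tutorial (with a 512-byte offset added there). A one-line, illustrative
+check using the variable defined above:
+
+.. code-block:: default
+
+    # Illustrative check: this value is passed to the Zephyr project options below.
+    print(f"Required workspace size: {workspace_size} bytes")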
+
+.. GENERATED FROM PYTHON SOURCE LINES 200-209
+
+Generate input/output header files
+--------------------------------------------------------------------
+
+To create a microTVM standalone project with AoT, we need to generate
+input and output header files. These header files connect the input and
+output API of the generated code to the rest of the standalone project.
+For this specific submission, we only need to generate the output header
+file, since the input API call is handled differently.
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 209-232
+
+.. code-block:: default
+
+
+    extra_tar_dir = tvm.contrib.utils.tempdir()
+    extra_tar_file = extra_tar_dir / "extra.tar"
+
+    # Use a distinct name for the archive handle to avoid shadowing the
+    # ``tf`` TensorFlow import above.
+    with tarfile.open(extra_tar_file, "w:gz") as tar_file:
+        with tempfile.TemporaryDirectory() as tar_temp_dir:
+            model_files_path = os.path.join(tar_temp_dir, "include")
+            os.mkdir(model_files_path)
+            header_path = generate_c_interface_header(
+                module.libmod_name, [input_name], [output_name], [], {}, [], 0, model_files_path, {}, {}
+            )
+            tar_file.add(header_path, arcname=os.path.relpath(header_path, tar_temp_dir))
+
+        create_header_file(
+            "output_data",
+            np.zeros(
+                shape=output_shape,
+                dtype=output_dtype,
+            ),
+            "include",
+            tar_file,
+        )
+
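+To verify what was packed, the archive contents can be listed. An optional,
+illustrative check:
+
+.. code-block:: default
+
+    # tarfile auto-detects the gzip compression when reading.
+    with tarfile.open(extra_tar_file) as archive:
+        print(archive.getnames())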
+
+.. GENERATED FROM PYTHON SOURCE LINES 233-243
+
+Create the project, build and prepare the project tar file
+--------------------------------------------------------------------
+
+Now that we have the compiled model in Model Library Format, we can
+generate the full project using the Zephyr template project. First we
+prepare the project options, then we build the project. Finally, we clean
+up the temporary files and move the submission project to the current
+working directory, from which it can be downloaded and used on your
+development kit.
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 243-292
+
+.. code-block:: default
+
+
+    input_total_size = 1
+    for dim in input_shape:
+        input_total_size *= dim
+
+    template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr"))
+    project_options = {
+        "extra_files_tar": str(extra_tar_file),
+        "project_type": "mlperftiny",
+        "board": BOARD,
+        "compile_definitions": [
+            f"-DWORKSPACE_SIZE={workspace_size + 512}",  # Memory workspace size, 512 is a temporary offset
+            # since the memory calculation is not accurate.
+            f"-DTARGET_MODEL={MODEL_INDEX}",  # Sets the model index for project compilation.
+            f"-DTH_MODEL_VERSION=EE_MODEL_VERSION_{MODEL_SHORT_NAME}01",  # Sets model version. This is required by MLPerfTiny API.
+            f"-DMAX_DB_INPUT_SIZE={input_total_size}",  # Max size of the input data array.
+        ],
+    }
+
+    if MODEL_SHORT_NAME != "AD":
+        project_options["compile_definitions"].append(f"-DOUT_QUANT_SCALE={quant_output_scale}")
+        project_options["compile_definitions"].append(f"-DOUT_QUANT_ZERO={quant_output_zero_point}")
+
+    if USE_CMSIS:
+        project_options["compile_definitions"].append("-DCOMPILE_WITH_CMSISNN=1")
+
+    # Note: You might need to adjust this based on the board that you are using.
+    project_options["config_main_stack_size"] = 4000
+
+    if USE_CMSIS:
+        project_options["cmsis_path"] = os.environ.get("CMSIS_PATH", "/content/cmsis")
+
+    generated_project_dir = temp_dir / "project"
+
+    project = tvm.micro.project.generate_project_from_mlf(
+        template_project_path, generated_project_dir, model_tar_path, project_options
+    )
+    project.build()
+
+    # Cleanup the build directory and extra artifacts
+    shutil.rmtree(generated_project_dir / "build")
+    (generated_project_dir / "model.tar").unlink()
+
+    project_tar_path = pathlib.Path(os.getcwd()) / "project.tar"
+    with tarfile.open(project_tar_path, "w:tar") as tar:
+        tar.add(generated_project_dir, arcname=os.path.basename("project"))
+
+    print(f"The generated project is located here: {project_tar_path}")
+
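+Before copying the archive to another machine, it is worth confirming that it was
+written as expected. A minimal, illustrative check:
+
+.. code-block:: default
+
+    # The submission archive should exist and be non-empty.
+    assert project_tar_path.exists()
+    print(f"Archive size: {project_tar_path.stat().st_size} bytes")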
+
+.. GENERATED FROM PYTHON SOURCE LINES 293-313
+
+Use this project with your board
+--------------------------------------------------------------------
+
+Now that we have the generated project, you can use it locally to flash
+your board and prepare it for the EEMBC runner software. To do this,
+follow these steps:
+
+  .. code-block:: bash
+
+    tar -xf project.tar
+    cd project
+    mkdir build
+    cd build
+    cmake ..
+    make -j2
+    west flash
+
+Now you can connect your board to the EEMBC runner using these
+`instructions <https://github.com/eembc/energyrunner>`_
+and benchmark this model on your board.
+
+
+
+.. _sphx_glr_download_how_to_work_with_microtvm_micro_mlperftiny.py:
+
+.. only:: html
+
+  .. container:: sphx-glr-footer sphx-glr-footer-example
+
+
+    .. container:: sphx-glr-download sphx-glr-download-python
+
+      :download:`Download Python source code: micro_mlperftiny.py <micro_mlperftiny.py>`
+
+    .. container:: sphx-glr-download sphx-glr-download-jupyter
+
+      :download:`Download Jupyter notebook: micro_mlperftiny.ipynb <micro_mlperftiny.ipynb>`
+
+
+.. only:: html
+
+ .. rst-class:: sphx-glr-signature
+
+    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
index 4fbcc5f989..10d497e3b1 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
@@ -117,7 +117,7 @@ download a cat image and preprocess it to use as the model input.
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/ao/quantization/utils.py:281: UserWarning: must run observer before calling calculate_qparams. Returning default values.
       "must run observer before calling calculate_qparams. " +
     Downloading: "https://download.pytorch.org/models/quantized/mobilenet_v2_qnnpack_37f702c5.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2_qnnpack_37f702c5.pth
-
      0%|          | 0.00/3.42M [00:00<?, ?B/s]
    100%|##########| 3.42M/3.42M [00:00<00:00, 49.5MB/s]
+
      0%|          | 0.00/3.42M [00:00<?, ?B/s]
     61%|######    | 2.09M/3.42M [00:00<00:00, 19.9MB/s]
    100%|##########| 3.42M/3.42M [00:00<00:00, 30.4MB/s]
     /workspace/python/tvm/relay/frontend/pytorch_utils.py:47: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
       return LooseVersion(torch_ver) > ver
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/setuptools/_distutils/version.py:346: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
@@ -322,7 +322,7 @@ Look up prediction top 1 index in 1000 class synset.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  8.520 seconds)
+   **Total running time of the script:** ( 1 minutes  9.130 seconds)
 
 
 .. _sphx_glr_download_how_to_work_with_microtvm_micro_pytorch.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
index 3eae88bbe6..53b217a1f2 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
@@ -218,7 +218,7 @@ take about **2 minutes** to download the Stanford Cars, while COCO 2017 validati
  .. code-block:: none
 
 
-    '/tmp/tmp9vmiouqx/images/random'
+    '/tmp/tmp9frj0wj1/images/random'
 
 
 
@@ -309,7 +309,7 @@ objects to other stuff? We can display some examples from our datasets using ``m
 
 
 .. image-sg:: /how_to/work_with_microtvm/images/sphx_glr_micro_train_001.png
-   :alt: [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]
+   :alt: [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]
    :srcset: /how_to/work_with_microtvm/images/sphx_glr_micro_train_001.png
    :class: sphx-glr-single-img
 
@@ -318,8 +318,8 @@ objects to other stuff? We can display some examples from our datasets using ``m
 
  .. code-block:: none
 
-    /tmp/tmp9vmiouqx/images/target contains 8144 images
-    /tmp/tmp9vmiouqx/images/random contains 5000 images
+    /tmp/tmp9frj0wj1/images/target contains 8144 images
+    /tmp/tmp9frj0wj1/images/random contains 5000 images
 
 
 
@@ -494,13 +494,13 @@ the time on our validation set).
  .. code-block:: none
 
     Epoch 1/3
-    328/328 - 47s - loss: 0.2139 - accuracy: 0.9263 - val_loss: 0.1829 - val_accuracy: 0.9305 - 47s/epoch - 143ms/step
+    328/328 - 47s - loss: 0.2199 - accuracy: 0.9239 - val_loss: 0.1185 - val_accuracy: 0.9581 - 47s/epoch - 142ms/step
     Epoch 2/3
-    328/328 - 43s - loss: 0.0909 - accuracy: 0.9648 - val_loss: 0.1100 - val_accuracy: 0.9668 - 43s/epoch - 132ms/step
+    328/328 - 43s - loss: 0.1003 - accuracy: 0.9617 - val_loss: 0.1516 - val_accuracy: 0.9486 - 43s/epoch - 132ms/step
     Epoch 3/3
-    328/328 - 43s - loss: 0.0661 - accuracy: 0.9757 - val_loss: 0.0951 - val_accuracy: 0.9645 - 43s/epoch - 131ms/step
+    328/328 - 43s - loss: 0.0737 - accuracy: 0.9728 - val_loss: 0.1310 - val_accuracy: 0.9615 - 43s/epoch - 132ms/step
 
-    <keras.callbacks.History object at 0x7f6790261dd0>
+    <keras.callbacks.History object at 0x7fe689e4a810>
 
 
 
@@ -857,7 +857,7 @@ Arduino tutorial for how to do that `on GitHub <https://github.com/guberti/tvm-a
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 4 minutes  21.567 seconds)
+   **Total running time of the script:** ( 4 minutes  29.197 seconds)
 
 
 .. _sphx_glr_download_how_to_work_with_microtvm_micro_train.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
index 590a08433e..5bc9adbb3c 100644
--- a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
@@ -5,18 +5,18 @@
 
 Computation times
 =================
-**06:34.382** total execution time for **how_to_work_with_microtvm** files:
+**06:42.755** total execution time for **how_to_work_with_microtvm** files:
 
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``)               | 04:21.567 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``)               | 04:29.197 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_pytorch.py` (``micro_pytorch.py``)           | 01:08.520 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_pytorch.py` (``micro_pytorch.py``)           | 01:09.130 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``)         | 00:51.553 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``)         | 00:51.744 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``)                   | 00:08.921 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``)                   | 00:08.855 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``)             | 00:03.821 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``)             | 00:03.827 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_work_with_microtvm_micro_tvmc.py` (``micro_tvmc.py``)                 | 00:00.000 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
@@ -24,3 +24,5 @@ Computation times
 +---------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_work_with_microtvm_micro_reference_vm.py` (``micro_reference_vm.py``) | 00:00.000 | 0.0 MB |
 +---------------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_mlperftiny.py` (``micro_mlperftiny.py``)     | 00:00.000 | 0.0 MB |
++---------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
index 1f82523d5a..0a737689d9 100644
--- a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
 
 Computation times
 =================
-**00:44.389** total execution time for **how_to_work_with_relay** files:
+**00:45.175** total execution time for **how_to_work_with_relay** files:
 
 +----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:32.546 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:32.926 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``)           | 00:10.301 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``)           | 00:10.433 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``)                             | 00:01.536 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``)                             | 00:01.810 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_work_with_relay_using_relay_viz.py` (``using_relay_viz.py``)                 | 00:00.006 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
index 578266748d..71015ef04f 100644
--- a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
@@ -264,7 +264,7 @@ The following example customizes CUDA lowering rule for :code:`exp`.
  .. code-block:: none
 
 
-    <function my_cuda_math_rule at 0x7f65bd33a9e0>
+    <function my_cuda_math_rule at 0x7fe68a33d290>
 
 
 
diff --git a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
index 33e64371e7..50bd7c02e8 100644
--- a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
@@ -5,22 +5,22 @@
 
 Computation times
 =================
-**00:07.835** total execution time for **how_to_work_with_schedules** files:
+**00:07.592** total execution time for **how_to_work_with_schedules** files:
 
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``)                 | 00:05.271 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``)                 | 00:05.077 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``)                     | 00:01.193 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``)                     | 00:01.155 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``)                     | 00:00.585 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``)                     | 00:00.577 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``)                               | 00:00.566 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``)                               | 00:00.562 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``)                     | 00:00.115 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``)                     | 00:00.116 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``) | 00:00.050 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``)                               | 00:00.031 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``)                               | 00:00.032 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``)               | 00:00.024 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
index 3030257226..43a87f122c 100644
--- a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
@@ -340,7 +340,7 @@ The importing needs to happen before the tensorized GEMV being executed.
             B_1 = T.match_buffer(B, (512, 64))
             C_1 = T.match_buffer(C, (1024, 512))
             i = T.var("int32")
-            T.attr(T.iter_var(i, None, "DataPar", ""), "pragma_import_llvm", "; ModuleID = '/tmp/tmpuyepqa17/input0.cc'\nsource_filename = \"/tmp/tmpuyepqa17/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = alloca float*, align 8\n  %8 = alloca float*, align 8\n  %9 = alloca float*, [...]
+            T.attr(T.iter_var(i, None, "DataPar", ""), "pragma_import_llvm", "; ModuleID = '/tmp/tmpgb2rg8u1/input0.cc'\nsource_filename = \"/tmp/tmpgb2rg8u1/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = alloca float*, align 8\n  %8 = alloca float*, align 8\n  %9 = alloca float*, [...]
             for i, j_outer in T.grid(1024, 32):
                 T.call_extern("int32", "gemv_update", T.tvm_access_ptr(T.type_annotation("float32"), C_1.data, i * 512 + j_outer * 16, 16, 2), T.tvm_access_ptr(T.type_annotation("float32"), A_1.data, i * 64, 64, 1), T.tvm_access_ptr(T.type_annotation("float32"), B_1.data, j_outer * 1024, 1024, 1), 16, 64, 64)
 
diff --git a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
index c6ba60e577..1641e21bee 100644
--- a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:29.322** total execution time for **topic_vta_tutorials_autotvm** files:
+**00:29.785** total execution time for **topic_vta_tutorials_autotvm** files:
 
 +---------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:29.315 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:29.778 | 0.0 MB |
 +---------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``)     | 00:00.007 | 0.0 MB |
 +---------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
index 50797cb050..d7d798679d 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
@@ -293,7 +293,7 @@ The compilation steps are:
       DeprecationWarning,
     /workspace/vta/tutorials/frontend/deploy_classification.py:213: DeprecationWarning: legacy graph executor behavior of producing json / lib / params will be removed in the next release. Please see documents of tvm.contrib.graph_executor.GraphModule for the  new recommended usage.
       relay_prog, target=tvm.target.Target(target, host=env.target_host), params=params
-    resnet18_v1 inference graph built in 31.27s!
+    resnet18_v1 inference graph built in 31.53s!
 
 
 
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
index d3eeba2fa3..4a3da4c7cc 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
@@ -337,7 +337,7 @@ The compilation steps are:
 
     /workspace/python/tvm/relay/build_module.py:348: DeprecationWarning: Please use input parameter mod (tvm.IRModule) instead of deprecated parameter mod (tvm.relay.function.Function)
       DeprecationWarning,
-    yolov3-tiny inference graph built in 21.55s!
+    yolov3-tiny inference graph built in 21.77s!
 
 
 
diff --git a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
index f600258914..263c94c995 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**01:36.356** total execution time for **topic_vta_tutorials_frontend** files:
+**01:37.002** total execution time for **topic_vta_tutorials_frontend** files:
 
 +------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``)           | 00:48.303 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``)           | 00:48.682 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:48.053 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:48.320 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
index 995431e573..f2497c6afb 100644
--- a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:03.220** total execution time for **topic_vta_tutorials_optimize** files:
+**00:03.152** total execution time for **topic_vta_tutorials_optimize** files:
 
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``)         | 00:02.751 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``)         | 00:02.693 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.469 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.459 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
index 3014b8d116..6f24491772 100644
--- a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:00.845** total execution time for **topic_vta_tutorials** files:
+**00:00.817** total execution time for **topic_vta_tutorials** files:
 
 +---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.453 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.434 | 0.0 MB |
 +---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.392 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.383 | 0.0 MB |
 +---------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
index 469b8d67b0..743c33ed7a 100644
--- a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
@@ -319,7 +319,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 95.304 ms
+    Execution time of this operator: 97.015 ms
 
 
 
@@ -419,7 +419,7 @@ resume the status and do more 5 trials.
     Resume search:
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/xgboost/training.py:17: UserWarning: Old style callback is deprecated.  See: https://xgboost.readthedocs.io/en/latest/python/callbacks.html
       warnings.warn(f'Old style callback is deprecated.  See: {link}', UserWarning)
-    *E
+
 
 
 
@@ -437,7 +437,7 @@ operations.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  30.676 seconds)
+   **Total running time of the script:** ( 1 minutes  20.459 seconds)
 
 
 .. _sphx_glr_download_tutorial_auto_scheduler_matmul_x86.py:
diff --git a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
index bb43288b3d..e9d292ee1d 100644
--- a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
@@ -454,16 +454,16 @@ reduce variance, we take 5 measurements and average them.
     waiting for device...
     device available
     Get devices for measurement successfully!
-    No: 1   GFLOPS: 12.67/12.67     result: MeasureResult(costs=(0.0211864856,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6081938743591309, timestamp=1674162973.2090986)       [('tile_y', [-1, 128]), ('tile_x', [-1, 256])],None,87
-    No: 2   GFLOPS: 12.73/12.73     result: MeasureResult(costs=(0.021089921,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6033353805541992, timestamp=1674162974.5552912)        [('tile_y', [-1, 32]), ('tile_x', [-1, 128])],None,75
-    No: 3   GFLOPS: 12.31/12.73     result: MeasureResult(costs=(0.0218037648,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5971829891204834, timestamp=1674162975.9178584)       [('tile_y', [-1, 256]), ('tile_x', [-1, 256])],None,88
-    No: 4   GFLOPS: 2.75/12.73      result: MeasureResult(costs=(0.0974450754,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8002643585205078, timestamp=1674162978.4827886)       [('tile_y', [-1, 4]), ('tile_x', [-1, 8])],None,32
-    No: 5   GFLOPS: 9.88/12.73      result: MeasureResult(costs=(0.027168696799999996,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.0152692794799805, timestamp=1674162979.6128342)       [('tile_y', [-1, 16]), ('tile_x', [-1, 128])],None,74
-    No: 6   GFLOPS: 1.92/12.73      result: MeasureResult(costs=(0.14010060880000003,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.4668102264404297, timestamp=1674162982.1046844)        [('tile_y', [-1, 1]), ('tile_x', [-1, 8])],None,30
-    No: 7   GFLOPS: 3.25/12.73      result: MeasureResult(costs=(0.08260961539999999,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.5741770267486572, timestamp=1674162983.6859846)        [('tile_y', [-1, 512]), ('tile_x', [-1, 16])],None,49
-    No: 8   GFLOPS: 11.70/12.73     result: MeasureResult(costs=(0.0229412996,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6466076374053955, timestamp=1674162984.3084815)       [('tile_y', [-1, 16]), ('tile_x', [-1, 256])],None,84
-    No: 9   GFLOPS: 2.10/12.73      result: MeasureResult(costs=(0.1278917254,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.270461320877075, timestamp=1674162986.69022)  [('tile_y', [-1, 8]), ('tile_x', [-1, 2])],None,13
-    No: 10  GFLOPS: 0.51/12.73      result: MeasureResult(costs=(0.5288641098,), error_no=MeasureErrorNo.NO_ERROR, all_cost=8.681216478347778, timestamp=1674162995.4015799)        [('tile_y', [-1, 128]), ('tile_x', [-1, 1])],None,7
+    No: 1   GFLOPS: 11.82/11.82     result: MeasureResult(costs=(0.0227024582,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6222848892211914, timestamp=1674173522.5541093)       [('tile_y', [-1, 256]), ('tile_x', [-1, 256])],None,88
+    No: 2   GFLOPS: 0.50/11.82      result: MeasureResult(costs=(0.5333215642,), error_no=MeasureErrorNo.NO_ERROR, all_cost=8.763290643692017, timestamp=1674173531.341208) [('tile_y', [-1, 256]), ('tile_x', [-1, 1])],None,8
+    No: 3   GFLOPS: 1.98/11.82      result: MeasureResult(costs=(0.13528974740000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.397447109222412, timestamp=1674173534.5228543) [('tile_y', [-1, 2]), ('tile_x', [-1, 4])],None,21
+    No: 4   GFLOPS: 13.79/13.79     result: MeasureResult(costs=(0.0194591336,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5597658157348633, timestamp=1674173535.0910823)       [('tile_y', [-1, 128]), ('tile_x', [-1, 64])],None,67
+    No: 5   GFLOPS: 7.38/13.79      result: MeasureResult(costs=(0.0363800002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.9181301593780518, timestamp=1674173536.297884)        [('tile_y', [-1, 1]), ('tile_x', [-1, 128])],None,70
+    No: 6   GFLOPS: 9.81/13.79      result: MeasureResult(costs=(0.0273711362,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.9397749900817871, timestamp=1674173537.7699847)       [('tile_y', [-1, 2]), ('tile_x', [-1, 128])],None,71
+    No: 7   GFLOPS: 11.62/13.79     result: MeasureResult(costs=(0.0231036638,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6368618011474609, timestamp=1674173539.1731124)       [('tile_y', [-1, 32]), ('tile_x', [-1, 256])],None,85
+    No: 8   GFLOPS: 9.00/13.79      result: MeasureResult(costs=(0.0298130234,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.786644697189331, timestamp=1674173539.9062722)        [('tile_y', [-1, 512]), ('tile_x', [-1, 64])],None,69
+    No: 9   GFLOPS: 11.59/13.79     result: MeasureResult(costs=(0.0231601226,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6171057224273682, timestamp=1674173540.6359928)       [('tile_y', [-1, 16]), ('tile_x', [-1, 256])],None,84
+    No: 10  GFLOPS: 3.58/13.79      result: MeasureResult(costs=(0.074947897,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.402517318725586, timestamp=1674173542.083198)  [('tile_y', [-1, 16]), ('tile_x', [-1, 8])],None,34
 
 
 
diff --git a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
index 83a2b87714..c46b3813de 100644
--- a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
@@ -311,7 +311,7 @@ standard deviation.
 
  .. code-block:: none
 
-    {'mean': 510.8778525799971, 'median': 510.608007299993, 'std': 1.4270214411428404}
+    {'mean': 512.2079894000011, 'median': 512.4371018000033, 'std': 2.4379111328350107}
 
 
 
@@ -545,30 +545,29 @@ the tuning data to.
 
  .. code-block:: none
 
-
    [Task  1/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  1/25]  Current/Best:   19.10/  19.10 GFLOPS | Progress: (4/20) | 7.39 s
    [Task  1/25]  Current/Best:    1.82/  22.72 GFLOPS | Progress: (8/20) | 11.51 s
    [Task  1/25]  Current/Best:    6.70/  22.72 GFLOPS | Progress: (12/20) | 15.42 s
    [Task  1/25]  Current/Best:    8.62/  22.72 GFLOPS | Progress: (16/20) | 18.71 s
    [Task  1/25]  Current/Best:   14.03/  22.72 GFLOPS | Progress: (20/20) | 21.97 s Done.
-
    [Task  2/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  2/25]  Current/Best:   20.03/  20.03 GFLOPS | Progress: (4/20) | 3.95 s
    [Task  2/25]  Current/Best:   12.24/  20.84 GFLOPS | Progress: (8/20) | 5.43 s
    [Task  2/25]  Current/Best:   14.56/  20.84 GFLOPS | Progress: (12/20) | 7.58 s
    [Task  2/25]  Current/Best:   22.32/  22.32 GFLOPS | Progress: (16/20) | 9.19 s
    [Task  2/25]  Current/Best:   19.79/  22.32 GFLOPS | Progress: (20/20) | 11.10 s Done.
-
    [Task  3/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  3/25]  Current/Best:   12.58/  19.21 GFLOPS | Progress: (4/20) | 4.10 s
    [Task  3/25]  Current/Best:   17.18/  20.46 GFLOPS | Progress: (8/20) | 6.09 s
    [Task  3/25]  Current/Best:    9.72/  20.46 GFLOPS | Progress: (12/20) | 8.34 s
    [Task  3/25]  Current/Best:   21.61/  21.61 GFLOPS | Progress: (16/20) | 10.73 s
    [Task  3/25]  Current/Best:   12.88/  23.43 GFLOPS | Progress: (20/20) | 12.89 s Done.
-
    [Task  4/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  4/25]  Current/Best:   16.14/  16.14 GFLOPS | Progress: (4/20) | 4.26 s
    [Task  4/25]  Current/Best:    8.40/  16.14 GFLOPS | Progress: (8/20) | 6.79 s
    [Task  4/25]  Current/Best:    5.15/  16.30 GFLOPS | Progress: (12/20) | 12.10 s
    [Task  4/25]  Current/Best:   11.96/  16.30 GFLOPS | Progress: (16/20) | 14.22 s
    [Task  4/25]  Current/Best:    6.76/  21.71 GFLOPS | Progress: (20/20) | 17.07 s Done.
-
    [Task  5/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  5/25]  Current/Best:    1.68/  17.02 GFLOPS | Progress: (4/20) | 4.55 s
    [Task  5/25]  Current/Best:    6.08/  17.02 GFLOPS | Progress: (8/20) | 6.85 s
    [Task  5/25]  Current/Best:   14.14/  17.02 GFLOPS | Progress: (12/20) | 9.93 s
    [Task  5/25]  Current/Best:    5.83/  18.48 GFLOPS | Progress: (16/20) | 11.96 s
    [Task  5/25]  Current/Best:    9.02/  18.48 GFLOPS | Progress: (20/20) | 13.78 s Done.
-
    [Task  6/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  6/25]  Current/Best:   17.72/  17.72 GFLOPS | Progress: (4/20) | 4.36 s
    [Task  6/25]  Current/Best:    8.89/  17.72 GFLOPS | Progress: (8/20) | 11.53 s
    [Task  6/25]  Current/Best:    6.07/  17.72 GFLOPS | Progress: (12/20) | 14.55 s
    [Task  6/25]  Current/Best:   11.63/  17.72 GFLOPS | Progress: (16/20) | 16.97 s
    [Task  6/25]  Current/Best:   17.12/  20.33 GFLOPS | Progress: (20/20) | 18.86 s Done.
-
    [Task  7/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  7/25]  Current/Best:    1.58/  16.66 GFLOPS | Progress: (4/20) | 5.24 s
    [Task  7/25]  Current/Best:   11.84/  18.50 GFLOPS | Progress: (8/20) | 8.34 s
    [Task  7/25]  Current/Best:   18.74/  18.74 GFLOPS | Progress: (12/20) | 11.10 s
    [Task  7/25]  Current/Best:   18.75/  18.75 GFLOPS | Progress: (16/20) | 13.26 s
    [Task  7/25]  Current/Best:    9.93/  21.74 GFLOPS | Progress: (20/20) | 15.34 s Done.
-
    [Task  8/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  8/25]  Current/Best:   11.66/  11.66 GFLOPS | Progress: (4/20) | 13.85 s
    [Task  8/25]  Current/Best:    5.61/  14.32 GFLOPS | Progress: (8/20) | 17.46 s
    [Task  8/25]  Current/Best:   13.72/  15.03 GFLOPS | Progress: (12/20) | 22.11 s
    [Task  8/25]  Current/Best:    5.95/  15.03 GFLOPS | Progress: (16/20) | 25.48 s
    [Task  8/25]  Current/Best:   11.95/  15.03 GFLOPS | Progress: (20/20) | 32.65 s
    [Task  9/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  9/25]  Current/Best:   21.99/  21.99 GFLOPS | Progress: (4/20) | 7.59 s
    [Task  9/25]  Current/Best:   10.48/  21.99 GFLOPS | Progress: (8/20) | 14.06 s
    [Task  9/25]  Current/Best:    6.82/  21.99 GFLOPS | Progress: (12/20) | 25.44 s
    [Task  9/25]  Current/Best:    5.84/  21.99 GFLOPS | Progress: (16/20) | 28.41 s
   [Task  9/25]  Current/Best:   20.75/  21.99 GFLOPS | Progress: (20/20) | 30.16 s
    [Task 10/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 10/25]  Current/Best:   14.94/  15.42 GFLOPS | Progress: (4/20) | 3.99 s
    [Task 10/25]  Current/Best:    6.68/  15.42 GFLOPS | Progress: (8/20) | 7.07 s
    [Task 10/25]  Current/Best:   16.15/  16.15 GFLOPS | Progress: (12/20) | 8.75 s
    [Task 10/25]  Current/Best:    5.31/  19.57 GFLOPS | Progress: (16/20) | 11.08 s
    [Task 10/25]  Current/Best:   16.59/  19.57 GFLOPS | Progress: (20/20) | 12.67 s Done.
-
    [Task 11/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 11/25]  Current/Best:   12.36/  12.36 GFLOPS | Progress: (4/20) | 5.73 s
    [Task 11/25]  Current/Best:   15.41/  15.41 GFLOPS | Progress: (8/20) | 8.46 s
    [Task 11/25]  Current/Best:   13.23/  17.80 GFLOPS | Progress: (12/20) | 11.00 s
    [Task 11/25]  Current/Best:   15.64/  17.80 GFLOPS | Progress: (16/20) | 13.15 s
    [Task 11/25]  Current/Best:   10.19/  17.80 GFLOPS | Progress: (20/20) | 15.83 s Done.
-
    [Task 12/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 12/25]  Current/Best:    9.42/  18.19 GFLOPS | Progress: (4/20) | 3.77 s
    [Task 12/25]  Current/Best:   20.34/  20.34 GFLOPS | Progress: (8/20) | 7.02 s
    [Task 12/25]  Current/Best:   10.87/  20.34 GFLOPS | Progress: (12/20) | 9.96 s
    [Task 12/25]  Current/Best:    1.51/  20.34 GFLOPS | Progress: (16/20) | 13.83 s
    [Task 12/25]  Current/Best:   11.36/  20.34 GFLOPS | Progress: (20/20) | 16.61 s Done.
-
    [Task 13/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 13/25]  Current/Best:    3.06/  16.36 GFLOPS | Progress: (4/20) | 4.98 s Done.
-     Done.
-
    [Task 13/25]  Current/Best:   21.93/  21.93 GFLOPS | Progress: (8/20) | 8.55 s
    [Task 13/25]  Current/Best:    1.57/  21.93 GFLOPS | Progress: (12/20) | 14.53 s
    [Task 13/25]  Current/Best:   20.16/  21.93 GFLOPS | Progress: (16/20) | 18.12 s
    [Task 13/25]  Current/Best:    8.67/  21.93 GFLOPS | Progress: (20/20) | 20.52 s Done.
-
    [Task 14/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 14/25]  Current/Best:   10.58/  16.35 GFLOPS | Progress: (4/20) | 3.40 s
    [Task 14/25]  Current/Best:   15.22/  16.35 GFLOPS | Progress: (8/20) | 7.45 s
    [Task 14/25]  Current/Best:    5.76/  16.35 GFLOPS | Progress: (12/20) | 11.73 s
    [Task 14/25]  Current/Best:    2.75/  16.35 GFLOPS | Progress: (16/20) | 14.99 s
    [Task 14/25]  Current/Best:   15.73/  16.35 GFLOPS | Progress: (20/20) | 17.20 s
    [Task 15/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 15/25]  Current/Best:    6.82/  19.41 GFLOPS | Progress: (4/20) | 7.47 s
    [Task 15/25]  Current/Best:   10.01/  19.41 GFLOPS | Progress: (8/20) | 9.32 s
    [Task 15/25]  Current/Best:   10.90/  20.43 GFLOPS | Progress: (12/20) | 11.64 s
    [Task 15/25]  Current/Best:    7.84/  20.43 GFLOPS | Progress: (16/20) | 16.06 s
   [Task 15/25]  Current/Best:   15.16/  20.43 GFLOPS | Progress: (20/20) | 18.54 s
    [Task 16/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 16/25]  Current/Best:   14.97/  14.97 GFLOPS | Progress: (4/20) | 4.27 s
    [Task 16/25]  Current/Best:   11.81/  17.87 GFLOPS | Progress: (8/20) | 6.16 s
    [Task 16/25]  Current/Best:   11.82/  17.87 GFLOPS | Progress: (12/20) | 8.16 s
    [Task 16/25]  Current/Best:   12.23/  17.87 GFLOPS | Progress: (16/20) | 10.70 s Done.
-     Done.
-
    [Task 16/25]  Current/Best:   13.14/  18.05 GFLOPS | Progress: (20/20) | 12.49 s Done.
-
    [Task 17/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 17/25]  Current/Best:   19.33/  19.33 GFLOPS | Progress: (4/20) | 4.15 s
    [Task 17/25]  Current/Best:   18.74/  19.33 GFLOPS | Progress: (8/20) | 7.16 s
    [Task 17/25]  Current/Best:    3.10/  19.33 GFLOPS | Progress: (12/20) | 10.26 s
    [Task 17/25]  Current/Best:    6.22/  23.86 GFLOPS | Progress: (16/20) | 12.59 s
    [Task 17/25]  Current/Best:   11.19/  23.86 GFLOPS | Progress: (20/20) | 16.29 s Done.
-
    [Task 18/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 18/25]  Current/Best:   16.03/  17.31 GFLOPS | Progress: (4/20) | 5.64 s
    [Task 18/25]  Current/Best:    9.82/  17.61 GFLOPS | Progress: (8/20) | 7.81 s
    [Task 18/25]  Current/Best:   12.20/  17.79 GFLOPS | Progress: (12/20) | 10.32 s
    [Task 18/25]  Current/Best:   14.46/  17.79 GFLOPS | Progress: (16/20) | 12.55 s
    [Task 18/25]  Current/Best:   16.76/  17.79 GFLOPS | Progress: (20/20) | 14.55 s Done.
-
    [Task 19/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 19/25]  Current/Best:    3.09/  13.64 GFLOPS | Progress: (4/20) | 9.05 s
    [Task 19/25]  Current/Best:   12.62/  18.18 GFLOPS | Progress: (8/20) | 11.95 s
    [Task 19/25]  Current/Best:   11.02/  20.23 GFLOPS | Progress: (12/20) | 14.84 s
    [Task 19/25]  Current/Best:   18.88/  20.23 GFLOPS | Progress: (16/20) | 19.12 s
    [Task 19/25]  Current/Best:   12.81/  20.23 GFLOPS | Progress: (20/20) | 25.96 s Done.
-
    [Task 20/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 20/25]  Current/Best:    9.86/   9.86 GFLOPS | Progress: (4/20) | 8.52 s
    [Task 20/25]  Current/Best:    7.68/   9.86 GFLOPS | Progress: (8/20) | 12.51 s
    [Task 20/25]  Current/Best:    9.92/  10.65 GFLOPS | Progress: (12/20) | 16.01 s
    [Task 20/25]  Current/Best:   11.62/  11.62 GFLOPS | Progress: (16/20) | 20.58 s
    [Task 20/25]  Current/Best:   12.27/  12.27 GFLOPS | Progress: (20/20) | 22.36 s
    [Task 21/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 21/25]  Current/Best:    6.36/  18.55 GFLOPS | Progress: (4/20) | 4.26 s
    [Task 21/25]  Current/Best:   20.69/  20.69 GFLOPS | Progress: (8/20) | 6.20 s
    [Task 21/25]  Current/Best:    8.95/  20.69 GFLOPS | Progress: (12/20) | 7.98 s
    [Task 21/25]  Current/Best:   20.06/  20.69 GFLOPS | Progress: (16/20) | 9.64 s
   [Task 21/25]  Current/Best:    8.74/  20.69 GFLOPS | Progress: (20/20) | 14.36 s
    [Task 22/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 22/25]  Current/Best:   10.41/  12.32 GFLOPS | Progress: (4/20) | 5.09 s
    [Task 22/25]  Current/Best:    2.40/  16.71 GFLOPS | Progress: (8/20) | 7.68 s
    [Task 22/25]  Current/Best:   14.85/  21.91 GFLOPS | Progress: (12/20) | 9.57 s
    [Task 22/25]  Current/Best:    6.19/  21.91 GFLOPS | Progress: (16/20) | 11.44 s
    [Task 22/25]  Current/Best:   12.39/  21.91 GFLOPS | Progress: (20/20) | 13.63 s Done.
-
    [Task 23/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 23/25]  Current/Best:   12.14/  21.45 GFLOPS | Progress: (4/20) | 4.80 s
    [Task 23/25]  Current/Best:   13.40/  21.45 GFLOPS | Progress: (8/20) | 7.11 s
    [Task 23/25]  Current/Best:   16.48/  23.90 GFLOPS | Progress: (12/20) | 10.25 s
    [Task 23/25]  Current/Best:   10.70/  23.90 GFLOPS | Progress: (16/20) | 14.26 s
    [Task 23/25]  Current/Best:   14.79/  23.90 GFLOPS | Progress: (20/20) | 17.86 s Done.
-
    [Task 24/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 24/25]  Current/Best:    3.34/   6.42 GFLOPS | Progress: (4/20) | 7.82 s
    [Task 24/25]  Current/Best:    3.41/   8.38 GFLOPS | Progress: (8/20) | 9.17 s
    [Task 24/25]  Current/Best:    1.16/   9.27 GFLOPS | Progress: (12/20) | 19.84 s
    [Task 24/25]  Current/Best:    5.17/   9.95 GFLOPS | Progress: (16/20) | 30.78 s Done.
-     Done.
-
    [Task 24/25]  Current/Best:    1.85/   9.95 GFLOPS | Progress: (20/20) | 42.27 s
    [Task 25/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 25/25]  Current/Best:    8.44/   9.99 GFLOPS | Progress: (4/20) | 5.82 s
    [Task 25/25]  Current/Best:    8.11/   9.99 GFLOPS | Progress: (8/20) | 11.50 s
    [Task 25/25]  Current/Best:    1.55/   9.99 GFLOPS | Progress: (12/20) | 13.17 s
    [Task 25/25]  Current/Best:    2.99/   9.99 GFLOPS | Progress: (16/20) | 18.63 s
    [Task 25/25]  Current/Best:    8.72/   9.99 GFLOPS | Progress: (20/20) | 29.28 s
+
    [Task  1/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  1/25]  Current/Best:   12.89/  17.10 GFLOPS | Progress: (4/20) | 7.57 s
    [Task  1/25]  Current/Best:    9.09/  17.97 GFLOPS | Progress: (8/20) | 12.27 s
    [Task  1/25]  Current/Best:   11.21/  22.27 GFLOPS | Progress: (12/20) | 14.39 s
    [Task  1/25]  Current/Best:    5.63/  22.27 GFLOPS | Progress: (16/20) | 18.13 s
    [Task  1/25]  Current/Best:   12.87/  22.27 GFLOPS | Progress: (20/20) | 21.43 s Done.
+
    [Task  2/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  2/25]  Current/Best:    8.96/  15.28 GFLOPS | Progress: (4/20) | 4.03 s
    [Task  2/25]  Current/Best:    3.30/  19.94 GFLOPS | Progress: (8/20) | 5.70 s
    [Task  2/25]  Current/Best:   11.99/  23.19 GFLOPS | Progress: (12/20) | 7.81 s
    [Task  2/25]  Current/Best:   10.68/  23.19 GFLOPS | Progress: (16/20) | 9.45 s
    [Task  2/25]  Current/Best:   12.32/  23.19 GFLOPS | Progress: (20/20) | 10.83 s Done.
+
    [Task  3/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  3/25]  Current/Best:   17.33/  17.33 GFLOPS | Progress: (4/20) | 4.08 s
    [Task  3/25]  Current/Best:    3.10/  22.78 GFLOPS | Progress: (8/20) | 7.84 s
    [Task  3/25]  Current/Best:   18.53/  22.78 GFLOPS | Progress: (12/20) | 10.33 s
    [Task  3/25]  Current/Best:   12.38/  22.78 GFLOPS | Progress: (16/20) | 13.01 s
    [Task  3/25]  Current/Best:    8.25/  22.78 GFLOPS | Progress: (20/20) | 16.16 s Done.
+
    [Task  4/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  4/25]  Current/Best:    6.52/  22.10 GFLOPS | Progress: (4/20) | 4.04 s
    [Task  4/25]  Current/Best:   14.61/  22.10 GFLOPS | Progress: (8/20) | 6.40 s
    [Task  4/25]  Current/Best:   11.67/  22.10 GFLOPS | Progress: (12/20) | 11.79 s
    [Task  4/25]  Current/Best:   17.22/  22.10 GFLOPS | Progress: (16/20) | 13.58 s
    [Task  4/25]  Current/Best:   11.33/  22.10 GFLOPS | Progress: (20/20) | 21.09 s Done.
+
    [Task  5/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  5/25]  Current/Best:   10.96/  18.91 GFLOPS | Progress: (4/20) | 3.87 s
    [Task  5/25]  Current/Best:   16.94/  18.91 GFLOPS | Progress: (8/20) | 6.47 s
    [Task  5/25]  Current/Best:    4.43/  18.91 GFLOPS | Progress: (12/20) | 8.71 s
    [Task  5/25]  Current/Best:    4.38/  18.91 GFLOPS | Progress: (16/20) | 11.28 s
    [Task  5/25]  Current/Best:   10.97/  18.91 GFLOPS | Progress: (20/20) | 13.18 s Done.
+
    [Task  6/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  6/25]  Current/Best:    2.39/  15.94 GFLOPS | Progress: (4/20) | 4.98 s
    [Task  6/25]  Current/Best:    5.92/  15.94 GFLOPS | Progress: (8/20) | 9.66 s
    [Task  6/25]  Current/Best:    4.78/  15.94 GFLOPS | Progress: (12/20) | 12.57 s
    [Task  6/25]  Current/Best:   17.79/  17.79 GFLOPS | Progress: (16/20) | 19.57 s
    [Task  6/25]  Current/Best:   20.14/  20.14 GFLOPS | Progress: (20/20) | 22.19 s Done.
+
    [Task  7/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  7/25]  Current/Best:   18.59/  18.59 GFLOPS | Progress: (4/20) | 4.29 s
    [Task  7/25]  Current/Best:   11.17/  18.59 GFLOPS | Progress: (8/20) | 6.72 s
    [Task  7/25]  Current/Best:   11.52/  18.59 GFLOPS | Progress: (12/20) | 9.78 s
    [Task  7/25]  Current/Best:   14.23/  22.64 GFLOPS | Progress: (16/20) | 12.62 s
    [Task  7/25]  Current/Best:   18.86/  22.64 GFLOPS | Progress: (20/20) | 14.63 s Done.
+
    [Task  8/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  8/25]  Current/Best:    6.17/  14.25 GFLOPS | Progress: (4/20) | 8.28 s
    [Task  8/25]  Current/Best:    9.73/  15.95 GFLOPS | Progress: (8/20) | 12.96 s
    [Task  8/25]  Current/Best:    2.65/  20.35 GFLOPS | Progress: (12/20) | 23.01 s
    [Task  8/25]  Current/Best:   18.64/  20.35 GFLOPS | Progress: (16/20) | 25.38 s
    [Task  8/25]  Current/Best:    9.79/  20.35 GFLOPS | Progress: (20/20) | 32.74 s Done.
+
    [Task  9/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  9/25]  Current/Best:   10.35/  21.47 GFLOPS | Progress: (4/20) | 3.49 s
    [Task  9/25]  Current/Best:    7.04/  21.47 GFLOPS | Progress: (8/20) | 9.50 s
    [Task  9/25]  Current/Best:    7.56/  21.47 GFLOPS | Progress: (12/20) | 13.80 s
    [Task  9/25]  Current/Best:   18.41/  22.63 GFLOPS | Progress: (16/20) | 16.64 s
    [Task  9/25]  Current/Best:   13.80/  22.63 GFLOPS | Progress: (20/20) | 18.92 s Done.
+
    [Task 10/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 10/25]  Current/Best:    5.15/  15.01 GFLOPS | Progress: (4/20) | 4.22 s
    [Task 10/25]  Current/Best:   10.88/  16.14 GFLOPS | Progress: (8/20) | 6.48 s
    [Task 10/25]  Current/Best:    5.22/  16.50 GFLOPS | Progress: (12/20) | 9.20 s
    [Task 10/25]  Current/Best:   13.77/  18.22 GFLOPS | Progress: (16/20) | 11.45 s
    [Task 10/25]  Current/Best:    9.70/  19.70 GFLOPS | Progress: (20/20) | 13.85 s Done.
+
    [Task 11/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 11/25]  Current/Best:    7.01/  23.16 GFLOPS | Progress: (4/20) | 4.25 s
    [Task 11/25]  Current/Best:    6.17/  23.95 GFLOPS | Progress: (8/20) | 6.87 s
    [Task 11/25]  Current/Best:    3.11/  23.95 GFLOPS | Progress: (12/20) | 10.89 s
    [Task 11/25]  Current/Best:    6.18/  23.95 GFLOPS | Progress: (16/20) | 14.62 s
    [Task 11/25]  Current/Best:   21.76/  23.95 GFLOPS | Progress: (20/20) | 17.73 s Done.
+
    [Task 12/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 12/25]  Current/Best:    1.59/  15.12 GFLOPS | Progress: (4/20) | 8.62 s
    [Task 12/25]  Current/Best:   14.48/  18.00 GFLOPS | Progress: (8/20) | 15.03 s
    [Task 12/25]  Current/Best:   19.05/  19.44 GFLOPS | Progress: (12/20) | 16.75 s
    [Task 12/25]  Current/Best:    6.10/  19.44 GFLOPS | Progress: (16/20) | 20.20 s
    [Task 12/25]  Current/Best:   10.71/  19.44 GFLOPS | Progress: (20/20) | 29.91 s Done.
+
    [Task 13/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 13/25]  Current/Best:   16.37/  16.37 GFLOPS | Progress: (4/20) | 5.34 s
    [Task 13/25]  Current/Best:   10.87/  16.37 GFLOPS | Progress: (8/20) | 8.61 s
    [Task 13/25]  Current/Best:    6.20/  18.29 GFLOPS | Progress: (12/20) | 12.23 s
    [Task 13/25]  Current/Best:   14.75/  18.29 GFLOPS | Progress: (16/20) | 16.23 s
    [Task 13/25]  Current/Best:   15.54/  18.29 GFLOPS | Progress: (20/20) | 19.29 s Done.
+
    [Task 14/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 14/25]  Current/Best:   12.54/  12.54 GFLOPS | Progress: (4/20) | 8.71 s
    [Task 14/25]  Current/Best:    8.35/  20.71 GFLOPS | Progress: (8/20) | 15.62 s
    [Task 14/25]  Current/Best:   14.70/  20.71 GFLOPS | Progress: (12/20) | 18.77 s
    [Task 14/25]  Current/Best:    4.66/  20.71 GFLOPS | Progress: (16/20) | 21.04 s
    [Task 14/25]  Current/Best:    5.19/  20.77 GFLOPS | Progress: (20/20) | 23.43 s
    [Task 15/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 15/25]  Current/Best:   16.31/  19.92 GFLOPS | Progress: (4/20) | 4.57 s
    [Task 15/25]  Current/Best:   16.42/  19.92 GFLOPS | Progress: (8/20) | 6.26 s Done.
+
    [Task 15/25]  Current/Best:   19.08/  19.92 GFLOPS | Progress: (12/20) | 8.86 s
    [Task 15/25]  Current/Best:   10.70/  23.81 GFLOPS | Progress: (16/20) | 11.51 s
    [Task 15/25]  Current/Best:   10.69/  23.81 GFLOPS | Progress: (20/20) | 13.06 s Done.
+
    [Task 16/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 16/25]  Current/Best:   14.15/  14.15 GFLOPS | Progress: (4/20) | 4.21 s
    [Task 16/25]  Current/Best:    1.58/  14.15 GFLOPS | Progress: (8/20) | 7.42 s
    [Task 16/25]  Current/Best:    1.57/  17.66 GFLOPS | Progress: (12/20) | 11.38 s
    [Task 16/25]  Current/Best:    4.73/  20.82 GFLOPS | Progress: (16/20) | 13.25 s
    [Task 16/25]  Current/Best:   14.30/  20.82 GFLOPS | Progress: (20/20) | 15.65 s Done.
+
    [Task 17/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 17/25]  Current/Best:    9.74/  16.64 GFLOPS | Progress: (4/20) | 4.59 s
    [Task 17/25]  Current/Best:   10.10/  19.31 GFLOPS | Progress: (8/20) | 7.31 s
    [Task 17/25]  Current/Best:    8.23/  19.31 GFLOPS | Progress: (12/20) | 9.87 s
    [Task 17/25]  Current/Best:   16.50/  19.31 GFLOPS | Progress: (16/20) | 12.68 s
    [Task 17/25]  Current/Best:    7.13/  19.31 GFLOPS | Progress: (20/20) | 15.71 s Done.
+
    [Task 18/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 18/25]  Current/Best:    6.32/  20.33 GFLOPS | Progress: (4/20) | 3.86 s
    [Task 18/25]  Current/Best:   17.07/  20.33 GFLOPS | Progress: (8/20) | 6.32 s
    [Task 18/25]  Current/Best:   13.56/  20.33 GFLOPS | Progress: (12/20) | 8.59 s
    [Task 18/25]  Current/Best:    9.44/  21.40 GFLOPS | Progress: (16/20) | 10.83 s
    [Task 18/25]  Current/Best:   16.07/  21.40 GFLOPS | Progress: (20/20) | 12.73 s Done.
+
    [Task 19/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 19/25]  Current/Best:   20.34/  20.34 GFLOPS | Progress: (4/20) | 7.51 s
    [Task 19/25]  Current/Best:   13.27/  20.34 GFLOPS | Progress: (8/20) | 10.21 s
    [Task 19/25]  Current/Best:    1.55/  20.34 GFLOPS | Progress: (12/20) | 14.43 s
    [Task 19/25]  Current/Best:   11.04/  20.34 GFLOPS | Progress: (16/20) | 20.60 s
    [Task 19/25]  Current/Best:   21.28/  22.64 GFLOPS | Progress: (20/20) | 25.25 s Done.
+
    [Task 20/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 20/25]  Current/Best:   14.63/  14.63 GFLOPS | Progress: (4/20) | 4.19 s
    [Task 20/25]  Current/Best:   19.49/  19.49 GFLOPS | Progress: (8/20) | 6.23 s
    [Task 20/25]  Current/Best:   10.06/  19.49 GFLOPS | Progress: (12/20) | 9.00 s
    [Task 20/25]  Current/Best:   18.32/  21.26 GFLOPS | Progress: (16/20) | 12.32 s
    [Task 20/25]  Current/Best:    6.80/  21.26 GFLOPS | Progress: (20/20) | 15.74 s
    [Task 21/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 21/25]  Current/Best:    7.59/  15.88 GFLOPS | Progress: (4/20) | 5.65 s Done.
+
    [Task 21/25]  Current/Best:    8.97/  17.91 GFLOPS | Progress: (8/20) | 7.97 s
    [Task 21/25]  Current/Best:   15.47/  17.91 GFLOPS | Progress: (12/20) | 9.95 s
    [Task 21/25]  Current/Best:   13.77/  18.76 GFLOPS | Progress: (16/20) | 12.39 s
    [Task 21/25]  Current/Best:    8.49/  18.76 GFLOPS | Progress: (20/20) | 14.63 s
    [Task 22/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 22/25]  Current/Best:   10.94/  16.93 GFLOPS | Progress: (4/20) | 4.31 s
    [Task 22/25]  Current/Best:    7.28/  16.93 GFLOPS | Progress: (8/20) | 6.08 s
    [Task 22/25]  Current/Best:    4.73/  21.34 GFLOPS | Progress: (12/20) | 8.25 s
    [Task 22/25]  Current/Best:    9.71/  21.34 GFLOPS | Progress: (16/20) | 11.20 s
    [Task 22/25]  Current/Best:   15.43/  21.34 GFLOPS | Progress: (20/20) | 13.21 s Done.
+
    [Task 23/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 23/25]  Current/Best:    5.57/  18.21 GFLOPS | Progress: (4/20) | 5.41 s
    [Task 23/25]  Current/Best:    6.15/  21.77 GFLOPS | Progress: (8/20) | 8.35 s
    [Task 23/25]  Current/Best:    5.02/  21.77 GFLOPS | Progress: (12/20) | 11.21 s
    [Task 23/25]  Current/Best:   18.52/  21.93 GFLOPS | Progress: (16/20) | 14.24 s
    [Task 23/25]  Current/Best:    6.10/  21.93 GFLOPS | Progress: (20/20) | 17.55 s Done.
+
    [Task 24/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 24/25]  Current/Best:    6.31/   6.31 GFLOPS | Progress: (4/20) | 12.75 s
    [Task 24/25]  Current/Best:    2.65/   6.70 GFLOPS | Progress: (8/20) | 24.89 s
    [Task 24/25]  Current/Best:    7.64/   7.64 GFLOPS | Progress: (12/20) | 35.86 s
    [Task 24/25]  Current/Best:    5.65/   7.64 GFLOPS | Progress: (16/20) | 47.57 s
    [Task 24/25]  Current/Best:    8.22/   9.97 GFLOPS | Progress: (20/20) | 59.43 s
    [Task 25/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 25/25]  Current/Best:    5.48/   5.48 GFLOPS | Progress: (4/20) | 6.88 s
    [Task 25/25]  Current/Best:    1.54/   8.82 GFLOPS | Progress: (8/20) | 8.49 s
    [Task 25/25]  Current/Best:    3.59/   8.82 GFLOPS | Progress: (12/20) | 19.44 s
    [Task 25/25]  Current/Best:    7.74/   8.82 GFLOPS | Progress: (16/20) | 30.38 s
   [Task 25/25]  Current/Best:    3.03/   8.82 GFLOPS | Progress: (20/20) | 32.49 s
 
 
 
@@ -664,8 +663,8 @@ Verify that the optimized model runs and produces the same results:
 
  .. code-block:: none
 
-    class='n02123045 tabby, tabby cat' with probability=0.621103
-    class='n02123159 tiger cat' with probability=0.356379
+    class='n02123045 tabby, tabby cat' with probability=0.621104
+    class='n02123159 tiger cat' with probability=0.356378
     class='n02124075 Egyptian cat' with probability=0.019712
     class='n02129604 tiger, Panthera tigris' with probability=0.001215
     class='n04040759 radiator' with probability=0.000262
@@ -722,8 +721,8 @@ improvement in comparing the optimized model to the unoptimized model.
 
  .. code-block:: none
 
-    optimized: {'mean': 425.82463468999777, 'median': 425.7106710500011, 'std': 0.8921467986797177}
-    unoptimized: {'mean': 510.8778525799971, 'median': 510.608007299993, 'std': 1.4270214411428404}
+    optimized: {'mean': 404.5399479699995, 'median': 404.1449735499896, 'std': 3.42485888438619}
+    unoptimized: {'mean': 512.2079894000011, 'median': 512.4371018000033, 'std': 2.4379111328350107}
 
 
 
@@ -746,7 +745,7 @@ profiling/benchmarking.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 11 minutes  47.645 seconds)
+   **Total running time of the script:** ( 12 minutes  11.809 seconds)
 
 
 .. _sphx_glr_download_tutorial_autotvm_relay_x86.py:
diff --git a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
index 48e95e2cb7..83138ebd3e 100644
--- a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
+++ b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
@@ -274,7 +274,7 @@ device and returns the measured cost. Network overhead is excluded.
 
  .. code-block:: none
 
-    1.241e-07 secs/op
+    1.28e-07 secs/op
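The ``secs/op`` figure is measured on the remote device over RPC with a ``time_evaluator``. A minimal sketch, assuming the module was built and exported to ``lib.tar`` earlier; the host and port are placeholders:

.. code-block:: python

    import numpy as np
    import tvm
    from tvm import rpc

    remote = rpc.connect("10.77.1.162", 9090)  # placeholder host/port
    remote.upload("lib.tar")
    func = remote.load_module("lib.tar")

    dev = remote.cpu()
    a = tvm.nd.array(np.random.uniform(size=1024).astype("float32"), dev)
    b = tvm.nd.array(np.zeros(1024, dtype="float32"), dev)

    # Runs the function repeatedly on the remote device; network overhead
    # is excluded from the reported cost.
    time_f = func.time_evaluator(func.entry_name, dev, number=10)
    print("%g secs/op" % time_f(a, b).mean)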
 
 
 
diff --git a/docs/_sources/tutorial/intro_topi.rst.txt b/docs/_sources/tutorial/intro_topi.rst.txt
index ebc0e22f58..0209fec57a 100644
--- a/docs/_sources/tutorial/intro_topi.rst.txt
+++ b/docs/_sources/tutorial/intro_topi.rst.txt
@@ -270,7 +270,7 @@ As you can see, scheduled stages of computation have been accumulated and we can
 
  .. code-block:: none
 
-    [stage(a, placeholder(a, 0x15f745b0)), stage(b, placeholder(b, 0x1063db50)), stage(T_add, compute(T_add, body=[a[ax0, ax1, ax2] + b[ax1, ax2]], axis=[T.iter_var(ax0, T.Range(0, 100), "DataPar", ""), T.iter_var(ax1, T.Range(0, 10), "DataPar", ""), T.iter_var(ax2, T.Range(0, 10), "DataPar", "")], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[a[ax0, ax1, ax2] * b[ax1, ax2]], axis=[T.iter_var(ax0, T.Range(0, 100), "DataPar", ""), T.iter_var(ax1, T [...]
+    [stage(a, placeholder(a, 0x271001e0)), stage(b, placeholder(b, 0x270c0f10)), stage(T_add, compute(T_add, body=[a[ax0, ax1, ax2] + b[ax1, ax2]], axis=[T.iter_var(ax0, T.Range(0, 100), "DataPar", ""), T.iter_var(ax1, T.Range(0, 10), "DataPar", ""), T.iter_var(ax2, T.Range(0, 10), "DataPar", "")], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[a[ax0, ax1, ax2] * b[ax1, ax2]], axis=[T.iter_var(ax0, T.Range(0, 100), "DataPar", ""), T.iter_var(ax1, T [...]
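The stage list above is what accumulates when broadcast TOPI operators are scheduled together. A minimal sketch that reproduces this kind of output (shapes match the ``(100, 10, 10)`` and ``(10, 10)`` placeholders shown):

.. code-block:: python

    import tvm
    from tvm import te, topi

    a = te.placeholder((100, 10, 10), name="a")
    b = te.placeholder((10, 10), name="b")
    c = topi.add(a, b)       # broadcast add -> the T_add stage
    d = topi.multiply(a, b)  # broadcast multiply -> the T_multiply stage

    sg = te.create_schedule([c.op, d.op])
    print(sg.stages)  # placeholders plus the accumulated compute stages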
 
 
 
diff --git a/docs/_sources/tutorial/sg_execution_times.rst.txt b/docs/_sources/tutorial/sg_execution_times.rst.txt
index a051cd0e82..1f75f9fae2 100644
--- a/docs/_sources/tutorial/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorial/sg_execution_times.rst.txt
@@ -5,32 +5,32 @@
 
 Computation times
 =================
-**15:21.436** total execution time for **tutorial** files:
+**15:35.251** total execution time for **tutorial** files:
 
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``)                 | 11:47.645 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``)                 | 12:11.809 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:30.676 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:20.459 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)     | 00:59.428 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)     | 01:00.967 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``)                 | 00:35.126 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``)                 | 00:35.443 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``)               | 00:26.967 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``)               | 00:24.450 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``)                               | 00:00.815 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``)       | 00:01.126 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``)       | 00:00.597 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``)                               | 00:00.828 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.181 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.169 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_tutorial_uma.py` (``uma.py``)                                             | 00:00.000 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``)   | 00:00.000 | 0.0 MB |
-+------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_tutorial_tvmc_python.py` (``tvmc_python.py``)                             | 00:00.000 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_tutorial_introduction.py` (``introduction.py``)                           | 00:00.000 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_tutorial_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``)   | 00:00.000 | 0.0 MB |
++------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_tutorial_install.py` (``install.py``)                                     | 00:00.000 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
index 7815346862..1bd1f80608 100644
--- a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
+++ b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
@@ -286,7 +286,7 @@ helper function to run a profile of the TVM generated code.
  .. code-block:: none
 
     Numpy running time: 0.000007
-    naive: 0.000009
+    naive: 0.000007
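The ``naive`` number is the default (unscheduled) vector add measured with a ``time_evaluator``. A minimal sketch, assuming an LLVM CPU target:

.. code-block:: python

    import numpy as np
    import tvm
    from tvm import te

    n = 32768
    A = te.placeholder((n,), name="A")
    B = te.placeholder((n,), name="B")
    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")

    s = te.create_schedule(C.op)
    fadd = tvm.build(s, [A, B, C], target="llvm", name="myadd")

    dev = tvm.cpu(0)
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)

    evaluator = fadd.time_evaluator(fadd.entry_name, dev, number=10)
    print("naive: %f" % evaluator(a, b, c).mean)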
 
 
 
@@ -389,7 +389,7 @@ compile and run this new schedule with the parallel operation applied:
 
  .. code-block:: none
 
-    parallel: 0.000006
+    parallel: 0.000007
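The ``parallel`` variant applies one schedule primitive on top of the same computation. A minimal sketch, reusing ``A``, ``B``, and ``C`` from the naive version:

.. code-block:: python

    s = te.create_schedule(C.op)
    s[C].parallel(C.op.axis[0])  # distribute the outer loop across CPU threads
    fadd_parallel = tvm.build(s, [A, B, C], target="llvm", name="myadd_parallel")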
 
 
 
@@ -498,10 +498,10 @@ We can now compare the different schedules
  .. code-block:: none
 
                 Operator                  Timing             Performance
-                   numpy    7.148830000005546e-06                    1.0
-                   naive              8.8166e-06      1.2332927206260549
-                parallel    6.070600000000001e-06     0.8491739207667955
-                  vector             2.47903e-05      3.4677422739078656
+                   numpy    6.711370001539763e-06                    1.0
+                   naive              6.6762e-06      0.9947596390108585
+                parallel    6.9535000000000004e-06     1.036077581537702
+                  vector    2.4626500000000002e-05    3.6693700383602814
 
 
 
@@ -922,7 +922,7 @@ matrix multiplication.
 
  .. code-block:: none
 
-    Numpy running time: 0.018116
+    Numpy running time: 0.018786
 
 
 
@@ -980,7 +980,7 @@ optimizations.
 
  .. code-block:: none
 
-    none: 3.289662
+    none: 3.399821
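The ``none`` time is the default schedule for a 1024x1024 matrix multiply: three naive nested loops. A minimal sketch of the computation that all of the following optimizations reschedule:

.. code-block:: python

    import tvm
    from tvm import te

    M = K = N = 1024
    k = te.reduce_axis((0, K), "k")
    A = te.placeholder((M, K), name="A")
    B = te.placeholder((K, N), name="B")
    C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C")

    s = te.create_schedule(C.op)  # default schedule: no blocking, no vectorization
    func = tvm.build(s, [A, B, C], target="llvm", name="mmult")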
 
 
 
@@ -1080,7 +1080,7 @@ schedule.
 
  .. code-block:: none
 
-    blocking: 0.303087
+    blocking: 0.300759
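``blocking`` tiles the loop nest so each 32x32 block of ``C`` stays in cache while its reduction runs. A minimal sketch, assuming ``s``, ``C``, and ``k`` from the matmul definition above:

.. code-block:: python

    bn = 32
    mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
    ko, ki = s[C].split(k, factor=4)
    s[C].reorder(mo, no, ko, ki, mi, ni)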
 
 
 
@@ -1164,7 +1164,7 @@ already cache friendly from our previous optimizations.
 
  .. code-block:: none
 
-    vectorization: 0.332680
+    vectorization: 0.344421
     @I.ir_module
     class Module:
         @T.prim_func
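``vectorization`` adds one more primitive to the blocked schedule so the innermost, unit-stride loop maps to SIMD lanes. A sketch continuing from the blocking step:

.. code-block:: python

    s[C].vectorize(ni)  # innermost loop over contiguous elements of C
    func = tvm.build(s, [A, B, C], target="llvm", name="mmult")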
@@ -1230,7 +1230,7 @@ more cache friendly.
 
  .. code-block:: none
 
-    loop permutation: 0.117245
+    loop permutation: 0.115957
     @I.ir_module
     class Module:
         @T.prim_func
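``loop permutation`` reorders the blocked loops so ``A`` is walked row by row instead of column by column. A sketch of the reorder, continuing from the same schedule:

.. code-block:: python

    # Move the reduction split (ki) between mi and ni so A's accesses
    # become sequential in memory.
    s[C].reorder(mo, no, ko, mi, ki, ni)
    s[C].vectorize(ni)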
@@ -1321,7 +1321,7 @@ optimized schedule.
 
  .. code-block:: none
 
-    array packing: 0.109664
+    array packing: 0.108478
     @I.ir_module
     class Module:
         @T.prim_func
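``array packing`` rearranges ``B`` into a ``[N/bn][K][bn]`` layout ahead of time so the inner loop reads it sequentially. A minimal sketch of the packed computation, replacing the earlier definition of ``C``:

.. code-block:: python

    packedB = te.compute(
        (N // bn, K, bn), lambda bigN, kk, littleN: B[kk, bigN * bn + littleN], name="packedB"
    )
    C = te.compute(
        (M, N),
        lambda m, n: te.sum(A[m, k] * packedB[n // bn, k, tvm.tir.indexmod(n, bn)], axis=k),
        name="C",
    )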
@@ -1404,7 +1404,7 @@ to `C` when all the block results are ready.
 
  .. code-block:: none
 
-    block caching: 0.110329
+    block caching: 0.110815
     @I.ir_module
     class Module:
         @T.prim_func
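``block caching`` accumulates each block into a small write buffer and flushes it to ``C`` once per block. A minimal sketch, starting from a fresh schedule of the packed computation:

.. code-block:: python

    s = te.create_schedule(C.op)
    CC = s.cache_write(C, "global")  # local accumulation buffer

    mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
    s[CC].compute_at(s[C], no)  # fill the buffer once per (mo, no) block

    mc, nc = s[CC].op.axis
    ko, ki = s[CC].split(s[CC].op.reduce_axis[0], factor=4)
    s[CC].reorder(ko, mc, ki, nc)
    s[CC].vectorize(nc)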
@@ -1478,7 +1478,7 @@ of thread-level parallelization.
 
  .. code-block:: none
 
-    parallelization: 0.146617
+    parallelization: 0.146343
     @I.ir_module
     class Module:
         @T.prim_func
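``parallelization`` then spreads the outermost block loop across CPU threads. A one-line sketch on top of the cached schedule:

.. code-block:: python

    s[C].parallel(mo)
    func = tvm.build(s, [A, B, C], target="llvm", name="mmult")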
@@ -1548,13 +1548,13 @@ working, we can compare the results.
  .. code-block:: none
 
                 Operator                  Timing             Performance
-                    none            3.2896616203                     1.0
-                blocking            0.3030872027     0.09213324581157378
-           vectorization            0.3326799013     0.10112891224042106
-        loop permutation     0.11724515129999999    0.035640489762381045
-           array packing     0.10966384069999999    0.033335903006947935
-           block caching     0.11032873110000001    0.033538018141190645
-         parallelization            0.1466170277     0.04456903007751579
+                    none            3.3998205908                     1.0
+                blocking            0.3007591159     0.08846323147576131
+           vectorization            0.3444209805     0.10130563401845727
+        loop permutation            0.1159568229      0.0341067476365612
+           array packing            0.1084781797     0.03190703062201125
+           block caching     0.11081503379999999      0.0325943769209082
+         parallelization            0.1463433178     0.04304442363694387
 
 
 
@@ -1594,6 +1594,11 @@ operations with tunable parameters that allows you to automatically optimize
 the computation for specific platforms.
 
 
+.. rst-class:: sphx-glr-timing
+
+   **Total running time of the script:** ( 1 minutes  0.967 seconds)
+
+
 .. _sphx_glr_download_tutorial_tensor_expr_get_started.py:
 
 .. only:: html
diff --git a/docs/commit_hash b/docs/commit_hash
index c9aa7731c0..7bdb802766 100644
--- a/docs/commit_hash
+++ b/docs/commit_hash
@@ -1 +1 @@
-c2eee01e6eb64b3dfc3e49a63035d0fa32900539
+cfa65b26c1bd975daaef78c60b16989be0d23970
diff --git a/docs/how_to/compile_models/from_darknet.html b/docs/how_to/compile_models/from_darknet.html
index fe655ecc8c..4a1fa62995 100644
--- a/docs/how_to/compile_models/from_darknet.html
+++ b/docs/how_to/compile_models/from_darknet.html
@@ -585,7 +585,7 @@ class:[&#39;truck 0.9266&#39;] left:471 top:83 right:689 bottom:169
 class:[&#39;bicycle 0.9984&#39;] left:111 top:113 right:577 bottom:447
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  15.692 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  16.257 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-darknet-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7716f96385bd5abb6e822041e285be54/from_darknet.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_darknet.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/from_keras.html b/docs/how_to/compile_models/from_keras.html
index 2a74d3e12d..20f7f0cc9d 100644
--- a/docs/how_to/compile_models/from_keras.html
+++ b/docs/how_to/compile_models/from_keras.html
@@ -506,7 +506,7 @@ Tensorflow is also required since it’s used as the default backend of keras.</
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Relay top-1 id: 285, class name: Egyptian cat
 
 1/1 [==============================] - ETA: 0s
-1/1 [==============================] - 1s 917ms/step
+1/1 [==============================] - 1s 958ms/step
 Keras top-1 id: 285, class name: Egyptian cat
 </pre></div>
 </div>
diff --git a/docs/how_to/compile_models/from_mxnet.html b/docs/how_to/compile_models/from_mxnet.html
index c55c7551f3..6804b8360c 100644
--- a/docs/how_to/compile_models/from_mxnet.html
+++ b/docs/how_to/compile_models/from_mxnet.html
@@ -439,7 +439,7 @@
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;x&quot;</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#tuple" title="builtins.tuple" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">x</span><span class="o">.</span><span class="n">shape</span></a><span class="p">)</span>
 </pre></div>
 </div>
-<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip66409e78-3c6c-484b-aaff-ffc414c3d71d from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip74b370c8-6b03-402b-8791-7ba7f0d5f123 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
 x (1, 3, 224, 224)
 </pre></div>
 </div>
diff --git a/docs/how_to/compile_models/from_oneflow.html b/docs/how_to/compile_models/from_oneflow.html
index f6e6111e8a..876d3eb660 100644
--- a/docs/how_to/compile_models/from_oneflow.html
+++ b/docs/how_to/compile_models/from_oneflow.html
@@ -449,14 +449,13 @@ Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdo
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: &quot;https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip&quot; to /workspace/.oneflow/flowvision_cache/resnet18.zip
 
   0%|          | 0.00/41.5M [00:00&lt;?, ?B/s]
- 15%|#5        | 6.33M/41.5M [00:00&lt;00:00, 53.8MB/s]
- 28%|##7       | 11.5M/41.5M [00:00&lt;00:00, 46.0MB/s]
- 38%|###8      | 15.9M/41.5M [00:00&lt;00:00, 43.9MB/s]
- 48%|####8     | 20.1M/41.5M [00:00&lt;00:00, 37.7MB/s]
- 58%|#####7    | 24.0M/41.5M [00:00&lt;00:00, 30.2MB/s]
- 77%|#######7  | 32.0M/41.5M [00:00&lt;00:00, 38.9MB/s]
- 92%|#########2| 38.3M/41.5M [00:00&lt;00:00, 44.0MB/s]
-100%|##########| 41.5M/41.5M [00:01&lt;00:00, 42.3MB/s]
+ 19%|#9        | 7.99M/41.5M [00:00&lt;00:00, 51.7MB/s]
+ 39%|###8      | 16.0M/41.5M [00:00&lt;00:00, 48.4MB/s]
+ 58%|#####7    | 24.0M/41.5M [00:00&lt;00:00, 45.0MB/s]
+ 77%|#######7  | 32.0M/41.5M [00:00&lt;00:00, 47.3MB/s]
+ 90%|########9 | 37.2M/41.5M [00:00&lt;00:00, 44.8MB/s]
+100%|#########9| 41.4M/41.5M [00:00&lt;00:00, 43.5MB/s]
+100%|##########| 41.5M/41.5M [00:00&lt;00:00, 45.3MB/s]
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/compile_models/from_pytorch.html b/docs/how_to/compile_models/from_pytorch.html
index d13412f190..a919ad73f4 100644
--- a/docs/how_to/compile_models/from_pytorch.html
+++ b/docs/how_to/compile_models/from_pytorch.html
@@ -432,12 +432,10 @@ be unstable.</p>
 Downloading: &quot;https://download.pytorch.org/models/resnet18-f37072fd.pth&quot; to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
 
   0%|          | 0.00/44.7M [00:00&lt;?, ?B/s]
- 18%|#7        | 7.99M/44.7M [00:00&lt;00:00, 59.1MB/s]
- 36%|###5      | 16.0M/44.7M [00:00&lt;00:00, 48.4MB/s]
- 54%|#####3    | 24.0M/44.7M [00:00&lt;00:00, 59.5MB/s]
- 72%|#######1  | 32.0M/44.7M [00:00&lt;00:00, 60.7MB/s]
- 90%|########9 | 40.0M/44.7M [00:00&lt;00:00, 58.3MB/s]
-100%|##########| 44.7M/44.7M [00:00&lt;00:00, 62.7MB/s]
+ 24%|##3       | 10.6M/44.7M [00:00&lt;00:00, 112MB/s]
+ 51%|#####1    | 22.9M/44.7M [00:00&lt;00:00, 122MB/s]
+ 77%|#######7  | 34.5M/44.7M [00:00&lt;00:00, 108MB/s]
+100%|##########| 44.7M/44.7M [00:00&lt;00:00, 108MB/s]
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/compile_models/from_tensorflow.html b/docs/how_to/compile_models/from_tensorflow.html
index e8d453bff9..06b96f6c76 100644
--- a/docs/how_to/compile_models/from_tensorflow.html
+++ b/docs/how_to/compile_models/from_tensorflow.html
@@ -649,7 +649,7 @@ banana (score = 0.00022)
 desk (score = 0.00019)
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  18.253 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  20.531 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-tensorflow-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7f1d3d1b878694c201c614c807cdebc8/from_tensorflow.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_tensorflow.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/sg_execution_times.html b/docs/how_to/compile_models/sg_execution_times.html
index 2d3118b8ef..b367a271a0 100644
--- a/docs/how_to/compile_models/sg_execution_times.html
+++ b/docs/how_to/compile_models/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-compile-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>06:10.782</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
+<p><strong>06:19.340</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 81%" />
@@ -349,43 +349,43 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></td>
-<td><p>01:18.253</p></td>
+<td><p>01:20.531</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></td>
-<td><p>01:15.692</p></td>
+<td><p>01:16.257</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_paddle.html#sphx-glr-how-to-compile-models-from-paddle-py"><span class="std std-ref">Compile PaddlePaddle Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_paddle.py</span></code>)</p></td>
-<td><p>00:50.434</p></td>
+<td><p>00:52.501</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_oneflow.html#sphx-glr-how-to-compile-models-from-oneflow-py"><span class="std std-ref">Compile OneFlow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_oneflow.py</span></code>)</p></td>
-<td><p>00:34.219</p></td>
+<td><p>00:35.465</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></td>
-<td><p>00:29.567</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></td>
+<td><p>00:30.357</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></td>
-<td><p>00:29.050</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></td>
+<td><p>00:30.149</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></td>
-<td><p>00:26.740</p></td>
+<td><p>00:26.165</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_pytorch.html#sphx-glr-how-to-compile-models-from-pytorch-py"><span class="std std-ref">Compile PyTorch Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_pytorch.py</span></code>)</p></td>
-<td><p>00:24.063</p></td>
+<td><p>00:24.732</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_keras.html#sphx-glr-how-to-compile-models-from-keras-py"><span class="std std-ref">Compile Keras Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_keras.py</span></code>)</p></td>
-<td><p>00:20.186</p></td>
+<td><p>00:20.531</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_onnx.html#sphx-glr-how-to-compile-models-from-onnx-py"><span class="std std-ref">Compile ONNX Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_onnx.py</span></code>)</p></td>
-<td><p>00:02.577</p></td>
+<td><p>00:02.651</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/deploy_models/deploy_model_on_adreno.html b/docs/how_to/deploy_models/deploy_model_on_adreno.html
index 067487cd49..6db8005f14 100644
--- a/docs/how_to/deploy_models/deploy_model_on_adreno.html
+++ b/docs/how_to/deploy_models/deploy_model_on_adreno.html
@@ -920,7 +920,7 @@ Top5 predictions:
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
- 2545.4545    2545.3663    2547.9816    2543.9309      1.2446
+ 2550.8269    2549.1817    2566.5008    2545.9899      5.4542
 </pre></div>
 </div>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-model-on-adreno-py">
diff --git a/docs/how_to/deploy_models/deploy_model_on_android.html b/docs/how_to/deploy_models/deploy_model_on_android.html
index 1d03a7e73a..c996eb1624 100644
--- a/docs/how_to/deploy_models/deploy_model_on_android.html
+++ b/docs/how_to/deploy_models/deploy_model_on_android.html
@@ -662,7 +662,7 @@ to the remote android device.</p>
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  15.6575      15.6385      15.8746      15.5296       0.0853
+  16.0102      15.8558      16.7513      15.6779       0.3372
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
index 68de77cfb7..c99332ed72 100644
--- a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
+++ b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
@@ -454,24 +454,24 @@ be unstable.</p>
 Downloading: &quot;https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth&quot; to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
 
   0%|          | 0.00/170M [00:00&lt;?, ?B/s]
-  5%|4         | 7.99M/170M [00:00&lt;00:02, 82.1MB/s]
-  9%|9         | 16.1M/170M [00:00&lt;00:01, 82.3MB/s]
- 15%|#5        | 26.1M/170M [00:00&lt;00:01, 92.4MB/s]
- 21%|##        | 35.0M/170M [00:00&lt;00:02, 59.3MB/s]
- 28%|##8       | 48.0M/170M [00:00&lt;00:01, 71.4MB/s]
- 36%|###5      | 61.1M/170M [00:00&lt;00:01, 88.1MB/s]
- 42%|####1     | 70.6M/170M [00:00&lt;00:01, 85.6MB/s]
- 47%|####6     | 79.5M/170M [00:01&lt;00:01, 87.2MB/s]
- 52%|#####1    | 88.3M/170M [00:01&lt;00:01, 81.7MB/s]
- 60%|######    | 102M/170M [00:01&lt;00:00, 99.3MB/s]
- 66%|######6   | 112M/170M [00:01&lt;00:00, 97.2MB/s]
- 72%|#######1  | 122M/170M [00:01&lt;00:00, 96.3MB/s]
- 77%|#######7  | 132M/170M [00:01&lt;00:00, 96.7MB/s]
- 83%|########3 | 141M/170M [00:01&lt;00:00, 98.5MB/s]
- 89%|########8 | 151M/170M [00:01&lt;00:00, 88.0MB/s]
- 94%|#########4| 160M/170M [00:01&lt;00:00, 76.6MB/s]
- 99%|#########8| 168M/170M [00:02&lt;00:00, 73.4MB/s]
-100%|##########| 170M/170M [00:02&lt;00:00, 83.2MB/s]
+  5%|4         | 7.99M/170M [00:00&lt;00:02, 74.9MB/s]
+  9%|8         | 15.1M/170M [00:00&lt;00:02, 59.8MB/s]
+ 13%|#3        | 22.9M/170M [00:00&lt;00:02, 68.1MB/s]
+ 18%|#7        | 30.3M/170M [00:00&lt;00:02, 53.7MB/s]
+ 21%|##1       | 35.9M/170M [00:00&lt;00:03, 46.6MB/s]
+ 24%|##3       | 40.7M/170M [00:00&lt;00:02, 47.0MB/s]
+ 33%|###2      | 56.0M/170M [00:00&lt;00:01, 74.1MB/s]
+ 38%|###7      | 64.0M/170M [00:01&lt;00:01, 74.4MB/s]
+ 43%|####3     | 73.4M/170M [00:01&lt;00:01, 80.8MB/s]
+ 52%|#####1    | 88.0M/170M [00:01&lt;00:00, 96.1MB/s]
+ 61%|######1   | 104M/170M [00:01&lt;00:00, 99.5MB/s]
+ 67%|######6   | 114M/170M [00:01&lt;00:00, 88.9MB/s]
+ 72%|#######1  | 122M/170M [00:01&lt;00:00, 87.8MB/s]
+ 77%|#######6  | 131M/170M [00:01&lt;00:00, 70.9MB/s]
+ 81%|########1 | 138M/170M [00:01&lt;00:00, 72.0MB/s]
+ 86%|########5 | 145M/170M [00:02&lt;00:00, 70.0MB/s]
+ 94%|#########4| 160M/170M [00:02&lt;00:00, 84.1MB/s]
+100%|##########| 170M/170M [00:02&lt;00:00, 78.1MB/s]
 /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/nn/functional.py:3897: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
   for i in range(dim)
 /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/detection/anchor_utils.py:124: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the &#39;trunc&#39; function NOT &#39;floor&#39;). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode=&#39;trunc&#39;), or for actual floor division, use torch.div(a, b, rounding_mode=& [...]
@@ -569,7 +569,7 @@ torchvision rcnn models.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Get 9 valid boxes
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  22.431 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  28.344 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-object-detection-pytorch-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7795da4b258c8feff986668b95ef57ad/deploy_object_detection_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_object_detection_pytorch.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized.html b/docs/how_to/deploy_models/deploy_prequantized.html
index 7c57df1a95..1450cffda8 100644
--- a/docs/how_to/deploy_models/deploy_prequantized.html
+++ b/docs/how_to/deploy_models/deploy_prequantized.html
@@ -495,8 +495,9 @@ training. Other models require a full post training calibration.</p>
 Downloading: &quot;https://download.pytorch.org/models/mobilenet_v2-b0353104.pth&quot; to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
 
   0%|          | 0.00/13.6M [00:00&lt;?, ?B/s]
- 59%|#####8    | 7.99M/13.6M [00:00&lt;00:00, 61.9MB/s]
-100%|##########| 13.6M/13.6M [00:00&lt;00:00, 64.4MB/s]
+ 47%|####6     | 6.30M/13.6M [00:00&lt;00:00, 53.4MB/s]
+ 84%|########4 | 11.4M/13.6M [00:00&lt;00:00, 50.5MB/s]
+100%|##########| 13.6M/13.6M [00:00&lt;00:00, 39.7MB/s]
 </pre></div>
 </div>
 </div>
@@ -587,7 +588,7 @@ output values are identical out of 1000 outputs from mobilenet v2.</p>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  90.5913      90.3806      99.0003      90.0003       1.0226
+  90.2068      90.1677      90.8147      90.0469       0.1458
 </pre></div>
 </div>
 <div class="admonition note">
@@ -626,7 +627,7 @@ This includes support for the VNNI 8 bit dot product instruction (CascadeLake or
 <div class="section" id="deploy-a-quantized-tflite-model">
 <h2>Deploy a quantized TFLite Model<a class="headerlink" href="#deploy-a-quantized-tflite-model" title="Permalink to this headline">¶</a></h2>
 <p>TODO</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  11.835 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  12.919 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/fb8217c13f4351224c6cf3aacf1a87fc/deploy_prequantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized_tflite.html b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
index 1a1387de11..bb2b0351de 100644
--- a/docs/how_to/deploy_models/deploy_prequantized_tflite.html
+++ b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
@@ -580,7 +580,7 @@ TFLite Top-5 labels: [387 102 386 341 349]
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  119.7466     119.7560     124.1398     117.9933      0.6389
+  119.6592     119.5643     126.8230     119.0097      0.8249
 </pre></div>
 </div>
 <div class="admonition note">
@@ -608,7 +608,7 @@ network for ARM CPU</span></a>.</p></li>
 </ul>
 </div></blockquote>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  40.818 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  32.610 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-tflite-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/56691c7a27d45da61d112276334640d3/deploy_prequantized_tflite.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized_tflite.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_quantized.html b/docs/how_to/deploy_models/deploy_quantized.html
index 49ad15181d..9e0912f43a 100644
--- a/docs/how_to/deploy_models/deploy_quantized.html
+++ b/docs/how_to/deploy_models/deploy_quantized.html
@@ -521,7 +521,7 @@ for calibration. But the accuracy might be impacted.</p>
   DeprecationWarning,
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  30.392 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  40.483 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-quantized-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7810ecf51bfc05f7d5e8a400ac3e815d/deploy_quantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_quantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
index 174f6832dc..9c67559d98 100644
--- a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
+++ b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
@@ -463,23 +463,22 @@ to your device.</p>
 Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
 
   0%|          | 0/132723 [00:00&lt;?, ?KB/s]
-  5%|4         | 6457/132723 [00:00&lt;00:01, 64555.75KB/s]
- 11%|#         | 14541/132723 [00:00&lt;00:01, 74125.57KB/s]
- 17%|#7        | 22584/132723 [00:00&lt;00:01, 77002.52KB/s]
- 23%|##3       | 30612/132723 [00:00&lt;00:01, 78294.66KB/s]
- 29%|##9       | 38649/132723 [00:00&lt;00:01, 79039.49KB/s]
- 35%|###5      | 46553/132723 [00:00&lt;00:01, 78975.48KB/s]
- 41%|####1     | 54619/132723 [00:00&lt;00:00, 79521.24KB/s]
- 47%|####7     | 62721/132723 [00:00&lt;00:00, 79996.31KB/s]
- 53%|#####3    | 70817/132723 [00:00&lt;00:00, 80293.85KB/s]
- 59%|#####9    | 78847/132723 [00:01&lt;00:00, 80269.78KB/s]
- 66%|######5   | 86990/132723 [00:01&lt;00:00, 80619.81KB/s]
- 72%|#######1  | 95071/132723 [00:01&lt;00:00, 80675.10KB/s]
- 78%|#######7  | 103249/132723 [00:01&lt;00:00, 81005.19KB/s]
- 84%|########3 | 111350/132723 [00:01&lt;00:00, 80907.17KB/s]
- 90%|######### | 119477/132723 [00:01&lt;00:00, 81012.32KB/s]
- 96%|#########6| 127579/132723 [00:01&lt;00:00, 80864.64KB/s]
-100%|##########| 132723/132723 [00:01&lt;00:00, 79718.08KB/s]
+  5%|5         | 6645/132723 [00:00&lt;00:01, 66444.68KB/s]
+ 12%|#1        | 15500/132723 [00:00&lt;00:01, 79443.08KB/s]
+ 18%|#8        | 24276/132723 [00:00&lt;00:01, 83238.71KB/s]
+ 25%|##4       | 33050/132723 [00:00&lt;00:01, 85011.97KB/s]
+ 31%|###1      | 41552/132723 [00:00&lt;00:01, 84301.38KB/s]
+ 38%|###7      | 49984/132723 [00:00&lt;00:01, 81095.71KB/s]
+ 44%|####4     | 58717/132723 [00:00&lt;00:00, 83069.29KB/s]
+ 51%|#####     | 67526/132723 [00:00&lt;00:00, 84632.76KB/s]
+ 57%|#####7    | 76271/132723 [00:00&lt;00:00, 85499.51KB/s]
+ 64%|######4   | 85023/132723 [00:01&lt;00:00, 86114.52KB/s]
+ 71%|#######   | 93817/132723 [00:01&lt;00:00, 86665.19KB/s]
+ 77%|#######7  | 102637/132723 [00:01&lt;00:00, 87119.68KB/s]
+ 84%|########3 | 111464/132723 [00:01&lt;00:00, 87464.64KB/s]
+ 91%|######### | 120251/132723 [00:01&lt;00:00, 87584.54KB/s]
+ 97%|#########7| 129109/132723 [00:01&lt;00:00, 87880.14KB/s]
+100%|##########| 132723/132723 [00:01&lt;00:00, 85230.00KB/s]
 </pre></div>
 </div>
 <p>Create TVM runtime and do inference
@@ -518,7 +517,7 @@ Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from h
 <span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
 </pre></div>
 </div>
-<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  27.363 seconds)</p>
+<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  29.026 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-ssd-gluoncv-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/cccb17d28e5e8b2e94ea8cd5ec59f6ed/deploy_ssd_gluoncv.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_ssd_gluoncv.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/sg_execution_times.html b/docs/how_to/deploy_models/sg_execution_times.html
index ce1176fea8..2afdb80e1a 100644
--- a/docs/how_to/deploy_models/sg_execution_times.html
+++ b/docs/how_to/deploy_models/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-deploy-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>14:39.203</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
+<p><strong>14:51.021</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 86%" />
@@ -349,39 +349,39 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_ssd_gluoncv.html#sphx-glr-how-to-deploy-models-deploy-ssd-gluoncv-py"><span class="std std-ref">Deploy Single Shot Multibox Detector(SSD) model</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_ssd_gluoncv.py</span></code>)</p></td>
-<td><p>03:27.363</p></td>
+<td><p>03:29.026</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_object_detection_pytorch.html#sphx-glr-how-to-deploy-models-deploy-object-detection-pytorch-py"><span class="std std-ref">Compile PyTorch Object Detection Models</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_object_detection_pytorch.py</span></code>)</p></td>
-<td><p>03:22.431</p></td>
+<td><p>03:28.344</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized_tflite.html#sphx-glr-how-to-deploy-models-deploy-prequantized-tflite-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized_tflite.py</span></code>)</p></td>
-<td><p>02:40.818</p></td>
+<td><p>02:32.610</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_quantized.html#sphx-glr-how-to-deploy-models-deploy-quantized-py"><span class="std std-ref">Deploy a Quantized Model on Cuda</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_quantized.py</span></code>)</p></td>
-<td><p>01:30.392</p></td>
+<td><p>01:40.483</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized.html#sphx-glr-how-to-deploy-models-deploy-prequantized-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized.py</span></code>)</p></td>
-<td><p>01:11.835</p></td>
+<td><p>01:12.919</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_adreno.html#sphx-glr-how-to-deploy-models-deploy-model-on-adreno-py"><span class="std std-ref">Deploy the Pretrained Model on Adreno</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_adreno.py</span></code>)</p></td>
-<td><p>00:53.054</p></td>
+<td><p>00:53.319</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_model_on_android.html#sphx-glr-how-to-deploy-models-deploy-model-on-android-py"><span class="std std-ref">Deploy the Pretrained Model on Android</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_android.py</span></code>)</p></td>
-<td><p>00:39.334</p></td>
+<td><p>00:39.824</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_nano.html#sphx-glr-how-to-deploy-models-deploy-model-on-nano-py"><span class="std std-ref">Deploy the Pretrained Model on Jetson Nano</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_nano.py</span></code>)</p></td>
-<td><p>00:27.186</p></td>
+<td><p>00:27.427</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_model_on_rasp.html#sphx-glr-how-to-deploy-models-deploy-model-on-rasp-py"><span class="std std-ref">Deploy the Pretrained Model on Raspberry Pi</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_rasp.py</span></code>)</p></td>
-<td><p>00:26.783</p></td>
+<td><p>00:27.063</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_sparse.html#sphx-glr-how-to-deploy-models-deploy-sparse-py"><span class="std std-ref">Deploy a Hugging Face Pruned Model on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_sparse.py</span></code>)</p></td>
diff --git a/docs/how_to/extend_tvm/bring_your_own_datatypes.html b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
index 641abf6248..ab570a2268 100644
--- a/docs/how_to/extend_tvm/bring_your_own_datatypes.html
+++ b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
@@ -619,7 +619,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
 <span class="n">module</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#dict" title="builtins.dict" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">params</span></a> <span class="o">=</span> <span class="n">get_mobilenet</span><span class="p">()</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zipa1643410-c026-4394-a685-fa334720df8f from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zipcb3154e2-9390-47b4-8140-e0c63d32f07a from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
 </pre></div>
 </div>
 <p>It’s easy to execute MobileNet with native TVM:</p>
diff --git a/docs/how_to/extend_tvm/sg_execution_times.html b/docs/how_to/extend_tvm/sg_execution_times.html
index b3b0dc3fc6..f7d6cab4ff 100644
--- a/docs/how_to/extend_tvm/sg_execution_times.html
+++ b/docs/how_to/extend_tvm/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-extend-tvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:51.247</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
+<p><strong>00:52.015</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 84%" />
@@ -349,15 +349,15 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="bring_your_own_datatypes.html#sphx-glr-how-to-extend-tvm-bring-your-own-datatypes-py"><span class="std std-ref">Bring Your Own Datatypes to TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">bring_your_own_datatypes.py</span></code>)</p></td>
-<td><p>00:47.558</p></td>
+<td><p>00:48.296</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="use_pass_instrument.html#sphx-glr-how-to-extend-tvm-use-pass-instrument-py"><span class="std std-ref">How to Use TVM Pass Instrument</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_instrument.py</span></code>)</p></td>
-<td><p>00:02.625</p></td>
+<td><p>00:02.653</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="use_pass_infra.html#sphx-glr-how-to-extend-tvm-use-pass-infra-py"><span class="std std-ref">How to Use TVM Pass Infra</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_infra.py</span></code>)</p></td>
-<td><p>00:01.057</p></td>
+<td><p>00:01.060</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="low_level_custom_pass.html#sphx-glr-how-to-extend-tvm-low-level-custom-pass-py"><span class="std std-ref">Writing a Customized Pass</span></a> (<code class="docutils literal notranslate"><span class="pre">low_level_custom_pass.py</span></code>)</p></td>
diff --git a/docs/how_to/extend_tvm/use_pass_instrument.html b/docs/how_to/extend_tvm/use_pass_instrument.html
index ce8ee4ee59..3195fc97d4 100644
--- a/docs/how_to/extend_tvm/use_pass_instrument.html
+++ b/docs/how_to/extend_tvm/use_pass_instrument.html
@@ -526,10 +526,10 @@ profile the execution time of each passes.</p>
 </pre></div>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 18341us [18341us] (48.70%; 48.70%)
-FoldScaleAxis: 19321us [7us] (51.30%; 51.30%)
-        FoldConstant: 19314us [1666us] (51.28%; 99.97%)
-                InferType: 17648us [17648us] (46.86%; 91.37%)
+InferType: 18158us [18158us] (48.55%; 48.55%)
+FoldScaleAxis: 19244us [7us] (51.45%; 51.45%)
+        FoldConstant: 19237us [1733us] (51.43%; 99.96%)
+                InferType: 17505us [17505us] (46.80%; 90.99%)
 </pre></div>
 </div>
 </div>
@@ -551,10 +551,10 @@ Refer to following sections and <a class="reference internal" href="../../refere
 </pre></div>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 18065us [18065us] (48.66%; 48.66%)
-FoldScaleAxis: 19058us [5us] (51.34%; 51.34%)
-        FoldConstant: 19053us [1658us] (51.32%; 99.97%)
-                InferType: 17395us [17395us] (46.86%; 91.30%)
+InferType: 17363us [17363us] (47.85%; 47.85%)
+FoldScaleAxis: 18921us [5us] (52.15%; 52.15%)
+        FoldConstant: 18916us [1709us] (52.13%; 99.97%)
+                InferType: 17208us [17208us] (47.42%; 90.97%)
 </pre></div>
 </div>
 <p>Register empty list to clear existing instruments.</p>
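The nested timing profiles above come from TVM's pass instrumentation. A minimal sketch of collecting such a profile with ``PassTimingInstrument`` (``mod`` is assumed to be a Relay module from earlier in the tutorial):

.. code-block:: python

    import tvm
    from tvm import relay

    timing_inst = tvm.ir.instrument.PassTimingInstrument()
    with tvm.transform.PassContext(opt_level=3, instruments=[timing_inst]):
        mod = relay.transform.InferType()(mod)
        mod = relay.transform.FoldScaleAxis()(mod)
        profiles = timing_inst.render()  # must be rendered inside the context

    print("Printing results of timing profile...")
    print(profiles)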
diff --git a/docs/how_to/optimize_operators/opt_conv_cuda.html b/docs/how_to/optimize_operators/opt_conv_cuda.html
index 14ba4e9f3f..7bab61c423 100644
--- a/docs/how_to/optimize_operators/opt_conv_cuda.html
+++ b/docs/how_to/optimize_operators/opt_conv_cuda.html
@@ -575,7 +575,7 @@ latency of convolution.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Convolution: </span><span class="si">%f</span><span class="s2"> ms&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">b</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">*</span> <span cl [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 54.193183 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 44.062721 ms
 </pre></div>
 </div>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-optimize-operators-opt-conv-cuda-py">
diff --git a/docs/how_to/optimize_operators/opt_conv_tensorcore.html b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
index a4edc7aadd..179e3441eb 100644
--- a/docs/how_to/optimize_operators/opt_conv_tensorcore.html
+++ b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
@@ -867,7 +867,7 @@ be able to run on our build server</p>
     <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;conv2d with tensor core: </span><span class="si">%f</span><span class="s2"> ms&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">* [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 8.941158 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 10.772263 ms
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/optimize_operators/opt_gemm.html b/docs/how_to/optimize_operators/opt_gemm.html
index d2ee313b83..aa51fd460d 100644
--- a/docs/how_to/optimize_operators/opt_gemm.html
+++ b/docs/how_to/optimize_operators/opt_gemm.html
@@ -472,8 +472,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Baseline: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018154
-Baseline: 3.297522
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018628
+Baseline: 3.397229
 </pre></div>
 </div>
 <p>In TVM, we can always inspect lower level IR to debug or optimize our schedule.
@@ -532,7 +532,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt1: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.298674
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.295933
 </pre></div>
 </div>
 <p>Here is the generated IR after blocking.</p>
@@ -589,7 +589,7 @@ vastly.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt2: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.336746
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.332504
 </pre></div>
 </div>
 <p>Here is the generated IR after vectorization.</p>
@@ -644,7 +644,7 @@ the access pattern for A matrix is more cache friendly.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt3: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.115573
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.116739
 </pre></div>
 </div>
 <p>Here is the generated IR after loop permutation.</p>
@@ -721,7 +721,7 @@ flattening.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt4: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.110064
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.109848
 </pre></div>
 </div>
 <p>Here is the generated IR after array packing.</p>
@@ -799,7 +799,7 @@ write to C when all the block results are ready.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt5: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111653
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.112016
 </pre></div>
 </div>
 <p>Here is the generated IR after blocking.</p>
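(Opt5 adds the write cache described above: partial sums accumulate in a small buffer CC and are written back to C only once a whole bn x bn block is finished. A sketch using cache_write into "global" scope as the tutorial does, shown on the plain GEMM for brevity:)

    import tvm
    from tvm import te

    M = K = N = 1024
    bn = 32
    A = te.placeholder((M, K), name="A")
    B = te.placeholder((K, N), name="B")
    k = te.reduce_axis((0, K), name="k")
    C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C")

    s = te.create_schedule(C.op)
    CC = s.cache_write(C, "global")  # Opt5: accumulate block results in CC
    mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
    s[CC].compute_at(s[C], no)       # materialize one bn x bn block per (mo, no)
    mc, nc = s[CC].op.axis
    ko, ki = s[CC].split(s[CC].op.reduce_axis[0], factor=4)
    s[CC].reorder(ko, mc, ki, nc)
    s[CC].vectorize(nc)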
@@ -879,7 +879,7 @@ class Module:
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt6: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">opt6_time</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.148195
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.147469
 </pre></div>
 </div>
 <p>Here is the generated IR after parallelization.</p>
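(Opt6 parallelizes the outer block axis across CPU threads, and the Opt1-Opt6 figures in all of these hunks come from TVM's time_evaluator. A sketch of both, simplified to the unpacked schedule and assuming an llvm target on the build machine:)

    import numpy as np
    import tvm
    from tvm import te

    M = K = N = 1024
    bn = 32
    A = te.placeholder((M, K), name="A")
    B = te.placeholder((K, N), name="B")
    k = te.reduce_axis((0, K), name="k")
    C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C")

    s = te.create_schedule(C.op)
    mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
    ko, ki = s[C].split(k, factor=4)
    s[C].reorder(mo, no, ko, mi, ki, ni)
    s[C].vectorize(ni)
    s[C].parallel(mo)  # Opt6: spread the outer block rows across threads

    func = tvm.build(s, [A, B, C], target="llvm")
    dev = tvm.device("llvm", 0)
    a = tvm.nd.array(np.random.rand(M, K).astype("float32"), dev)
    b = tvm.nd.array(np.random.rand(K, N).astype("float32"), dev)
    c = tvm.nd.array(np.zeros((M, N), dtype="float32"), dev)
    evaluator = func.time_evaluator(func.entry_name, dev, number=10)
    print("Opt6: %f" % evaluator(a, b, c).mean)  # mean wall time in seconds

Run-to-run jitter on shared CI machines is why the deployed numbers above shift by a few percent between builds.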
diff --git a/docs/how_to/optimize_operators/sg_execution_times.html b/docs/how_to/optimize_operators/sg_execution_times.html
index 7b301fcaa6..2fa03d94fa 100644
--- a/docs/how_to/optimize_operators/sg_execution_times.html
+++ b/docs/how_to/optimize_operators/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-optimize-operators-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:34.691</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
+<p><strong>00:34.839</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 83%" />
@@ -349,15 +349,15 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="opt_gemm.html#sphx-glr-how-to-optimize-operators-opt-gemm-py"><span class="std std-ref">How to optimize GEMM on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_gemm.py</span></code>)</p></td>
-<td><p>00:31.987</p></td>
+<td><p>00:32.242</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="opt_conv_tensorcore.html#sphx-glr-how-to-optimize-operators-opt-conv-tensorcore-py"><span class="std std-ref">How to optimize convolution using TensorCores</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_tensorcore.py</span></code>)</p></td>
-<td><p>00:01.528</p></td>
+<td><p>00:01.513</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="opt_conv_cuda.html#sphx-glr-how-to-optimize-operators-opt-conv-cuda-py"><span class="std std-ref">How to optimize convolution on GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_cuda.py</span></code>)</p></td>
-<td><p>00:01.176</p></td>
+<td><p>00:01.085</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
index e986dc484d..b50fe393b9 100644
--- a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
+++ b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-tune-with-autoscheduler-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>09:26.723</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
+<p><strong>09:28.448</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 85%" />
@@ -349,27 +349,27 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_layer_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py"><span class="std std-ref">Auto-scheduling a Convolution Layer for GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_layer_cuda.py</span></code>)</p></td>
-<td><p>05:37.090</p></td>
+<td><p>05:46.341</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_network_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-x86-py"><span class="std std-ref">Auto-scheduling a Neural Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_x86.py</span></code>)</p></td>
-<td><p>01:37.813</p></td>
+<td><p>01:38.610</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py"><span class="std std-ref">Auto-scheduling a Neural Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_cuda.py</span></code>)</p></td>
-<td><p>01:05.122</p></td>
+<td><p>01:05.810</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_sparse_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-sparse-x86-py"><span class="std std-ref">Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_sparse_x86.py</span></code>)</p></td>
-<td><p>00:40.466</p></td>
+<td><p>00:31.182</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_network_arm.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-arm-py"><span class="std std-ref">Auto-scheduling a Neural Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_arm.py</span></code>)</p></td>
-<td><p>00:13.629</p></td>
+<td><p>00:13.841</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_network_mali.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-mali-py"><span class="std std-ref">Auto-scheduling a Neural Network for mali GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_mali.py</span></code>)</p></td>
-<td><p>00:12.603</p></td>
+<td><p>00:12.664</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
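(The tune_conv2d_layer_cuda diff that follows swaps one auto-generated CUDA schedule for another; note the changed blockIdx.x/threadIdx.x extents and shared-memory allocation sizes. Such schedules are not hand-written but found by auto-scheduler search. A condensed sketch of that workflow, mirroring the tutorial's workload, the 1x512x7x7 conv2d with a 3x3 kernel, and using a deliberately tiny trial count:)

    import tvm
    from tvm import te, topi, auto_scheduler

    @auto_scheduler.register_workload
    def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding):
        data = te.placeholder((N, CI, H, W), name="data")
        kernel = te.placeholder((CO, CI, KH, KW), name="kernel")
        bias = te.placeholder((1, CO, 1, 1), name="bias")
        conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype="float32")
        out = topi.nn.relu(conv + bias)
        return [data, kernel, bias, out]

    target = tvm.target.Target("cuda")
    task = auto_scheduler.SearchTask(
        func=conv2d_layer, args=(1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)), target=target
    )

    log_file = "conv2d.json"
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=10,  # toy value; real tuning uses hundreds of trials
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        verbose=2,
    )
    task.tune(tune_option)
    sch, args = task.apply_best(log_file)
    print(tvm.lower(sch, args, simple_mode=True))  # the TIR shown in the diff below

Because each deployment re-runs the search with random sampling, the best schedule found, and hence the printed TIR and its tiling/thread extents, differs from build to build, which is all the large hunk below reflects.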
diff --git a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
index 83d4212476..e6844cd787 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
@@ -507,12 +507,12 @@ class Module:
         bias_1 = T.match_buffer(bias, (1, 512, 1, 1))
         compute_1 = T.match_buffer(compute, (1, 512, 7, 7))
         blockIdx_x = T.env_thread(&quot;blockIdx.x&quot;)
-        T.launch_thread(blockIdx_x, 28)
+        T.launch_thread(blockIdx_x, 16)
         conv2d_nchw = T.allocate([14], &quot;float32&quot;, &quot;local&quot;)
-        pad_temp_shared = T.allocate([72], &quot;float32&quot;, &quot;shared&quot;)
-        kernel_shared = T.allocate([3072], &quot;float32&quot;, &quot;shared&quot;)
+        pad_temp_shared = T.allocate([2592], &quot;float32&quot;, &quot;shared&quot;)
+        kernel_shared = T.allocate([9216], &quot;float32&quot;, &quot;shared&quot;)
         threadIdx_x = T.env_thread(&quot;threadIdx.x&quot;)
-        T.launch_thread(threadIdx_x, 64)
+        T.launch_thread(threadIdx_x, 112)
         conv2d_nchw_1 = T.buffer_decl((14,), data=conv2d_nchw, scope=&quot;local&quot;, align=32)
         conv2d_nchw_1[0] = T.float32(0)
         conv2d_nchw_1[1] = T.float32(0)
@@ -528,460 +528,146 @@ class Module:
         conv2d_nchw_1[11] = T.float32(0)
         conv2d_nchw_1[12] = T.float32(0)
         conv2d_nchw_1[13] = T.float32(0)
-        for rc_outer_outer, ry_outer_outer in T.grid(64, 3):
-            cse_var_2: T.int32 = rc_outer_outer * 72
-            cse_var_1: T.int32 = ry_outer_outer * 3
+        for rc_outer_outer in range(16):
+            cse_var_1: T.int32 = rc_outer_outer * 1568
             threadIdx_x_1 = T.env_thread(&quot;threadIdx.x&quot;)
-            pad_temp_shared_1 = T.buffer_decl((72,), data=pad_temp_shared, scope=&quot;shared&quot;)
-            with T.launch_thread(threadIdx_x_1, 64):
-                data_2 = T.buffer_decl((25088,), data=data_1.data)
-                if T.likely(threadIdx_x_1 &lt; 18):
-                    pad_temp_shared_1[threadIdx_x_1 * 4] = T.if_then_else(1 &lt;= ry_outer_outer + blockIdx_x % 7 and ry_outer_outer + blockIdx_x % 7 &lt; 8 and 1 &lt;= threadIdx_x_1 * 4 % 9 and threadIdx_x_1 * 4 % 9 &lt; 8, data_2[rc_outer_outer * 392 + threadIdx_x_1 * 4 // 9 * 49 + ry_outer_outer * 7 + blockIdx_x % 7 * 7 + threadIdx_x_1 * 4 % 9 - 8], T.float32(0))
-                if T.likely(threadIdx_x_1 &lt; 18):
-                    pad_temp_shared_1[threadIdx_x_1 * 4 + 1] = T.if_then_else(1 &lt;= ry_outer_outer + blockIdx_x % 7 and ry_outer_outer + blockIdx_x % 7 &lt; 8 and 1 &lt;= (threadIdx_x_1 * 4 + 1) % 9 and (threadIdx_x_1 * 4 + 1) % 9 &lt; 8, data_2[rc_outer_outer * 392 + (threadIdx_x_1 * 4 + 1) // 9 * 49 + ry_outer_outer * 7 + blockIdx_x % 7 * 7 + (threadIdx_x_1 * 4 + 1) % 9 - 8], T.float32(0))
-                if T.likely(threadIdx_x_1 &lt; 18):
-                    pad_temp_shared_1[threadIdx_x_1 * 4 + 2] = T.if_then_else(1 &lt;= ry_outer_outer + blockIdx_x % 7 and ry_outer_outer + blockIdx_x % 7 &lt; 8 and 1 &lt;= (threadIdx_x_1 * 4 + 2) % 9 and (threadIdx_x_1 * 4 + 2) % 9 &lt; 8, data_2[rc_outer_outer * 392 + (threadIdx_x_1 * 4 + 2) // 9 * 49 + ry_outer_outer * 7 + blockIdx_x % 7 * 7 + (threadIdx_x_1 * 4 + 2) % 9 - 8], T.float32(0))
-                if T.likely(threadIdx_x_1 &lt; 18):
-                    pad_temp_shared_1[threadIdx_x_1 * 4 + 3] = T.if_then_else(1 &lt;= ry_outer_outer + blockIdx_x % 7 and ry_outer_outer + blockIdx_x % 7 &lt; 8 and 1 &lt;= (threadIdx_x_1 * 4 + 3) % 9 and (threadIdx_x_1 * 4 + 3) % 9 &lt; 8, data_2[rc_outer_outer * 392 + (threadIdx_x_1 * 4 + 3) // 9 * 49 + ry_outer_outer * 7 + blockIdx_x % 7 * 7 + (threadIdx_x_1 * 4 + 3) % 9 - 8], T.float32(0))
-            threadIdx_x_2 = T.env_thread(&quot;threadIdx.x&quot;)
-            kernel_shared_1 = T.buffer_decl((3072,), data=kernel_shared, scope=&quot;shared&quot;)
-            kernel_2 = T.buffer_decl((2359296,), data=kernel_1.data)
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 64] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 64) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 128] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 128) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 192] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 36864]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 256] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 256) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 320] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 320) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 384] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 73728]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 448] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 448) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 512] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 512) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 576] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 110592]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 640] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 640) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 704] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 704) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 768] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 147456]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 832] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 832) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 896] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 896) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 960] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 184320]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 1024] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1024) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 1088] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1088) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 1152] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 221184]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 1216] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1216) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 1280] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1280) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 1344] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 258048]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 1408] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1408) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 1472] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1472) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 1536] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 294912]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 1600] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1600) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 1664] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1664) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 1728] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 331776]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 1792] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1792) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 1856] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1856) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 1920] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 368640]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 1984] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1984) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 2048] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2048) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 2112] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 405504]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 2176] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2176) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 2240] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2240) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 2304] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 442368]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 2368] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2368) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 2432] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2432) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 2496] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 479232]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 2560] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2560) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 2624] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2624) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 2688] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 516096]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 2752] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2752) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 2816] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2816) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 2880] = kernel_2[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 552960]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 2944] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2944) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
-            with T.launch_thread(threadIdx_x_2, 64):
-                kernel_shared_1[threadIdx_x_2 + 3008] = kernel_2[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 3008) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[0] * kernel_shared_1[threadIdx_x * 48]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[9] * kernel_shared_1[threadIdx_x * 48 + 3]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[1] * kernel_shared_1[threadIdx_x * 48]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[10] * kernel_shared_1[threadIdx_x * 48 + 3]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 3]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 3]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 3]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 3]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 3]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[0] * kernel_shared_1[threadIdx_x * 48 + 24]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[9] * kernel_shared_1[threadIdx_x * 48 + 27]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[1] * kernel_shared_1[threadIdx_x * 48 + 24]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[10] * kernel_shared_1[threadIdx_x * 48 + 27]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 24]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 27]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 24]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 27]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 24]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 27]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 24]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 27]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 24]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 27]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[1] * kernel_shared_1[threadIdx_x * 48 + 1]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[10] * kernel_shared_1[threadIdx_x * 48 + 4]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 1]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 4]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 1]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 4]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 1]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 4]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 1]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 4]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 1]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 4]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[7] * kernel_shared_1[threadIdx_x * 48 + 1]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[16] * kernel_shared_1[threadIdx_x * 48 + 4]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[1] * kernel_shared_1[threadIdx_x * 48 + 25]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[10] * kernel_shared_1[threadIdx_x * 48 + 28]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 25]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 28]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 25]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 28]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 25]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 28]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 25]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 28]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 25]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 28]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[7] * kernel_shared_1[threadIdx_x * 48 + 25]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[16] * kernel_shared_1[threadIdx_x * 48 + 28]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 2]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 5]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 2]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 5]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 2]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 5]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 2]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 5]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 2]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 5]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[7] * kernel_shared_1[threadIdx_x * 48 + 2]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[16] * kernel_shared_1[threadIdx_x * 48 + 5]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[8] * kernel_shared_1[threadIdx_x * 48 + 2]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[17] * kernel_shared_1[threadIdx_x * 48 + 5]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 26]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 29]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 26]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 29]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 26]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 29]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 26]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 29]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 26]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 29]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[7] * kernel_shared_1[threadIdx_x * 48 + 26]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[16] * kernel_shared_1[threadIdx_x * 48 + 29]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[8] * kernel_shared_1[threadIdx_x * 48 + 26]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[17] * kernel_shared_1[threadIdx_x * 48 + 29]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[18] * kernel_shared_1[threadIdx_x * 48 + 6]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[27] * kernel_shared_1[threadIdx_x * 48 + 9]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[19] * kernel_shared_1[threadIdx_x * 48 + 6]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[28] * kernel_shared_1[threadIdx_x * 48 + 9]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 6]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 9]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 6]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 9]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 6]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 9]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 6]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 9]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 6]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 9]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[18] * kernel_shared_1[threadIdx_x * 48 + 30]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[27] * kernel_shared_1[threadIdx_x * 48 + 33]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[19] * kernel_shared_1[threadIdx_x * 48 + 30]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[28] * kernel_shared_1[threadIdx_x * 48 + 33]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 30]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 33]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 30]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 33]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 30]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 33]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 30]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 33]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 30]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 33]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[19] * kernel_shared_1[threadIdx_x * 48 + 7]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[28] * kernel_shared_1[threadIdx_x * 48 + 10]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 7]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 10]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 7]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 10]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 7]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 10]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 7]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 10]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 7]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 10]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[25] * kernel_shared_1[threadIdx_x * 48 + 7]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[34] * kernel_shared_1[threadIdx_x * 48 + 10]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[19] * kernel_shared_1[threadIdx_x * 48 + 31]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[28] * kernel_shared_1[threadIdx_x * 48 + 34]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 31]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 34]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 31]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 34]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 31]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 34]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 31]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 34]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 31]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 34]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[25] * kernel_shared_1[threadIdx_x * 48 + 31]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[34] * kernel_shared_1[threadIdx_x * 48 + 34]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 8]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 11]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 8]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 11]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 8]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 11]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 8]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 11]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 8]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 11]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[25] * kernel_shared_1[threadIdx_x * 48 + 8]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[34] * kernel_shared_1[threadIdx_x * 48 + 11]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[26] * kernel_shared_1[threadIdx_x * 48 + 8]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[35] * kernel_shared_1[threadIdx_x * 48 + 11]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 32]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 35]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 32]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 35]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 32]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 35]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 32]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 35]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 32]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 35]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[25] * kernel_shared_1[threadIdx_x * 48 + 32]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[34] * kernel_shared_1[threadIdx_x * 48 + 35]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[26] * kernel_shared_1[threadIdx_x * 48 + 32]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[35] * kernel_shared_1[threadIdx_x * 48 + 35]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[36] * kernel_shared_1[threadIdx_x * 48 + 12]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[45] * kernel_shared_1[threadIdx_x * 48 + 15]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[37] * kernel_shared_1[threadIdx_x * 48 + 12]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[46] * kernel_shared_1[threadIdx_x * 48 + 15]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 12]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 15]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 12]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 15]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 12]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 15]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 12]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 15]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 12]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 15]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[36] * kernel_shared_1[threadIdx_x * 48 + 36]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[45] * kernel_shared_1[threadIdx_x * 48 + 39]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[37] * kernel_shared_1[threadIdx_x * 48 + 36]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[46] * kernel_shared_1[threadIdx_x * 48 + 39]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 36]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 39]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 36]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 39]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 36]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 39]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 36]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 39]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 36]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 39]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[37] * kernel_shared_1[threadIdx_x * 48 + 13]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[46] * kernel_shared_1[threadIdx_x * 48 + 16]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 13]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 16]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 13]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 16]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 13]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 16]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 13]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 16]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 13]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 16]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[43] * kernel_shared_1[threadIdx_x * 48 + 13]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[52] * kernel_shared_1[threadIdx_x * 48 + 16]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[37] * kernel_shared_1[threadIdx_x * 48 + 37]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[46] * kernel_shared_1[threadIdx_x * 48 + 40]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 37]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 40]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 37]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 40]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 37]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 40]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 37]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 40]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 37]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 40]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[43] * kernel_shared_1[threadIdx_x * 48 + 37]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[52] * kernel_shared_1[threadIdx_x * 48 + 40]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 14]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 17]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 14]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 17]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 14]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 17]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 14]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 17]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 14]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 17]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[43] * kernel_shared_1[threadIdx_x * 48 + 14]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[52] * kernel_shared_1[threadIdx_x * 48 + 17]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[44] * kernel_shared_1[threadIdx_x * 48 + 14]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[53] * kernel_shared_1[threadIdx_x * 48 + 17]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 38]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 41]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 38]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 41]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 38]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 41]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 38]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 41]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 38]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 41]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[43] * kernel_shared_1[threadIdx_x * 48 + 38]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[52] * kernel_shared_1[threadIdx_x * 48 + 41]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[44] * kernel_shared_1[threadIdx_x * 48 + 38]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[53] * kernel_shared_1[threadIdx_x * 48 + 41]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[54] * kernel_shared_1[threadIdx_x * 48 + 18]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[63] * kernel_shared_1[threadIdx_x * 48 + 21]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[55] * kernel_shared_1[threadIdx_x * 48 + 18]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[64] * kernel_shared_1[threadIdx_x * 48 + 21]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 18]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 21]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 18]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 21]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 18]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 21]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 18]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 21]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 18]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 21]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[54] * kernel_shared_1[threadIdx_x * 48 + 42]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[63] * kernel_shared_1[threadIdx_x * 48 + 45]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[55] * kernel_shared_1[threadIdx_x * 48 + 42]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[64] * kernel_shared_1[threadIdx_x * 48 + 45]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 42]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 45]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 42]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 45]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 42]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 45]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 42]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 45]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 42]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 45]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[55] * kernel_shared_1[threadIdx_x * 48 + 19]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[64] * kernel_shared_1[threadIdx_x * 48 + 22]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 19]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 22]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 19]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 22]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 19]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 22]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 19]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 22]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 19]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 22]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[61] * kernel_shared_1[threadIdx_x * 48 + 19]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[70] * kernel_shared_1[threadIdx_x * 48 + 22]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[55] * kernel_shared_1[threadIdx_x * 48 + 43]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[64] * kernel_shared_1[threadIdx_x * 48 + 46]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 43]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 46]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 43]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 46]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 43]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 46]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 43]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 46]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 43]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 46]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[61] * kernel_shared_1[threadIdx_x * 48 + 43]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[70] * kernel_shared_1[threadIdx_x * 48 + 46]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 20]
-            conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 23]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 20]
-            conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 23]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 20]
-            conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 23]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 20]
-            conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 23]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 20]
-            conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 23]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[61] * kernel_shared_1[threadIdx_x * 48 + 20]
-            conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[70] * kernel_shared_1[threadIdx_x * 48 + 23]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[62] * kernel_shared_1[threadIdx_x * 48 + 20]
-            conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[71] * kernel_shared_1[threadIdx_x * 48 + 23]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 44]
-            conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 47]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 44]
-            conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 47]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 44]
-            conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 47]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 44]
-            conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 47]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 44]
-            conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 47]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[61] * kernel_shared_1[threadIdx_x * 48 + 44]
-            conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[70] * kernel_shared_1[threadIdx_x * 48 + 47]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[62] * kernel_shared_1[threadIdx_x * 48 + 44]
-            conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[71] * kernel_shared_1[threadIdx_x * 48 + 47]
-        for i1_inner, i3_inner in T.grid(2, 7):
+            pad_temp_shared_1 = T.buffer_decl((2592,), data=pad_temp_shared, scope=&quot;shared&quot;)
+            data_2 = T.buffer_decl((25088,), data=data_1.data)
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1] = T.if_then_else(9 &lt;= threadIdx_x_1 % 81 and threadIdx_x_1 % 81 &lt; 72 and 1 &lt;= threadIdx_x_1 % 9 and threadIdx_x_1 % 9 &lt; 8, data_2[cse_var_1 + threadIdx_x_1 // 81 * 49 + threadIdx_x_1 % 81 // 9 * 7 + threadIdx_x_1 % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 112] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 31) % 81 and (threadIdx_x_1 + 31) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 4) % 9 and (threadIdx_x_1 + 4) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 112) // 81 * 49 + (threadIdx_x_1 + 31) % 81 // 9 * 7 + (threadIdx_x_1 + 4) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 224] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 62) % 81 and (threadIdx_x_1 + 62) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 8) % 9 and (threadIdx_x_1 + 8) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 224) // 81 * 49 + (threadIdx_x_1 + 62) % 81 // 9 * 7 + (threadIdx_x_1 + 8) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 336] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 12) % 81 and (threadIdx_x_1 + 12) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 3) % 9 and (threadIdx_x_1 + 3) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 336) // 81 * 49 + (threadIdx_x_1 + 12) % 81 // 9 * 7 + (threadIdx_x_1 + 3) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 448] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 43) % 81 and (threadIdx_x_1 + 43) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 7) % 9 and (threadIdx_x_1 + 7) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 448) // 81 * 49 + (threadIdx_x_1 + 43) % 81 // 9 * 7 + (threadIdx_x_1 + 7) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 560] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 74) % 81 and (threadIdx_x_1 + 74) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 2) % 9 and (threadIdx_x_1 + 2) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 560) // 81 * 49 + (threadIdx_x_1 + 74) % 81 // 9 * 7 + (threadIdx_x_1 + 2) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 672] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 24) % 81 and (threadIdx_x_1 + 24) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 6) % 9 and (threadIdx_x_1 + 6) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 672) // 81 * 49 + (threadIdx_x_1 + 24) % 81 // 9 * 7 + (threadIdx_x_1 + 6) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 784] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 55) % 81 and (threadIdx_x_1 + 55) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 1) % 9 and (threadIdx_x_1 + 1) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 784) // 81 * 49 + (threadIdx_x_1 + 55) % 81 // 9 * 7 + (threadIdx_x_1 + 1) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 896] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 5) % 81 and (threadIdx_x_1 + 5) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 5) % 9 and (threadIdx_x_1 + 5) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 896) // 81 * 49 + (threadIdx_x_1 + 5) % 81 // 9 * 7 + (threadIdx_x_1 + 5) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 1008] = T.if_then_else(1 &lt;= (threadIdx_x_1 // 9 + 4) % 9 and (threadIdx_x_1 + 36) % 81 &lt; 72 and 1 &lt;= threadIdx_x_1 % 9 and threadIdx_x_1 % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 1008) // 81 * 49 + (threadIdx_x_1 // 9 + 4) % 9 * 7 + threadIdx_x_1 % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 1120] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 67) % 81 and (threadIdx_x_1 + 67) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 4) % 9 and (threadIdx_x_1 + 4) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 1120) // 81 * 49 + (threadIdx_x_1 + 67) % 81 // 9 * 7 + (threadIdx_x_1 + 4) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 1232] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 17) % 81 and (threadIdx_x_1 + 17) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 8) % 9 and (threadIdx_x_1 + 8) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 1232) // 81 * 49 + (threadIdx_x_1 + 17) % 81 // 9 * 7 + (threadIdx_x_1 + 8) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 1344] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 48) % 81 and (threadIdx_x_1 + 48) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 3) % 9 and (threadIdx_x_1 + 3) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 1344) // 81 * 49 + (threadIdx_x_1 + 48) % 81 // 9 * 7 + (threadIdx_x_1 + 3) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 1456] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 79) % 81 and (threadIdx_x_1 + 79) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 7) % 9 and (threadIdx_x_1 + 7) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 1456) // 81 * 49 + (threadIdx_x_1 + 79) % 81 // 9 * 7 + (threadIdx_x_1 + 7) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 1568] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 29) % 81 and (threadIdx_x_1 + 29) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 2) % 9 and (threadIdx_x_1 + 2) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 1568) // 81 * 49 + (threadIdx_x_1 + 29) % 81 // 9 * 7 + (threadIdx_x_1 + 2) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 1680] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 60) % 81 and (threadIdx_x_1 + 60) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 6) % 9 and (threadIdx_x_1 + 6) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 1680) // 81 * 49 + (threadIdx_x_1 + 60) % 81 // 9 * 7 + (threadIdx_x_1 + 6) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 1792] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 10) % 81 and (threadIdx_x_1 + 10) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 1) % 9 and (threadIdx_x_1 + 1) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 1792) // 81 * 49 + (threadIdx_x_1 + 10) % 81 // 9 * 7 + (threadIdx_x_1 + 1) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 1904] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 41) % 81 and (threadIdx_x_1 + 41) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 5) % 9 and (threadIdx_x_1 + 5) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 1904) // 81 * 49 + (threadIdx_x_1 + 41) % 81 // 9 * 7 + (threadIdx_x_1 + 5) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 2016] = T.if_then_else(1 &lt;= (threadIdx_x_1 // 9 + 8) % 9 and (threadIdx_x_1 + 72) % 81 &lt; 72 and 1 &lt;= threadIdx_x_1 % 9 and threadIdx_x_1 % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 2016) // 81 * 49 + (threadIdx_x_1 // 9 + 8) % 9 * 7 + threadIdx_x_1 % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 2128] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 22) % 81 and (threadIdx_x_1 + 22) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 4) % 9 and (threadIdx_x_1 + 4) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 2128) // 81 * 49 + (threadIdx_x_1 + 22) % 81 // 9 * 7 + (threadIdx_x_1 + 4) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 2240] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 53) % 81 and (threadIdx_x_1 + 53) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 8) % 9 and (threadIdx_x_1 + 8) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 2240) // 81 * 49 + (threadIdx_x_1 + 53) % 81 // 9 * 7 + (threadIdx_x_1 + 8) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 2352] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 3) % 81 and (threadIdx_x_1 + 3) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 3) % 9 and (threadIdx_x_1 + 3) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 2352) // 81 * 49 + (threadIdx_x_1 + 3) % 81 // 9 * 7 + (threadIdx_x_1 + 3) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                pad_temp_shared_1[threadIdx_x_1 + 2464] = T.if_then_else(9 &lt;= (threadIdx_x_1 + 34) % 81 and (threadIdx_x_1 + 34) % 81 &lt; 72 and 1 &lt;= (threadIdx_x_1 + 7) % 9 and (threadIdx_x_1 + 7) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 2464) // 81 * 49 + (threadIdx_x_1 + 34) % 81 // 9 * 7 + (threadIdx_x_1 + 7) % 9 - 8], T.float32(0))
+            with T.launch_thread(threadIdx_x_1, 112):
+                if T.likely(threadIdx_x_1 &lt; 16):
+                    pad_temp_shared_1[threadIdx_x_1 + 2576] = T.if_then_else(threadIdx_x_1 &lt; 7 and 1 &lt;= (threadIdx_x_1 + 2) % 9 and (threadIdx_x_1 + 2) % 9 &lt; 8, data_2[cse_var_1 + (threadIdx_x_1 + 2576) // 81 * 49 + (threadIdx_x_1 + 65) % 81 // 9 * 7 + (threadIdx_x_1 + 2) - 8], T.float32(0))
+            kernel_shared_1 = T.buffer_decl((9216,), data=kernel_shared, scope=&quot;shared&quot;)
+            for ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer in range(4):
+                threadIdx_x_2 = T.env_thread(&quot;threadIdx.x&quot;)
+                T.launch_thread(threadIdx_x_2, 112)
+                kernel_2 = T.buffer_decl((2359296,), data=kernel_1.data)
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 32 + threadIdx_x_2 * 8) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 1] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 32 + threadIdx_x_2 * 8) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3 + 1]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 2] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 32 + threadIdx_x_2 * 8) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3 + 2]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 3] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 1) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 4] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 1) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3 + 1]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 5] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 1) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3 + 2]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 6] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 2) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 2) % 3 * 3]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 7] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 2) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 2) % 3 * 3 + 1]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 8] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 2) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 2) % 3 * 3 + 2]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 9] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + ((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8) // 3 + 1) % 32 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 10] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + ((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8) // 3 + 1) % 32 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3 + 1]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 11] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + ((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8) // 3 + 1) % 32 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3 + 2]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 12] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 4) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 13] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 4) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3 + 1]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 14] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 4) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3 + 2]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 15] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 5) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 2) % 3 * 3]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 16] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 5) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 2) % 3 * 3 + 1]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 17] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 5) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 2) % 3 * 3 + 2]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 18] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + ((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8) // 3 + 2) % 32 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 19] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + ((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8) // 3 + 2) % 32 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3 + 1]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 20] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + ((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8) // 3 + 2) % 32 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2 + threadIdx_x_2 * 2) % 3 * 3 + 2]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 21] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 7) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 22] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 7) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3 + 1]
+                if T.likely(ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7 + threadIdx_x_2 // 16 &lt; 24):
+                    kernel_shared_1[ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688 + threadIdx_x_2 * 24 + 23] = kernel_2[blockIdx_x * 147456 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28 + threadIdx_x_2 // 4) // 3 * 4608 + rc_outer_outer * 288 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896 + threadIdx_x_2 * 8 + 7) % 96 // 3 * 9 + (ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224 + threadIdx_x_2 * 2 + 1) % 3 * 3 + 2]
+            for rc_outer_inner, rx_outer_inner, ff_outer_inner, rc_inner in T.grid(2, 3, 2, 16):
+                cse_var_8: T.int32 = ff_outer_inner * 7
+                cse_var_7: T.int32 = cse_var_8 + 6
+                cse_var_6: T.int32 = cse_var_8 + 5
+                cse_var_5: T.int32 = cse_var_8 + 4
+                cse_var_4: T.int32 = cse_var_8 + 3
+                cse_var_3: T.int32 = cse_var_8 + 2
+                cse_var_2: T.int32 = cse_var_8 + 1
+                conv2d_nchw_1[cse_var_8] = conv2d_nchw_1[cse_var_8] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner]
+                conv2d_nchw_1[cse_var_2] = conv2d_nchw_1[cse_var_2] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 9] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner]
+                conv2d_nchw_1[cse_var_3] = conv2d_nchw_1[cse_var_3] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 18] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner]
+                conv2d_nchw_1[cse_var_4] = conv2d_nchw_1[cse_var_4] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 27] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner]
+                conv2d_nchw_1[cse_var_5] = conv2d_nchw_1[cse_var_5] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 36] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner]
+                conv2d_nchw_1[cse_var_6] = conv2d_nchw_1[cse_var_6] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 45] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner]
+                conv2d_nchw_1[cse_var_7] = conv2d_nchw_1[cse_var_7] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 54] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner]
+                conv2d_nchw_1[cse_var_8] = conv2d_nchw_1[cse_var_8] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 9] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 3]
+                conv2d_nchw_1[cse_var_2] = conv2d_nchw_1[cse_var_2] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 18] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 3]
+                conv2d_nchw_1[cse_var_3] = conv2d_nchw_1[cse_var_3] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 27] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 3]
+                conv2d_nchw_1[cse_var_4] = conv2d_nchw_1[cse_var_4] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 36] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 3]
+                conv2d_nchw_1[cse_var_5] = conv2d_nchw_1[cse_var_5] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 45] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 3]
+                conv2d_nchw_1[cse_var_6] = conv2d_nchw_1[cse_var_6] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 54] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 3]
+                conv2d_nchw_1[cse_var_7] = conv2d_nchw_1[cse_var_7] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 63] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 3]
+                conv2d_nchw_1[cse_var_8] = conv2d_nchw_1[cse_var_8] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 18] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 6]
+                conv2d_nchw_1[cse_var_2] = conv2d_nchw_1[cse_var_2] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 27] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 6]
+                conv2d_nchw_1[cse_var_3] = conv2d_nchw_1[cse_var_3] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 36] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 6]
+                conv2d_nchw_1[cse_var_4] = conv2d_nchw_1[cse_var_4] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 45] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 6]
+                conv2d_nchw_1[cse_var_5] = conv2d_nchw_1[cse_var_5] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 54] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 6]
+                conv2d_nchw_1[cse_var_6] = conv2d_nchw_1[cse_var_6] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 63] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 6]
+                conv2d_nchw_1[cse_var_7] = conv2d_nchw_1[cse_var_7] + pad_temp_shared_1[rc_outer_inner * 1296 + rc_inner * 81 + rx_outer_inner + threadIdx_x % 7 + 72] * kernel_shared_1[threadIdx_x // 7 * 576 + ff_outer_inner * 288 + rc_outer_inner * 144 + rc_inner * 9 + rx_outer_inner + 6]
+        for i1_inner, i2_inner in T.grid(2, 7):
             compute_2 = T.buffer_decl((25088,), data=compute_1.data)
             bias_2 = T.buffer_decl((512,), data=bias_1.data)
-            compute_2[blockIdx_x // 7 * 6272 + threadIdx_x * 98 + i1_inner * 49 + blockIdx_x % 7 * 7 + i3_inner] = T.max(conv2d_nchw_1[i1_inner * 7 + i3_inner] + bias_2[blockIdx_x // 7 * 128 + threadIdx_x * 2 + i1_inner], T.float32(0))
+            compute_2[blockIdx_x * 1568 + threadIdx_x // 7 * 98 + i1_inner * 49 + i2_inner * 7 + threadIdx_x % 7] = T.max(conv2d_nchw_1[i1_inner * 7 + i2_inner] + bias_2[blockIdx_x * 32 + threadIdx_x // 7 * 2 + i1_inner], T.float32(0))
 </pre></div>
 </div>
 </div>
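The listing that ends above is the lowered TVMScript for the best schedule the auto-scheduler found. For orientation, a minimal sketch of how such a lowered listing is produced — assuming only that TVM is installed; the toy compute below is illustrative, not the tutorial's conv2d workload:

import tvm
from tvm import te

A = te.placeholder((1024,), name="A")
B = te.compute((1024,), lambda i: A[i] + 1.0, name="B")
s = te.create_schedule(B.op)
# simple_mode=True prints a compact lowered module, like the listing above
print(tvm.lower(s, [A, B], simple_mode=True))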
@@ -1015,7 +701,7 @@ class Module:
 <span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.356 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.325 ms
 </pre></div>
 </div>
 </div>
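The timing just above comes from building the tuned schedule and running it through time_evaluator. A hedged sketch of that measurement, where `task`, `log_file`, and `target` are assumed to be the objects defined earlier in this tutorial and a CUDA device is available:

import numpy as np
import tvm

sch, args = task.apply_best(log_file)   # best schedule from the tuning log
func = tvm.build(sch, args, target)
dev = tvm.cuda()
# shapes match this workload: N=1, CI=CO=512, H=W=7, 3x3 kernel
data_tvm = tvm.nd.array(np.random.uniform(size=(1, 512, 7, 7)).astype("float32"), device=dev)
weight_tvm = tvm.nd.array(np.random.uniform(size=(512, 512, 3, 3)).astype("float32"), device=dev)
bias_tvm = tvm.nd.array(np.random.uniform(size=(1, 512, 1, 1)).astype("float32"), device=dev)
out_tvm = tvm.nd.empty((1, 512, 7, 7), device=dev)
evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
print("Execution time of this operator: %.3f ms"
      % (np.median(evaluator(data_tvm, weight_tvm, bias_tvm, out_tvm).results) * 1000))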
@@ -1046,19 +732,19 @@ conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_
 conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
 conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
 conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
-conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=64)
+conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=16)
 conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
-conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
+conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=7)
 conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
 conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
 conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
 conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
-conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
-conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
+conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
+conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
 conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
-conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
-conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=4)
-conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
+conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=16)
+conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=2)
+conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=3)
 conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
 conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
 conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=3)
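The hunk above only swaps split factors between the two tuning runs; the scheduling primitive itself is unchanged. A standalone toy example of the same factor-based split (names and shapes here are illustrative, not the tutorial's tensors):

import tvm
from tvm import te

A = te.placeholder((512,), name="A")
B = te.compute((512,), lambda i: A[i] * 2.0, name="B")
s = te.create_schedule(B.op)
# split i into an outer loop (length 32) and an inner loop (length 16);
# the tuned schedule above applies this primitive repeatedly to tile every axis
i_outer, i_inner = s[B].split(B.op.axis[0], factor=16)
print(tvm.lower(s, [A, B], simple_mode=True))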
@@ -1067,13 +753,13 @@ compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
 compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
 compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
 compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
-compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
+compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=16)
 compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
-compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
+compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=7)
 compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
 compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
-compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
+compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
+compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
 compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
 s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
 s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
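The reorder/compute_at pair above is the standard way to keep an intermediate stage (here the conv2d accumulator) local to a tile of its consumer. A toy sketch of the pattern, with illustrative shapes and names:

import tvm
from tvm import te

A = te.placeholder((64, 64), name="A")
B = te.compute((64, 64), lambda i, j: A[i, j] + 1.0, name="B")
C = te.compute((64, 64), lambda i, j: B[i, j] * 2.0, name="C")
s = te.create_schedule(C.op)
io, ii = s[C].split(C.op.axis[0], factor=8)
jo, ji = s[C].split(C.op.axis[1], factor=8)
s[C].reorder(io, jo, ii, ji)
s[B].compute_at(s[C], jo)   # compute B per (io, jo) tile of C
print(tvm.lower(s, [A, C], simple_mode=True))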
@@ -1091,16 +777,16 @@ s[compute].bind(compute_i0_o_o_i_i1_o_o_i_fused_i2_o_o_i_fused_i3_o_o_i_fused, t
 compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused = s[compute].fuse(compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i)
 s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread_axis(&quot;threadIdx.x&quot;))
 kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
-kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
+kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=24)
 s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=112)
 s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis(&quot;threadIdx.x&quot;))
 pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
 s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=112)
 s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis(&quot;threadIdx.x&quot;))
-s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, &quot;auto_unroll_max_step&quot;, 512)
+s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, &quot;auto_unroll_max_step&quot;, 64)
 s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, &quot;unroll_explicit&quot;, True)
 
 CUDA source code:
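The __launch_bounds__ change from 64 to 112 in the kernel below simply mirrors the extent of the fused axis bound to threadIdx.x in the schedule above. A toy sketch of the fuse/bind pattern that produces such a bound — shapes are illustrative, chosen so the fused extent is 112:

import tvm
from tvm import te

A = te.placeholder((16, 7), name="A")
B = te.compute((16, 7), lambda i, j: A[i, j] + 1.0, name="B")
s = te.create_schedule(B.op)
fused = s[B].fuse(B.op.axis[0], B.op.axis[1])   # extent 16 * 7 = 112
bo, bi = s[B].split(fused, factor=112)
s[B].bind(bo, te.thread_axis("blockIdx.x"))
s[B].bind(bi, te.thread_axis("threadIdx.x"))    # 112 threads per block
print(tvm.lower(s, [A, B], simple_mode=True))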
@@ -1118,10 +804,10 @@ CUDA source code:
   #define int64_t long long
   #define uint64_t unsigned long long
 #endif
-extern &quot;C&quot; __global__ void __launch_bounds__(64) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+extern &quot;C&quot; __global__ void __launch_bounds__(112) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
   float conv2d_nchw[14];
-  __shared__ float pad_temp_shared[72];
-  __shared__ float kernel_shared[3072];
+  __shared__ float pad_temp_shared[2592];
+  __shared__ float kernel_shared[9216];
   conv2d_nchw[0] = 0.000000e+00f;
   conv2d_nchw[1] = 0.000000e+00f;
   conv2d_nchw[2] = 0.000000e+00f;
@@ -1136,411 +822,142 @@ extern &quot;C&quot; __global__ void __launch_bounds__(64) default_function_kern
   conv2d_nchw[11] = 0.000000e+00f;
   conv2d_nchw[12] = 0.000000e+00f;
   conv2d_nchw[13] = 0.000000e+00f;
-  for (int rc_outer_outer = 0; rc_outer_outer &lt; 64; ++rc_outer_outer) {
-    for (int ry_outer_outer = 0; ry_outer_outer &lt; 3; ++ry_outer_outer) {
-      __syncthreads();
-      if (((int)threadIdx.x) &lt; 18) {
-        pad_temp_shared[(((int)threadIdx.x) * 4)] = (((((1 &lt;= (ry_outer_outer + (((int)blockIdx.x) % 7))) &amp;&amp; ((ry_outer_outer + (((int)blockIdx.x) % 7)) &lt; 8)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) * 4) % 9))) &amp;&amp; (((((int)threadIdx.x) * 4) % 9) &lt; 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) * 4) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) * 4) % 9)) - 8)] : 0.000000e+00f);
+  for (int rc_outer_outer = 0; rc_outer_outer &lt; 16; ++rc_outer_outer) {
+    __syncthreads();
+    pad_temp_shared[((int)threadIdx.x)] = (((((9 &lt;= (((int)threadIdx.x) % 81)) &amp;&amp; ((((int)threadIdx.x) % 81) &lt; 72)) &amp;&amp; (1 &lt;= (((int)threadIdx.x) % 9))) &amp;&amp; ((((int)threadIdx.x) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + ((((int)threadIdx.x) / 81) * 49)) + (((((int)threadIdx.x) % 81) / 9) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 112)] = (((((9 &lt;= ((((int)threadIdx.x) + 31) % 81)) &amp;&amp; (((((int)threadIdx.x) + 31) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 4) % 9))) &amp;&amp; (((((int)threadIdx.x) + 4) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 112) / 81) * 49)) + ((((((int)threadIdx.x) + 31) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 224)] = (((((9 &lt;= ((((int)threadIdx.x) + 62) % 81)) &amp;&amp; (((((int)threadIdx.x) + 62) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 8) % 9))) &amp;&amp; (((((int)threadIdx.x) + 8) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 224) / 81) * 49)) + ((((((int)threadIdx.x) + 62) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 336)] = (((((9 &lt;= ((((int)threadIdx.x) + 12) % 81)) &amp;&amp; (((((int)threadIdx.x) + 12) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 3) % 9))) &amp;&amp; (((((int)threadIdx.x) + 3) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 336) / 81) * 49)) + ((((((int)threadIdx.x) + 12) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 448)] = (((((9 &lt;= ((((int)threadIdx.x) + 43) % 81)) &amp;&amp; (((((int)threadIdx.x) + 43) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 7) % 9))) &amp;&amp; (((((int)threadIdx.x) + 7) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 448) / 81) * 49)) + ((((((int)threadIdx.x) + 43) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 560)] = (((((9 &lt;= ((((int)threadIdx.x) + 74) % 81)) &amp;&amp; (((((int)threadIdx.x) + 74) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 2) % 9))) &amp;&amp; (((((int)threadIdx.x) + 2) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 560) / 81) * 49)) + ((((((int)threadIdx.x) + 74) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 672)] = (((((9 &lt;= ((((int)threadIdx.x) + 24) % 81)) &amp;&amp; (((((int)threadIdx.x) + 24) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 6) % 9))) &amp;&amp; (((((int)threadIdx.x) + 6) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 672) / 81) * 49)) + ((((((int)threadIdx.x) + 24) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 784)] = (((((9 &lt;= ((((int)threadIdx.x) + 55) % 81)) &amp;&amp; (((((int)threadIdx.x) + 55) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 1) % 9))) &amp;&amp; (((((int)threadIdx.x) + 1) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 784) / 81) * 49)) + ((((((int)threadIdx.x) + 55) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 896)] = (((((9 &lt;= ((((int)threadIdx.x) + 5) % 81)) &amp;&amp; (((((int)threadIdx.x) + 5) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 5) % 9))) &amp;&amp; (((((int)threadIdx.x) + 5) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 896) / 81) * 49)) + ((((((int)threadIdx.x) + 5) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 1008)] = (((((1 &lt;= (((((int)threadIdx.x) / 9) + 4) % 9)) &amp;&amp; (((((int)threadIdx.x) + 36) % 81) &lt; 72)) &amp;&amp; (1 &lt;= (((int)threadIdx.x) % 9))) &amp;&amp; ((((int)threadIdx.x) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1008) / 81) * 49)) + ((((((int)threadIdx.x) / 9) + 4) % 9) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 1120)] = (((((9 &lt;= ((((int)threadIdx.x) + 67) % 81)) &amp;&amp; (((((int)threadIdx.x) + 67) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 4) % 9))) &amp;&amp; (((((int)threadIdx.x) + 4) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1120) / 81) * 49)) + ((((((int)threadIdx.x) + 67) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 1232)] = (((((9 &lt;= ((((int)threadIdx.x) + 17) % 81)) &amp;&amp; (((((int)threadIdx.x) + 17) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 8) % 9))) &amp;&amp; (((((int)threadIdx.x) + 8) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1232) / 81) * 49)) + ((((((int)threadIdx.x) + 17) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 1344)] = (((((9 &lt;= ((((int)threadIdx.x) + 48) % 81)) &amp;&amp; (((((int)threadIdx.x) + 48) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 3) % 9))) &amp;&amp; (((((int)threadIdx.x) + 3) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1344) / 81) * 49)) + ((((((int)threadIdx.x) + 48) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 1456)] = (((((9 &lt;= ((((int)threadIdx.x) + 79) % 81)) &amp;&amp; (((((int)threadIdx.x) + 79) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 7) % 9))) &amp;&amp; (((((int)threadIdx.x) + 7) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1456) / 81) * 49)) + ((((((int)threadIdx.x) + 79) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 1568)] = (((((9 &lt;= ((((int)threadIdx.x) + 29) % 81)) &amp;&amp; (((((int)threadIdx.x) + 29) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 2) % 9))) &amp;&amp; (((((int)threadIdx.x) + 2) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1568) / 81) * 49)) + ((((((int)threadIdx.x) + 29) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 1680)] = (((((9 &lt;= ((((int)threadIdx.x) + 60) % 81)) &amp;&amp; (((((int)threadIdx.x) + 60) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 6) % 9))) &amp;&amp; (((((int)threadIdx.x) + 6) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1680) / 81) * 49)) + ((((((int)threadIdx.x) + 60) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 1792)] = (((((9 &lt;= ((((int)threadIdx.x) + 10) % 81)) &amp;&amp; (((((int)threadIdx.x) + 10) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 1) % 9))) &amp;&amp; (((((int)threadIdx.x) + 1) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1792) / 81) * 49)) + ((((((int)threadIdx.x) + 10) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 1904)] = (((((9 &lt;= ((((int)threadIdx.x) + 41) % 81)) &amp;&amp; (((((int)threadIdx.x) + 41) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 5) % 9))) &amp;&amp; (((((int)threadIdx.x) + 5) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1904) / 81) * 49)) + ((((((int)threadIdx.x) + 41) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 2016)] = (((((1 &lt;= (((((int)threadIdx.x) / 9) + 8) % 9)) &amp;&amp; (((((int)threadIdx.x) + 72) % 81) &lt; 72)) &amp;&amp; (1 &lt;= (((int)threadIdx.x) % 9))) &amp;&amp; ((((int)threadIdx.x) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 2016) / 81) * 49)) + ((((((int)threadIdx.x) / 9) + 8) % 9) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 2128)] = (((((9 &lt;= ((((int)threadIdx.x) + 22) % 81)) &amp;&amp; (((((int)threadIdx.x) + 22) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 4) % 9))) &amp;&amp; (((((int)threadIdx.x) + 4) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 2128) / 81) * 49)) + ((((((int)threadIdx.x) + 22) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 2240)] = (((((9 &lt;= ((((int)threadIdx.x) + 53) % 81)) &amp;&amp; (((((int)threadIdx.x) + 53) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 8) % 9))) &amp;&amp; (((((int)threadIdx.x) + 8) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 2240) / 81) * 49)) + ((((((int)threadIdx.x) + 53) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 2352)] = (((((9 &lt;= ((((int)threadIdx.x) + 3) % 81)) &amp;&amp; (((((int)threadIdx.x) + 3) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 3) % 9))) &amp;&amp; (((((int)threadIdx.x) + 3) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 2352) / 81) * 49)) + ((((((int)threadIdx.x) + 3) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8)] : 0.000000e+00f);
+    pad_temp_shared[(((int)threadIdx.x) + 2464)] = (((((9 &lt;= ((((int)threadIdx.x) + 34) % 81)) &amp;&amp; (((((int)threadIdx.x) + 34) % 81) &lt; 72)) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 7) % 9))) &amp;&amp; (((((int)threadIdx.x) + 7) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 2464) / 81) * 49)) + ((((((int)threadIdx.x) + 34) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
+    if (((int)threadIdx.x) &lt; 16) {
+      pad_temp_shared[(((int)threadIdx.x) + 2576)] = ((((((int)threadIdx.x) &lt; 7) &amp;&amp; (1 &lt;= ((((int)threadIdx.x) + 2) % 9))) &amp;&amp; (((((int)threadIdx.x) + 2) % 9) &lt; 8)) ? data[(((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 2576) / 81) * 49)) + (((((int)threadIdx.x) + 65) / 9) * 7)) + ((int)threadIdx.x)) - 6)] : 0.000000e+00f);
+    }
+    for (int ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer = 0; ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer < 4; ++ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer) {
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24))] = kernel[(((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 32) + (((int)threadIdx.x) * 8)) % 96) / 3) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) % 3) * 3))];
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 1)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 32) + (((int)threadIdx.x) * 8)) % 96) / 3) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) % 3) * 3)) + 1)];
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 2)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 32) + (((int)threadIdx.x) * 8)) % 96) / 3) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) % 3) * 3)) + 2)];
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 3)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 1) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 1) [...]
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 4)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 1) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 1 [...]
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 5)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 1) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 1 [...]
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 6)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 2) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 2) [...]
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 7)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 2) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 2 [...]
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 8)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 2) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 2 [...]
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 9)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) / 3) + 1) & 31) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) % 3 [...]
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 10)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) / 3) + 1) & 31) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) % [...]
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 11)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) / 3) + 1) & 31) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) % [...]
       }
-      if (((int)threadIdx.x) < 18) {
-        pad_temp_shared[((((int)threadIdx.x) * 4) + 1)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 1) % 9))) && ((((((int)threadIdx.x) * 4) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 1) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 1) % 9)) - 8)] : 0.000000e+00f);
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 12)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 4) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 1 [...]
       }
-      if (((int)threadIdx.x) < 18) {
-        pad_temp_shared[((((int)threadIdx.x) * 4) + 2)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 2) % 9))) && ((((((int)threadIdx.x) * 4) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 2) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 2) % 9)) - 8)] : 0.000000e+00f);
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 13)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 4) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) +  [...]
       }
-      if (((int)threadIdx.x) < 18) {
-        pad_temp_shared[((((int)threadIdx.x) * 4) + 3)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 3) % 9))) && ((((((int)threadIdx.x) * 4) + 3) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 3) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 3) % 9)) - 8)] : 0.000000e+00f);
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 14)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 4) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) +  [...]
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 15)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 5) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 2 [...]
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 16)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 5) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) +  [...]
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 17)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 5) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) +  [...]
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 18)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) / 3) + 2) & 31) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) %  [...]
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 19)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) / 3) + 2) & 31) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) % [...]
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 20)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) / 3) + 2) & 31) * 9)) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2) + (((int)threadIdx.x) * 2)) % [...]
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 21)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 7) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) + 1 [...]
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 22)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 7) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) +  [...]
+      }
+      if (((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 7) + (((int)threadIdx.x) >> 4)) < 24) {
+        kernel_shared[(((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 2688) + (((int)threadIdx.x) * 24)) + 23)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 28) + (((int)threadIdx.x) >> 2)) / 3) * 4608)) + (rc_outer_outer * 288)) + ((((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 896) + (((int)threadIdx.x) * 8)) + 7) % 96) / 3) * 9)) + (((((ax0_ax1_fused_ax2_fused_ax3_fused_outer_outer * 224) + (((int)threadIdx.x) * 2)) +  [...]
+      }
+    }
+    __syncthreads();
+    for (int rc_outer_inner = 0; rc_outer_inner < 2; ++rc_outer_inner) {
+      for (int rx_outer_inner = 0; rx_outer_inner < 3; ++rx_outer_inner) {
+        for (int ff_outer_inner = 0; ff_outer_inner < 2; ++ff_outer_inner) {
+          for (int rc_inner = 0; rc_inner < 16; ++rc_inner) {
+            conv2d_nchw[(ff_outer_inner * 7)] = (conv2d_nchw[(ff_outer_inner * 7)] + (pad_temp_shared[((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7))] * kernel_shared[((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 1)] = (conv2d_nchw[((ff_outer_inner * 7) + 1)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 9)] * kernel_shared[((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 2)] = (conv2d_nchw[((ff_outer_inner * 7) + 2)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 3)] = (conv2d_nchw[((ff_outer_inner * 7) + 3)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 4)] = (conv2d_nchw[((ff_outer_inner * 7) + 4)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 5)] = (conv2d_nchw[((ff_outer_inner * 7) + 5)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 6)] = (conv2d_nchw[((ff_outer_inner * 7) + 6)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner)]));
+            conv2d_nchw[(ff_outer_inner * 7)] = (conv2d_nchw[(ff_outer_inner * 7)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 9)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 3)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 1)] = (conv2d_nchw[((ff_outer_inner * 7) + 1)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 3)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 2)] = (conv2d_nchw[((ff_outer_inner * 7) + 2)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 3)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 3)] = (conv2d_nchw[((ff_outer_inner * 7) + 3)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 3)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 4)] = (conv2d_nchw[((ff_outer_inner * 7) + 4)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 3)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 5)] = (conv2d_nchw[((ff_outer_inner * 7) + 5)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 3)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 6)] = (conv2d_nchw[((ff_outer_inner * 7) + 6)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 3)]));
+            conv2d_nchw[(ff_outer_inner * 7)] = (conv2d_nchw[(ff_outer_inner * 7)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 18)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 6)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 1)] = (conv2d_nchw[((ff_outer_inner * 7) + 1)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 27)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 6)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 2)] = (conv2d_nchw[((ff_outer_inner * 7) + 2)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 36)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 6)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 3)] = (conv2d_nchw[((ff_outer_inner * 7) + 3)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 45)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 6)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 4)] = (conv2d_nchw[((ff_outer_inner * 7) + 4)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 54)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 6)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 5)] = (conv2d_nchw[((ff_outer_inner * 7) + 5)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 6)]));
+            conv2d_nchw[((ff_outer_inner * 7) + 6)] = (conv2d_nchw[((ff_outer_inner * 7) + 6)] + (pad_temp_shared[(((((rc_outer_inner * 1296) + (rc_inner * 81)) + rx_outer_inner) + (((int)threadIdx.x) % 7)) + 72)] * kernel_shared[(((((((((int)threadIdx.x) / 7) * 576) + (ff_outer_inner * 288)) + (rc_outer_inner * 144)) + (rc_inner * 9)) + rx_outer_inner) + 6)]));
+          }
+        }
       }
-      kernel_shared[((int)threadIdx.x)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 64)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 64) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 128)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 128) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 192)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 36864)];
-      kernel_shared[(((int)threadIdx.x) + 256)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 256) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 320)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 320) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 384)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 73728)];
-      kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 512)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 512) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 576)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 110592)];
-      kernel_shared[(((int)threadIdx.x) + 640)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 640) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 704)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 704) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 768)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 147456)];
-      kernel_shared[(((int)threadIdx.x) + 832)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 832) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 960)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 184320)];
-      kernel_shared[(((int)threadIdx.x) + 1024)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1024) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1088)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1088) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1152)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 221184)];
-      kernel_shared[(((int)threadIdx.x) + 1216)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1216) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1280)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1280) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 258048)];
-      kernel_shared[(((int)threadIdx.x) + 1408)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1408) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1472)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1472) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1536)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 294912)];
-      kernel_shared[(((int)threadIdx.x) + 1600)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1600) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1664)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1664) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1728)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 331776)];
-      kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1856)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1856) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 1920)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 368640)];
-      kernel_shared[(((int)threadIdx.x) + 1984)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1984) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2048)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2048) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2112)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 405504)];
-      kernel_shared[(((int)threadIdx.x) + 2176)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2176) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2304)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 442368)];
-      kernel_shared[(((int)threadIdx.x) + 2368)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2368) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2432)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2432) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2496)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 479232)];
-      kernel_shared[(((int)threadIdx.x) + 2560)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2560) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2624)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2624) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 516096)];
-      kernel_shared[(((int)threadIdx.x) + 2752)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2752) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2816)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2816) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 2880)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 552960)];
-      kernel_shared[(((int)threadIdx.x) + 2944)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2944) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-      kernel_shared[(((int)threadIdx.x) + 3008)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3008) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-      __syncthreads();
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[0] * kernel_shared[(((int)threadIdx.x) * 48)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[1] * kernel_shared[(((int)threadIdx.x) * 48)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[2] * kernel_shared[(((int)threadIdx.x) * 48)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[3] * kernel_shared[(((int)threadIdx.x) * 48)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[4] * kernel_shared[(((int)threadIdx.x) * 48)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[5] * kernel_shared[(((int)threadIdx.x) * 48)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[6] * kernel_shared[(((int)threadIdx.x) * 48)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[0] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-      conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-      conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-      conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-      conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-      conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-      conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
-      conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-      conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-      conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-      conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-      conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-      conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-      conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
-      conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
     }
   }
   for (int i1_inner = 0; i1_inner &lt; 2; ++i1_inner) {
-    for (int i3_inner = 0; i3_inner &lt; 7; ++i3_inner) {
-      compute[((((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 98)) + (i1_inner * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) * 2)) + i1_inner)]), 0.000000e+00f);
+    for (int i2_inner = 0; i2_inner &lt; 7; ++i2_inner) {
+      compute[(((((((int)blockIdx.x) * 1568) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + (i2_inner * 7)) + (((int)threadIdx.x) % 7))] = max((conv2d_nchw[((i1_inner * 7) + i2_inner)] + bias[(((((int)blockIdx.x) * 32) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
     }
   }
 }
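
For reference, the kernel above computes a padded 3x3 conv2d in NCHW layout with a fused bias add and ReLU in the write-back loop (the max(... + bias, 0) epilogue). A minimal NumPy reference sketch of the same computation, with shapes assumed from the tutorial's (1, 512, 7, 7) workload:

    import numpy as np
    from tvm.topi.testing import conv2d_nchw_python

    # Workload assumed from the tutorial: N=1, C_in=C_out=512, H=W=7,
    # 3x3 kernel, stride 1, padding 1.
    data_np = np.random.uniform(size=(1, 512, 7, 7)).astype("float32")
    weight_np = np.random.uniform(size=(512, 512, 3, 3)).astype("float32")
    bias_np = np.random.uniform(size=(1, 512, 1, 1)).astype("float32")

    conv_np = conv2d_nchw_python(data_np, weight_np, (1, 1), (1, 1))
    # The write-back loop above fuses the bias add and ReLU: max(conv + bias, 0).
    out_np = np.maximum(conv_np + bias_np, 0.0)
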
@@ -1578,7 +995,7 @@ In the example below we resume the previous state and run 5 more trials.</p>
 Get devices for measurement successfully!
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 5 minutes  37.090 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 5 minutes  46.341 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/e3e540f3b477c0c52d8eb73e674e8ffd/tune_conv2d_layer_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_conv2d_layer_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
index bf674155c8..0493b7ff77 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
@@ -916,7 +916,7 @@ so we can read the log file and load the best schedules.</p>
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-   7.8484       7.8462       7.8573       7.8418       0.0065
+   7.8969       7.8932       7.9050       7.8925       0.0057
 </pre></div>
 </div>
 </div>
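
The "load the best schedules" step mentioned in the hunk above is done by compiling the network under an ApplyHistoryBest context. A minimal sketch, assuming `mod`, `params`, `target`, and `log_file` from the tutorial:

    import tvm
    from tvm import auto_scheduler, relay

    # Compile with the best schedules recorded in the tuning log.
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
        ):
            lib = relay.build(mod, target=target, params=params)
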
@@ -938,7 +938,7 @@ to learn how to use the RPC Tracker and RPC Server.
 To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
 with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>.</p></li>
 </ol>
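
Concretely, swapping in the RPC runner only changes how TuningOptions is constructed. A minimal sketch, assuming a device registered with the tracker at 0.0.0.0:9190 under the hypothetical key "v100":

    from tvm import auto_scheduler

    runner = auto_scheduler.RPCRunner(
        key="v100",      # hypothetical device key registered with the RPC tracker
        host="0.0.0.0",  # tracker host
        port=9190,       # tracker port
        repeat=3,
        timeout=50,
    )
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,
        runner=runner,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
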
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  5.122 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  5.810 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-cuda-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/eafe360d52540634c9eea0fa89e804bd/tune_network_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
index 5f4aea8c3e..de993870ea 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
@@ -935,7 +935,7 @@ so we can read the log file and load the best schedules.</p>
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  747.1292     746.5459     748.3151     746.5267      0.8386
+  751.3712     751.1382     753.5222     749.4532      1.6693
 </pre></div>
 </div>
 </div>
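
The "Evaluate inference time cost" summary above comes from the graph executor's built-in benchmark helper. A minimal sketch, assuming `lib` was built under ApplyHistoryBest as in the tutorial and a (1, 3, 224, 224) input named "data":

    import numpy as np
    import tvm
    from tvm.contrib import graph_executor

    dev = tvm.cpu()
    module = graph_executor.GraphModule(lib["default"](dev))
    module.set_input("data", np.random.uniform(size=(1, 3, 224, 224)).astype("float32"))
    # Prints a mean/median/max/min/std summary like the one above.
    print(module.benchmark(dev, repeat=3, min_repeat_ms=500))
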
@@ -957,7 +957,7 @@ to learn how to use the RPC Tracker and RPC Server.
 To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
 with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>.</p></li>
 </ol>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  37.813 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  38.610 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-x86-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/e416b94ca1090b0897c0f6e0df95b911/tune_network_x86.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_x86.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
index 6c6882a2ce..752980959e 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
@@ -635,26 +635,74 @@ class Module:
         placeholder_8 = T.match_buffer(placeholder_3, (33,), &quot;int32&quot;)
         placeholder_9 = T.match_buffer(placeholder_4, (128, 512))
         compute_1 = T.match_buffer(compute, (128, 512))
-        for i0_outer_i1_outer_fused in T.parallel(32):
-            compute_2 = T.allocate([2048], &quot;float32&quot;, &quot;global&quot;)
-            compute_3 = T.buffer_decl((2048,), data=compute_2)
-            for i_outer_inner, nb_j_inner in T.grid(8, 2):
-                for i_inner_init, j_init in T.grid(8, 16):
-                    compute_3[i_outer_inner * 256 + i_inner_init * 32 + nb_j_inner * 16 + j_init] = T.float32(0)
-                for elem_idx, i_inner, j in T.grid(T.let(cse_var_1, i0_outer_i1_outer_fused % 16 * 2 + nb_j_inner, placeholder_10[cse_var_1 + 1] - placeholder_10[cse_var_1]), 8, 16):
-                    cse_var_1 = T.var(&quot;int32&quot;)
+        for i0_outer_i1_outer_fused in T.parallel(128):
+            compute_2 = T.allocate([512], &quot;float32&quot;, &quot;global&quot;)
+            compute_3 = T.buffer_decl((512,), data=compute_2)
+            for i_outer_inner, nb_j_inner in T.grid(2, 2):
+                for i_inner_init in range(8):
+                    cse_var_1: T.int32 = i_outer_inner * 256 + i_inner_init * 32 + nb_j_inner * 16
+                    compute_3[cse_var_1] = T.float32(0)
+                    compute_3[cse_var_1 + 1] = T.float32(0)
+                    compute_3[cse_var_1 + 2] = T.float32(0)
+                    compute_3[cse_var_1 + 3] = T.float32(0)
+                    compute_3[cse_var_1 + 4] = T.float32(0)
+                    compute_3[cse_var_1 + 5] = T.float32(0)
+                    compute_3[cse_var_1 + 6] = T.float32(0)
+                    compute_3[cse_var_1 + 7] = T.float32(0)
+                    compute_3[cse_var_1 + 8] = T.float32(0)
+                    compute_3[cse_var_1 + 9] = T.float32(0)
+                    compute_3[cse_var_1 + 10] = T.float32(0)
+                    compute_3[cse_var_1 + 11] = T.float32(0)
+                    compute_3[cse_var_1 + 12] = T.float32(0)
+                    compute_3[cse_var_1 + 13] = T.float32(0)
+                    compute_3[cse_var_1 + 14] = T.float32(0)
+                    compute_3[cse_var_1 + 15] = T.float32(0)
+                for elem_idx, i_inner in T.grid(T.let(cse_var_2, i0_outer_i1_outer_fused % 16 * 2 + nb_j_inner, placeholder_10[cse_var_2 + 1] - placeholder_10[cse_var_2]), 8):
+                    cse_var_2 = T.var(&quot;int32&quot;)
                     placeholder_10 = T.buffer_decl((33,), &quot;int32&quot;, data=placeholder_8.data)
-                    cse_var_3: T.int32 = i0_outer_i1_outer_fused % 16 * 2 + nb_j_inner
-                    cse_var_2: T.int32 = i_outer_inner * 256 + i_inner * 32 + nb_j_inner * 16 + j
+                    cse_var_21: T.int32 = elem_idx * 16
+                    cse_var_20: T.int32 = i0_outer_i1_outer_fused % 16 * 2 + nb_j_inner
+                    cse_var_19: T.int32 = i_outer_inner * 256 + i_inner * 32 + nb_j_inner * 16
+                    cse_var_18: T.int32 = i0_outer_i1_outer_fused // 16 * 4096 + i_outer_inner * 2048 + i_inner * 256
+                    cse_var_17: T.int32 = cse_var_19 + 9
+                    cse_var_16: T.int32 = cse_var_19 + 8
+                    cse_var_15: T.int32 = cse_var_19 + 7
+                    cse_var_14: T.int32 = cse_var_19 + 6
+                    cse_var_13: T.int32 = cse_var_19 + 5
+                    cse_var_12: T.int32 = cse_var_19 + 4
+                    cse_var_11: T.int32 = cse_var_19 + 3
+                    cse_var_10: T.int32 = cse_var_19 + 2
+                    cse_var_9: T.int32 = cse_var_19 + 15
+                    cse_var_8: T.int32 = cse_var_19 + 14
+                    cse_var_7: T.int32 = cse_var_19 + 13
+                    cse_var_6: T.int32 = cse_var_19 + 12
+                    cse_var_5: T.int32 = cse_var_19 + 11
+                    cse_var_4: T.int32 = cse_var_19 + 10
+                    cse_var_3: T.int32 = cse_var_19 + 1
                     placeholder_11 = T.buffer_decl((78656,), data=placeholder_6.data)
                     placeholder_12 = T.buffer_decl((32768,), data=placeholder_5.data)
                     placeholder_13 = T.buffer_decl((4916,), &quot;int32&quot;, data=placeholder_7.data)
-                    compute_3[cse_var_2] = compute_3[cse_var_2] + placeholder_11[placeholder_10[cse_var_3] * 16 + elem_idx * 16 + j] * T.max(placeholder_12[i0_outer_i1_outer_fused // 16 * 16384 + i_outer_inner * 2048 + i_inner * 256 + placeholder_13[placeholder_10[cse_var_3] + elem_idx]], T.float32(0))
-            for i0_inner in range(64):
-                cse_var_4: T.int32 = i0_outer_i1_outer_fused // 16 * 32768 + i0_inner * 512 + i0_outer_i1_outer_fused % 16 * 32
+                    compute_3[cse_var_19] = compute_3[cse_var_19] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                    compute_3[cse_var_3] = compute_3[cse_var_3] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 1] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                    compute_3[cse_var_10] = compute_3[cse_var_10] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 2] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                    compute_3[cse_var_11] = compute_3[cse_var_11] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 3] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                    compute_3[cse_var_12] = compute_3[cse_var_12] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 4] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                    compute_3[cse_var_13] = compute_3[cse_var_13] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 5] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                    compute_3[cse_var_14] = compute_3[cse_var_14] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 6] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                    compute_3[cse_var_15] = compute_3[cse_var_15] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 7] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                    compute_3[cse_var_16] = compute_3[cse_var_16] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 8] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                    compute_3[cse_var_17] = compute_3[cse_var_17] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 9] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                    compute_3[cse_var_4] = compute_3[cse_var_4] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 10] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                    compute_3[cse_var_5] = compute_3[cse_var_5] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 11] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                    compute_3[cse_var_6] = compute_3[cse_var_6] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 12] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                    compute_3[cse_var_7] = compute_3[cse_var_7] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 13] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                    compute_3[cse_var_8] = compute_3[cse_var_8] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 14] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+                    compute_3[cse_var_9] = compute_3[cse_var_9] + placeholder_11[placeholder_10[cse_var_20] * 16 + cse_var_21 + 15] * T.max(placeholder_12[cse_var_18 + placeholder_13[placeholder_10[cse_var_20] + elem_idx]], T.float32(0))
+            for i0_inner in range(16):
+                cse_var_22: T.int32 = i0_outer_i1_outer_fused // 16 * 8192 + i0_inner * 512 + i0_outer_i1_outer_fused % 16 * 32
                 compute_4 = T.buffer_decl((65536,), data=compute_1.data)
                 placeholder_10 = T.buffer_decl((65536,), data=placeholder_9.data)
-                compute_4[cse_var_4:cse_var_4 + 32] = T.max(compute_3[i0_inner * 32:i0_inner * 32 + 32] + placeholder_10[cse_var_4:cse_var_4 + 32], T.Broadcast(T.float32(0), 32))
+                compute_4[cse_var_22:cse_var_22 + 32] = T.max(compute_3[i0_inner * 32:i0_inner * 32 + 32] + placeholder_10[cse_var_22:cse_var_22 + 32], T.Broadcast(T.float32(0), 32))
 </pre></div>
 </div>
 </div>
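
The unrolled TIR above implements the tutorial's block-sparse dense workload, relu(relu(X) · Wᵀ + B), with the weight in BSR form: the (33,) buffer is the block row pointer, the (4916,) buffer the block column indices, and the inner T.max calls apply ReLU to the input and to the final sum. A minimal dense NumPy/SciPy reference, with shapes inferred from the buffer declarations (M=128, N=512, K=256, 16x1 blocks) and a scipy-constructed random BSR matrix standing in for the tutorial's weight:

    import numpy as np
    import scipy.sparse as sp

    # Shapes assumed from the buffer declarations above.
    M, N, K = 128, 512, 256
    X = np.random.randn(M, K).astype("float32")
    B = np.random.randn(M, N).astype("float32")
    # Random weight converted to 16x1 block-sparse (BSR) form,
    # mirroring the W_indptr/W_indices/W_data buffers.
    W = sp.random(N, K, density=0.05, format="csr", dtype="float32").tobsr(blocksize=(16, 1))

    # relu(relu(X) @ W^T + B), matching the two T.max calls in the TIR.
    Y = np.maximum(np.maximum(X, 0.0) @ W.toarray().T + B, 0.0)
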
@@ -688,7 +736,7 @@ class Module:
 <span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.552 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.894 ms
 </pre></div>
 </div>
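
The operator time reported above is gathered with a time_evaluator over the built function. A minimal sketch, assuming `sch`, `args`, `target`, and the TVM input arrays (names hypothetical) from the tutorial:

    import numpy as np
    import tvm

    func = tvm.build(sch, args, target)
    dev = tvm.cpu()
    evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
    # Argument order assumed to match `args`; array names are illustrative.
    costs = evaluator(X_tvm, W_data_tvm, W_indices_tvm, W_indptr_tvm, B_tvm, Y_tvm).results
    print("Execution time of this operator: %.3f ms" % (np.median(costs) * 1000))
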
 <div class="admonition note">
diff --git a/docs/how_to/tune_with_autotvm/sg_execution_times.html b/docs/how_to/tune_with_autotvm/sg_execution_times.html
index 527ba6ecff..f6625da8e5 100644
--- a/docs/how_to/tune_with_autotvm/sg_execution_times.html
+++ b/docs/how_to/tune_with_autotvm/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-tune-with-autotvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:40.547</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
+<p><strong>00:24.192</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 84%" />
@@ -349,11 +349,11 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-conv2d-cuda-py"><span class="std std-ref">Tuning High Performance Convolution on NVIDIA GPUs</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_cuda.py</span></code>)</p></td>
-<td><p>00:40.515</p></td>
+<td><p>00:24.157</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_relay_x86.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-x86-py"><span class="std std-ref">Auto-tuning a Convolutional Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_x86.py</span></code>)</p></td>
-<td><p>00:00.019</p></td>
+<td><p>00:00.021</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_relay_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-cuda-py"><span class="std std-ref">Auto-tuning a Convolutional Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_cuda.py</span></code>)</p></td>
diff --git a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
index 5ec959be0d..cd0b1ed6eb 100644
--- a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
+++ b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
@@ -568,7 +568,9 @@ for this template</p>
 waiting for device...
 device available
 Get devices for measurement successfully!
-No: 1   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+No: 1   GFLOPS: 50.48/50.48     result: MeasureResult(costs=(0.004586107448275862,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.907130241394043, timestamp=1674175038.7889755)        [(&#39;tile_f&#39;, [-1, 2, 1, 8]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 8, 4]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9371037
+No: 2   GFLOPS: 93.65/93.65     result: MeasureResult(costs=(0.002471886048780488,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.077481985092163, timestamp=1674175039.5116773)        [(&#39;tile_f&#39;, [-1, 1, 16, 2]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 64, 2]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,5284705
+No: 3   GFLOPS: 0.00/93.65      result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 592, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 544, in _build_func_common
@@ -690,8 +692,9 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 875, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 32, 4, 2]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 1, 256]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2119117
-No: 2   GFLOPS: 0.00/0.00       result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 8, 1]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 64, 8]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,1666969
+No: 4   GFLOPS: 213.99/213.99   result: MeasureResult(costs=(0.0010818329677419354,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.1306028366088867, timestamp=1674175042.3393323)      [(&#39;tile_f&#39;, [-1, 4, 8, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 2, 4]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2782589
+No: 5   GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 592, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 544, in _build_func_common
@@ -813,10 +816,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 875, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 128, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 32, 4]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9380629
-No: 3   GFLOPS: 85.37/85.37     result: MeasureResult(costs=(0.00271172485,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.7610549926757812, timestamp=1674164482.4717371)      [(&#39;tile_f&#39;, [-1, 16, 8, 1]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 1, 4]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 1)],None,8200091
-No: 4   GFLOPS: 20.14/85.37     result: MeasureResult(costs=(0.011493578555555556,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.423952102661133, timestamp=1674164484.2182357)        [(&#39;tile_f&#39;, [-1, 2, 1, 8]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 4]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9754717
-No: 5   GFLOPS: 0.00/85.37      result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 64, 2, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 8, 64]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 1)],None,7526534
+No: 6   GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 592, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 544, in _build_func_common
@@ -938,8 +939,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 875, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 2, 32]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 2, 128]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,3857892
-No: 6   GFLOPS: 0.00/85.37      result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 8, 2, 32]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 16, 8]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9404753
+No: 7   GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 592, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 544, in _build_func_common
@@ -1061,10 +1062,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 875, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 8, 8]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 32, 16]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2654456
-No: 7   GFLOPS: 1.25/85.37      result: MeasureResult(costs=(0.18517686449999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=6.310837507247925, timestamp=1674164491.677897)  [(&#39;tile_f&#39;, [-1, 16, 2, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 1, 1]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,3488114
-No: 8   GFLOPS: 10.05/85.37     result: MeasureResult(costs=(0.02303889716666667,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.039496898651123, timestamp=1674164492.631526)  [(&#39;tile_f&#39;, [-1, 1, 4, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 4, 32]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 1)],None,7507959
-No: 9   GFLOPS: 0.00/85.37      result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 2, 64]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 2]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,1011105
+No: 8   GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 592, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 544, in _build_func_common
@@ -1186,10 +1185,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 875, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 16, 4, 8]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 16, 8]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,8822813
-No: 10  GFLOPS: 89.86/89.86     result: MeasureResult(costs=(0.0025763257049180328,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.712925434112549, timestamp=1674164495.5032163)       [(&#39;tile_f&#39;, [-1, 2, 4, 2]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 2, 2]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 1)],None,8366893
-No: 11  GFLOPS: 28.31/89.86     result: MeasureResult(costs=(0.008176561153846153,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.5758357048034668, timestamp=1674164496.28428) [(&#39;tile_f&#39;, [-1, 8, 2, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 8, 1]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,785731
-No: 12  GFLOPS: 0.00/89.86      result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 64, 1, 2]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 256, 2]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,5484881
+No: 9   GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 592, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 544, in _build_func_common
@@ -1311,8 +1308,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 875, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 2, 256]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 16, 16]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,1103518
-No: 13  GFLOPS: 0.00/89.86      result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 8, 32, 2]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 16, 2]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 1)],None,8375713
+No: 10  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 592, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 544, in _build_func_common
@@ -1434,9 +1431,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 875, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 16, 4, 8]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 4, 16]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,5936633
-No: 14  GFLOPS: 7.37/89.86      result: MeasureResult(costs=(0.03141268725,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4329447746276855, timestamp=1674164497.9928796)      [(&#39;tile_f&#39;, [-1, 2, 1, 2]), (&#39;tile_y&#39;, [-1, 1, 1, 7]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 8, 8]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,687996
-No: 15  GFLOPS: 0.00/89.86      result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 16, 2]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 1]), (&#39;tile_rc&#39;, [-1, 64, 4]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9187286
+No: 11  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 592, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 544, in _build_func_common
@@ -1558,9 +1554,8 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 875, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 1, 64]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 8, 16]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 1)],None,8650602
-No: 16  GFLOPS: 25.55/89.86     result: MeasureResult(costs=(0.0090591335,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4694478511810303, timestamp=1674164498.7831192)       [(&#39;tile_f&#39;, [-1, 4, 2, 8]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 2, 1]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 3]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,1360185
-No: 17  GFLOPS: 0.00/89.86      result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 8, 32, 1]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 8, 16]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2262743
+No: 12  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 592, in __call__
     func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 544, in _build_func_common
@@ -1682,10 +1677,991 @@ Traceback (most recent call last):
   File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
   File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 875, in verify_pass
     raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 32, 4, 4]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 4, 64]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2104640
-No: 18  GFLOPS: 154.49/154.49   result: MeasureResult(costs=(0.00149846352238806,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.5423030853271484, timestamp=1674164500.5038612)        [(&#39;tile_f&#39;, [-1, 2, 2, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 1, 2]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,3520989
-No: 19  GFLOPS: 193.05/193.05   result: MeasureResult(costs=(0.0011991573452380954,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.339512586593628, timestamp=1674164501.2104666)       [(&#39;tile_f&#39;, [-1, 2, 8, 4]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 2, 2]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,426922
-No: 20  GFLOPS: 274.72/274.72   result: MeasureResult(costs=(0.0008426869495798319,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3525335788726807, timestamp=1674164501.93028)        [(&#39;tile_f&#39;, [-1, 8, 4, 2]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 1, 2]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,1973035
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 1, 16, 2]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 32, 16]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 0)],None,1107565
+No: 13  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+    func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+    func = build(s, args, target_host=task.target_host, runtime=runtime)
+  File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+    input_mod = lower(inputs, args, name=name, binds=binds)
+  File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+    return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+  File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+tvm._ffi.base.TVMError: Traceback (most recent call last):
+  24: TVMFuncCall
+        at ../src/runtime/c_runtime_api.cc:477
+  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  22: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  21: operator()
+        at ../include/tvm/runtime/packed_func.h:1730
+  20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+        at ../include/tvm/runtime/packed_func.h:1670
+  19: run<>
+        at ../include/tvm/runtime/packed_func.h:1630
+  18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1645
+  13: operator()
+        at ../src/driver/driver_api.cc:395
+  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+        at ../src/driver/driver_api.cc:381
+  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+        at ../src/driver/driver_api.cc:276
+  10: tvm::transform::Pass::operator()(tvm::IRModule) const
+        at ../src/ir/transform.cc:258
+  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:451
+  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/tir/ir/transform.cc:100
+  5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+        at ../include/tvm/runtime/packed_func.h:1749
+  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+        at ../include/tvm/runtime/packed_func.h:1693
+  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+        at ../include/tvm/runtime/packed_func.h:1617
+  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  1: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  0: operator()
+        at ../src/runtime/c_runtime_api.cc:534
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+    raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+Traceback (most recent call last):
+  24: TVMFuncCall
+        at ../src/runtime/c_runtime_api.cc:477
+  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  22: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  21: operator()
+        at ../include/tvm/runtime/packed_func.h:1730
+  20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+        at ../include/tvm/runtime/packed_func.h:1670
+  19: run<>
+        at ../include/tvm/runtime/packed_func.h:1630
+  18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1645
+  13: operator()
+        at ../src/driver/driver_api.cc:395
+  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+        at ../src/driver/driver_api.cc:381
+  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+        at ../src/driver/driver_api.cc:276
+  10: tvm::transform::Pass::operator()(tvm::IRModule) const
+        at ../src/ir/transform.cc:258
+  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:451
+  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/tir/ir/transform.cc:100
+  5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+        at ../include/tvm/runtime/packed_func.h:1749
+  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+        at ../include/tvm/runtime/packed_func.h:1693
+  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+        at ../include/tvm/runtime/packed_func.h:1617
+  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  1: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  0: operator()
+        at ../src/runtime/c_runtime_api.cc:534
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+    raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 8, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 512, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,420837
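
The repeated InstantiationError tracebacks above are expected: before a candidate kernel is run, autotvm lowers it and verifies it against the target GPU's hardware limits (threads per block, shared and local memory, and so on), and candidates that exceed a limit are skipped and recorded with GFLOPS 0.00 rather than executed. Below is a self-contained sketch of that validity check; the toy 1-D workload and the 1024-threads-per-block limit are illustrative assumptions, not values taken from this run.

    import tvm
    from tvm import te

    # Build a trivial schedule whose thread block is deliberately oversized.
    n = 1024
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] * 2.0, name="B")

    s = te.create_schedule(B.op)
    bx, tx = s[B].split(B.op.axis[0], factor=2048)  # 2048 threads per block
    s[B].bind(bx, te.thread_axis("blockIdx.x"))
    s[B].bind(tx, te.thread_axis("threadIdx.x"))

    # Lower the schedule and check it against an assumed CUDA limit of
    # 1024 threads per block. verify_gpu_code returns False for kernels
    # that violate the given constraints.
    mod = tvm.lower(s, [A, B])
    ok = tvm.tir.analysis.verify_gpu_code(
        mod["main"], {"max_threads_per_block": 1024}
    )
    print(ok)  # False: autotvm would record this config as skipped

In the real measurement pipeline this check runs inside verify_pass (the frame visible at the bottom of each traceback above), so an invalid configuration fails fast during lowering instead of wasting device time.
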
+No: 14  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+    func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+    func = build(s, args, target_host=task.target_host, runtime=runtime)
+  File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+    input_mod = lower(inputs, args, name=name, binds=binds)
+  File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+    return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+  File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+tvm._ffi.base.TVMError: Traceback (most recent call last):
+  24: TVMFuncCall
+        at ../src/runtime/c_runtime_api.cc:477
+  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  22: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  21: operator()
+        at ../include/tvm/runtime/packed_func.h:1730
+  20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+        at ../include/tvm/runtime/packed_func.h:1670
+  19: run<>
+        at ../include/tvm/runtime/packed_func.h:1630
+  18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1645
+  13: operator()
+        at ../src/driver/driver_api.cc:395
+  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+        at ../src/driver/driver_api.cc:381
+  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+        at ../src/driver/driver_api.cc:276
+  10: tvm::transform::Pass::operator()(tvm::IRModule) const
+        at ../src/ir/transform.cc:258
+  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:451
+  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/tir/ir/transform.cc:100
+  5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+        at ../include/tvm/runtime/packed_func.h:1749
+  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+        at ../include/tvm/runtime/packed_func.h:1693
+  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+        at ../include/tvm/runtime/packed_func.h:1617
+  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  1: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  0: operator()
+        at ../src/runtime/c_runtime_api.cc:534
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+    raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+Traceback (most recent call last):
+  24: TVMFuncCall
+        at ../src/runtime/c_runtime_api.cc:477
+  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  22: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  21: operator()
+        at ../include/tvm/runtime/packed_func.h:1730
+  20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+        at ../include/tvm/runtime/packed_func.h:1670
+  19: run<>
+        at ../include/tvm/runtime/packed_func.h:1630
+  18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1645
+  13: operator()
+        at ../src/driver/driver_api.cc:395
+  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+        at ../src/driver/driver_api.cc:381
+  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+        at ../src/driver/driver_api.cc:276
+  10: tvm::transform::Pass::operator()(tvm::IRModule) const
+        at ../src/ir/transform.cc:258
+  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:451
+  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/tir/ir/transform.cc:100
+  5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+        at ../include/tvm/runtime/packed_func.h:1749
+  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+        at ../include/tvm/runtime/packed_func.h:1693
+  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+        at ../include/tvm/runtime/packed_func.h:1617
+  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  1: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  0: operator()
+        at ../src/runtime/c_runtime_api.cc:534
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+    raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 32, 8, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,5185872
+No: 15  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+    func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+    func = build(s, args, target_host=task.target_host, runtime=runtime)
+  File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+    input_mod = lower(inputs, args, name=name, binds=binds)
+  File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+    return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+  File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+tvm._ffi.base.TVMError: Traceback (most recent call last):
+  24: TVMFuncCall
+        at ../src/runtime/c_runtime_api.cc:477
+  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  22: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  21: operator()
+        at ../include/tvm/runtime/packed_func.h:1730
+  20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+        at ../include/tvm/runtime/packed_func.h:1670
+  19: run<>
+        at ../include/tvm/runtime/packed_func.h:1630
+  18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1645
+  13: operator()
+        at ../src/driver/driver_api.cc:395
+  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+        at ../src/driver/driver_api.cc:381
+  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+        at ../src/driver/driver_api.cc:276
+  10: tvm::transform::Pass::operator()(tvm::IRModule) const
+        at ../src/ir/transform.cc:258
+  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:451
+  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/tir/ir/transform.cc:100
+  5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+        at ../include/tvm/runtime/packed_func.h:1749
+  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+        at ../include/tvm/runtime/packed_func.h:1693
+  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+        at ../include/tvm/runtime/packed_func.h:1617
+  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  1: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  0: operator()
+        at ../src/runtime/c_runtime_api.cc:534
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+    raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+Traceback (most recent call last):
+  24: TVMFuncCall
+        at ../src/runtime/c_runtime_api.cc:477
+  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  22: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  21: operator()
+        at ../include/tvm/runtime/packed_func.h:1730
+  20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+        at ../include/tvm/runtime/packed_func.h:1670
+  19: run<>
+        at ../include/tvm/runtime/packed_func.h:1630
+  18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1645
+  13: operator()
+        at ../src/driver/driver_api.cc:395
+  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+        at ../src/driver/driver_api.cc:381
+  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+        at ../src/driver/driver_api.cc:276
+  10: tvm::transform::Pass::operator()(tvm::IRModule) const
+        at ../src/ir/transform.cc:258
+  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:451
+  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/tir/ir/transform.cc:100
+  5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+        at ../include/tvm/runtime/packed_func.h:1749
+  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+        at ../include/tvm/runtime/packed_func.h:1693
+  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+        at ../include/tvm/runtime/packed_func.h:1617
+  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  1: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  0: operator()
+        at ../src/runtime/c_runtime_api.cc:534
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+    raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3232479
+No: 16  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+    func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+    func = build(s, args, target_host=task.target_host, runtime=runtime)
+  File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+    input_mod = lower(inputs, args, name=name, binds=binds)
+  File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+    return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+  File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+tvm._ffi.base.TVMError: Traceback (most recent call last):
+  24: TVMFuncCall
+        at ../src/runtime/c_runtime_api.cc:477
+  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  22: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  21: operator()
+        at ../include/tvm/runtime/packed_func.h:1730
+  20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+        at ../include/tvm/runtime/packed_func.h:1670
+  19: run<>
+        at ../include/tvm/runtime/packed_func.h:1630
+  18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1645
+  13: operator()
+        at ../src/driver/driver_api.cc:395
+  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+        at ../src/driver/driver_api.cc:381
+  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+        at ../src/driver/driver_api.cc:276
+  10: tvm::transform::Pass::operator()(tvm::IRModule) const
+        at ../src/ir/transform.cc:258
+  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:451
+  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/tir/ir/transform.cc:100
+  5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+        at ../include/tvm/runtime/packed_func.h:1749
+  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+        at ../include/tvm/runtime/packed_func.h:1693
+  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+        at ../include/tvm/runtime/packed_func.h:1617
+  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  1: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  0: operator()
+        at ../src/runtime/c_runtime_api.cc:534
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+    raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+Traceback (most recent call last):
+  24: TVMFuncCall
+        at ../src/runtime/c_runtime_api.cc:477
+  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  22: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  21: operator()
+        at ../include/tvm/runtime/packed_func.h:1730
+  20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+        at ../include/tvm/runtime/packed_func.h:1670
+  19: run<>
+        at ../include/tvm/runtime/packed_func.h:1630
+  18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1645
+  13: operator()
+        at ../src/driver/driver_api.cc:395
+  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+        at ../src/driver/driver_api.cc:381
+  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+        at ../src/driver/driver_api.cc:276
+  10: tvm::transform::Pass::operator()(tvm::IRModule) const
+        at ../src/ir/transform.cc:258
+  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:451
+  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/tir/ir/transform.cc:100
+  5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+        at ../include/tvm/runtime/packed_func.h:1749
+  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+        at ../include/tvm/runtime/packed_func.h:1693
+  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+        at ../include/tvm/runtime/packed_func.h:1617
+  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  1: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  0: operator()
+        at ../src/runtime/c_runtime_api.cc:534
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+    raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 2, 32]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 64, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2606310
+No: 17  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+    func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+    func = build(s, args, target_host=task.target_host, runtime=runtime)
+  File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+    input_mod = lower(inputs, args, name=name, binds=binds)
+  File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+    return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+  File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+tvm._ffi.base.TVMError: Traceback (most recent call last):
+  24: TVMFuncCall
+        at ../src/runtime/c_runtime_api.cc:477
+  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  22: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  21: operator()
+        at ../include/tvm/runtime/packed_func.h:1730
+  20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+        at ../include/tvm/runtime/packed_func.h:1670
+  19: run<>
+        at ../include/tvm/runtime/packed_func.h:1630
+  18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1645
+  13: operator()
+        at ../src/driver/driver_api.cc:395
+  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+        at ../src/driver/driver_api.cc:381
+  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+        at ../src/driver/driver_api.cc:276
+  10: tvm::transform::Pass::operator()(tvm::IRModule) const
+        at ../src/ir/transform.cc:258
+  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:451
+  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/tir/ir/transform.cc:100
+  5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+        at ../include/tvm/runtime/packed_func.h:1749
+  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+        at ../include/tvm/runtime/packed_func.h:1693
+  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+        at ../include/tvm/runtime/packed_func.h:1617
+  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  1: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  0: operator()
+        at ../src/runtime/c_runtime_api.cc:534
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+    raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+Traceback (most recent call last):
+  24: TVMFuncCall
+        at ../src/runtime/c_runtime_api.cc:477
+  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  22: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  21: operator()
+        at ../include/tvm/runtime/packed_func.h:1730
+  20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+        at ../include/tvm/runtime/packed_func.h:1670
+  19: run<>
+        at ../include/tvm/runtime/packed_func.h:1630
+  18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1630
+  14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+        at ../include/tvm/runtime/packed_func.h:1645
+  13: operator()
+        at ../src/driver/driver_api.cc:395
+  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+        at ../src/driver/driver_api.cc:381
+  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+        at ../src/driver/driver_api.cc:276
+  10: tvm::transform::Pass::operator()(tvm::IRModule) const
+        at ../src/ir/transform.cc:258
+  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:451
+  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/ir/transform.cc:274
+  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+        at ../src/tir/ir/transform.cc:100
+  5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+        at ../include/tvm/runtime/packed_func.h:1749
+  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+        at ../include/tvm/runtime/packed_func.h:1693
+  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+        at ../include/tvm/runtime/packed_func.h:1617
+  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+        at ../include/tvm/runtime/packed_func.h:1217
+  1: Call
+        at ../include/tvm/runtime/packed_func.h:1213
+  0: operator()
+        at ../src/runtime/c_runtime_api.cc:534
+  File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+    raise InstantiationError("Skipped because of invalid gpu kernel")
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 16, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6485563
+No: 18  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
+  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 592, in __call__
+    func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 544, in _build_func_common
+    func = build(s, args, target_host=task.target_host, runtime=runtime)
+  File &quot;/workspace/python/tvm/driver/build_module.py&quot;, line 227, in build
+    input_mod = lower(inputs, args, name=name, binds=binds)
+  File &quot;/workspace/python/tvm/driver/build_module.py&quot;, line 134, in lower
+    return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+  File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+  File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 276, in tvm._ffi._cy3.core.FuncCall
+  File &quot;tvm/_ffi/_cython/./base.pxi&quot;, line 181, in tvm._ffi._cy3.core.CHECK_CALL
+tvm._ffi.base.TVMError: Traceback (most recent call last):
+  [identical 24-frame TVM C++ lowering stack trace elided]
+  File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
+  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 875, in verify_pass
+    raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 128, 1, 4]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 1, 1, 7]), (&#39;tile_rc&#39;, [-1, 4, 32]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 0), (&#39;unroll_explicit&#39;, 1)],None,5571827
+No: 19  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
+  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 592, in __call__
+    func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 544, in _build_func_common
+    func = build(s, args, target_host=task.target_host, runtime=runtime)
+  File &quot;/workspace/python/tvm/driver/build_module.py&quot;, line 227, in build
+    input_mod = lower(inputs, args, name=name, binds=binds)
+  File &quot;/workspace/python/tvm/driver/build_module.py&quot;, line 134, in lower
+    return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+  File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+  File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 276, in tvm._ffi._cy3.core.FuncCall
+  File &quot;tvm/_ffi/_cython/./base.pxi&quot;, line 181, in tvm._ffi._cy3.core.CHECK_CALL
+tvm._ffi.base.TVMError: Traceback (most recent call last):
+  [identical 24-frame TVM C++ lowering stack trace elided]
+  File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
+  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 875, in verify_pass
+    raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 2, 4, 8]), (&#39;tile_y&#39;, [-1, 7, 1, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 4, 32]), (&#39;tile_ry&#39;, [-1, 1, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 1)],None,9441890
+No: 20  GFLOPS: 0.00/213.99     result: Traceback (most recent call last):
+  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 592, in __call__
+    func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 544, in _build_func_common
+    func = build(s, args, target_host=task.target_host, runtime=runtime)
+  File &quot;/workspace/python/tvm/driver/build_module.py&quot;, line 227, in build
+    input_mod = lower(inputs, args, name=name, binds=binds)
+  File &quot;/workspace/python/tvm/driver/build_module.py&quot;, line 134, in lower
+    return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+  File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+  File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 276, in tvm._ffi._cy3.core.FuncCall
+  File &quot;tvm/_ffi/_cython/./base.pxi&quot;, line 181, in tvm._ffi._cy3.core.CHECK_CALL
+tvm._ffi.base.TVMError: Traceback (most recent call last):
+  [identical 24-frame TVM C++ lowering stack trace elided]
+  File &quot;tvm/_ffi/_cython/./packed_func.pxi&quot;, line 56, in tvm._ffi._cy3.core.tvm_callback
+  File &quot;/workspace/python/tvm/autotvm/measure/measure_methods.py&quot;, line 875, in verify_pass
+    raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [(&#39;tile_f&#39;, [-1, 4, 1, 4]), (&#39;tile_y&#39;, [-1, 1, 7, 1]), (&#39;tile_x&#39;, [-1, 7, 1, 1]), (&#39;tile_rc&#39;, [-1, 16, 4]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 1500), (&#39;unroll_explicit&#39;, 0)],None,4341582
 </pre></div>
 </div>
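 <p>Note that every failing trial above ends in the same two frames: before a
 candidate schedule is measured, AutoTVM lowers it and runs a verification pass,
 and any config whose kernel would exceed the GPU&#8217;s limits (threads per block,
 shared memory per block, and so on) is skipped with an
 <code>InstantiationError</code> instead of being run. A minimal sketch of such
 a filter (the device limits below are illustrative assumptions; AutoTVM
 queries the real ones from the target) could look like:</p>
 <div class="highlight"><pre>import tvm
 from tvm.autotvm.task.space import InstantiationError

 def make_gpu_verify_pass(**limits):
     # Reject lowered kernels that violate the given hardware limits,
     # mirroring the verify_pass frames in the tracebacks above.
     def verify(f, mod, ctx):
         if not tvm.tir.analysis.verify_gpu_code(f, limits):
             raise InstantiationError(&quot;Skipped because of invalid gpu kernel&quot;)
         return f
     return tvm.tir.transform.prim_func_pass(verify, opt_level=0)

 # Example limits for one GPU generation (assumed values):
 verify_pass = make_gpu_verify_pass(max_threads_per_block=1024,
                                    max_shared_memory_per_block=49152)
 </pre></div>
 <p>AutoTVM installs this kind of pass during lowering, which is why the
 failures above surface inside <code>lower_schedule</code> rather than on the
 device.</p>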
 <p>Finally, we can inspect the best config from the log file, check correctness,
@@ -1724,9 +2700,9 @@ and measure running time.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Finish loading 20 records
 
 Best config:
-[(&#39;tile_f&#39;, [-1, 8, 4, 2]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 1, 2]), (&#39;tile_ry&#39;, [-1, 3, 1]), (&#39;tile_rx&#39;, [-1, 1, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,1973035
+[(&#39;tile_f&#39;, [-1, 4, 8, 1]), (&#39;tile_y&#39;, [-1, 1, 1, 1]), (&#39;tile_x&#39;, [-1, 1, 7, 1]), (&#39;tile_rc&#39;, [-1, 2, 4]), (&#39;tile_ry&#39;, [-1, 1, 3]), (&#39;tile_rx&#39;, [-1, 3, 1]), (&#39;auto_unroll_max_step&#39;, 512), (&#39;unroll_explicit&#39;, 0)],None,2782589
 Finish loading 20 records
-Time cost of this operator: 0.001221
+Time cost of this operator: 0.001132
 </pre></div>
 </div>
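 <p>A sketch of what those last steps look like in code. This assumes, as in
 the rest of this tutorial, the template function
 <code>conv2d_no_batching</code> with its workload parameters, NumPy reference
 arrays <code>a_np</code>, <code>w_np</code>, <code>c_np</code> prepared
 earlier, and a record file named <code>conv2d.log</code>:</p>
 <div class="highlight"><pre>import numpy as np
 import tvm
 from tvm import autotvm

 # Build the operator with the best record from the tuning log applied.
 with autotvm.apply_history_best(&quot;conv2d.log&quot;):
     with tvm.target.Target(&quot;cuda&quot;):
         s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
         func = tvm.build(s, arg_bufs)

 # Check correctness against the reference result, then measure running time.
 dev = tvm.cuda(0)
 a_tvm = tvm.nd.array(a_np, device=dev)
 w_tvm = tvm.nd.array(w_np, device=dev)
 c_tvm = tvm.nd.empty(c_np.shape, device=dev)
 func(a_tvm, w_tvm, c_tvm)
 np.testing.assert_allclose(c_np, c_tvm.numpy(), rtol=1e-2)

 evaluator = func.time_evaluator(func.entry_name, dev, number=400)
 print(&quot;Time cost of this operator: %f&quot; % evaluator(a_tvm, w_tvm, c_tvm).mean)
 </pre></div>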
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autotvm-tune-conv2d-cuda-py">
diff --git a/docs/how_to/work_with_microtvm/index.html b/docs/how_to/work_with_microtvm/index.html
index 1a01718ee3..3fe0435c61 100644
--- a/docs/how_to/work_with_microtvm/index.html
+++ b/docs/how_to/work_with_microtvm/index.html
@@ -246,6 +246,7 @@
 <li class="toctree-l3"><a class="reference internal" href="micro_aot.html">microTVM Host-Driven AoT</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_autotune.html">Autotuning with microTVM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_ethosu.html">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</a></li>
+<li class="toctree-l3"><a class="reference internal" href="micro_mlperftiny.html">Creating Your MLPerfTiny Submission with microTVM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_pytorch.html">microTVM PyTorch Tutorial</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_reference_vm.html">microTVM Reference Virtual Machines</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_tflite.html">microTVM with TFLite Models</a></li>
@@ -380,6 +381,9 @@ demonstrate how to tune and deploy models with microTVM.</p>
 </div><div class="sphx-glr-thumbcontainer" tooltip="This section contains an example of how to use TVM to run a model on an Arm(R) Cortex(R)-M55 CP..."><img alt="Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN" src="../../_images/sphx_glr_micro_ethosu_thumb.png" />
 <p><a class="reference internal" href="micro_ethosu.html#sphx-glr-how-to-work-with-microtvm-micro-ethosu-py"><span class="std std-ref">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</span></a></p>
   <div class="sphx-glr-thumbnail-title">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</div>
+</div><div class="sphx-glr-thumbcontainer" tooltip="This tutorial showcases building an MLPerfTiny submission using microTVM. This tutorial sho..."><img alt="Creating Your MLPerfTiny Submission with microTVM" src="../../_images/sphx_glr_micro_mlperftiny_thumb.png" />
+<p><a class="reference internal" href="micro_mlperftiny.html#sphx-glr-how-to-work-with-microtvm-micro-mlperftiny-py"><span class="std std-ref">Creating Your MLPerfTiny Submission with microTVM</span></a></p>
+  <div class="sphx-glr-thumbnail-title">Creating Your MLPerfTiny Submission with microTVM</div>
 </div><div class="sphx-glr-thumbcontainer" tooltip="This tutorial showcases microTVM host-driven AoT compilation with a PyTorch model. This tut..."><img alt="microTVM PyTorch Tutorial" src="../../_images/sphx_glr_micro_pytorch_thumb.png" />
 <p><a class="reference internal" href="micro_pytorch.html#sphx-glr-how-to-work-with-microtvm-micro-pytorch-py"><span class="std std-ref">microTVM PyTorch Tutorial</span></a></p>
   <div class="sphx-glr-thumbnail-title">microTVM PyTorch Tutorial</div>
diff --git a/docs/how_to/work_with_microtvm/micro_aot.html b/docs/how_to/work_with_microtvm/micro_aot.html
index 7ba2497dd5..83d9d07fb9 100644
--- a/docs/how_to/work_with_microtvm/micro_aot.html
+++ b/docs/how_to/work_with_microtvm/micro_aot.html
@@ -256,6 +256,7 @@
 </li>
 <li class="toctree-l3"><a class="reference internal" href="micro_autotune.html">Autotuning with microTVM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_ethosu.html">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</a></li>
+<li class="toctree-l3"><a class="reference internal" href="micro_mlperftiny.html">Creating Your MLPerfTiny Submission with microTVM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_pytorch.html">microTVM PyTorch Tutorial</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_reference_vm.html">microTVM Reference Virtual Machines</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_tflite.html">microTVM with TFLite Models</a></li>
diff --git a/docs/how_to/work_with_microtvm/micro_autotune.html b/docs/how_to/work_with_microtvm/micro_autotune.html
index 04831f815a..c5bd15a4e9 100644
--- a/docs/how_to/work_with_microtvm/micro_autotune.html
+++ b/docs/how_to/work_with_microtvm/micro_autotune.html
@@ -251,6 +251,7 @@
 </ul>
 </li>
 <li class="toctree-l3"><a class="reference internal" href="micro_ethosu.html">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</a></li>
+<li class="toctree-l3"><a class="reference internal" href="micro_mlperftiny.html">Creating Your MLPerfTiny Submission with microTVM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_pytorch.html">microTVM PyTorch Tutorial</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_reference_vm.html">microTVM Reference Virtual Machines</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_tflite.html">microTVM with TFLite Models</a></li>
@@ -646,10 +647,10 @@ the tuned operator.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>########## Build without Autotuning ##########
 Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  Measurements(us)
 ---------                                     ---                                           --------  -------  -----              ------  -------  ----------------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  311.6     98.727   (1, 2, 10, 10, 3)  2       1        [311.6]
-tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.045     0.965    (1, 6, 10, 10)     1       1        [3.045]
-tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.973     0.308    (1, 1, 10, 10, 3)  1       1        [0.973]
-Total_time                                    -                                             315.618   -        -                  -       -        -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  310.8     98.686   (1, 2, 10, 10, 3)  2       1        [310.8]
+tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.186     1.012    (1, 6, 10, 10)     1       1        [3.186]
+tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.953     0.303    (1, 1, 10, 10, 3)  1       1        [0.953]
+Total_time                                    -                                             314.939   -        -                  -       -        -
 </pre></div>
 </div>
 </div>
@@ -701,10 +702,10 @@ Total_time                                    -
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>########## Build with Autotuning ##########
 Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  Measurements(us)
 ---------                                     ---                                           --------  -------  -----              ------  -------  ----------------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  134.8     97.951   (1, 6, 10, 10, 1)  2       1        [134.8]
-tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.852     1.346    (1, 6, 10, 10)     1       1        [1.852]
-tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.968     0.703    (1, 1, 10, 10, 3)  1       1        [0.968]
-Total_time                                    -                                             137.62    -        -                  -       -        -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  102.8     97.511   (1, 6, 10, 10, 1)  2       1        [102.8]
+tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.773     1.682    (1, 6, 10, 10)     1       1        [1.773]
+tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.851     0.808    (1, 3, 10, 10, 1)  1       1        [0.851]
+Total_time                                    -                                             105.424   -        -                  -       -        -
 </pre></div>
 </div>
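 <p>The two tables above come from TVM&#8217;s debug executor, which times each
 fused operator on the device individually. A minimal sketch of collecting such
 a breakdown over a microTVM session (assuming <code>project</code> and
 <code>lowered</code> from the build steps earlier in this tutorial):</p>
 <div class="highlight"><pre>import tvm
 import tvm.micro

 with tvm.micro.Session(project.transport()) as session:
     # The debug executor runs every fused operator separately and prints
     # the per-node Time(us) / Time(%) table shown above.
     debug_module = tvm.micro.create_local_debug_executor(
         lowered.get_graph_json(), session.get_system_lib(), session.device
     )
     debug_module.set_input(**lowered.get_params())
     debug_module.run()
 </pre></div>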
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-autotune-py">
diff --git a/docs/how_to/work_with_microtvm/micro_ethosu.html b/docs/how_to/work_with_microtvm/micro_ethosu.html
index f90d079a1b..5897c66c68 100644
--- a/docs/how_to/work_with_microtvm/micro_ethosu.html
+++ b/docs/how_to/work_with_microtvm/micro_ethosu.html
@@ -48,7 +48,7 @@
     <script type="text/javascript" src="../../_static/js/tlcpack_theme.js"></script>
     <link rel="index" title="Index" href="../../genindex.html" />
     <link rel="search" title="Search" href="../../search.html" />
-    <link rel="next" title="microTVM PyTorch Tutorial" href="micro_pytorch.html" />
+    <link rel="next" title="Creating Your MLPerfTiny Submission with microTVM" href="micro_mlperftiny.html" />
     <link rel="prev" title="Autotuning with microTVM" href="micro_autotune.html" /> 
 </head>
 
@@ -261,6 +261,7 @@
 <li class="toctree-l4"><a class="reference internal" href="#running-the-demo-application">Running the demo application</a></li>
 </ul>
 </li>
+<li class="toctree-l3"><a class="reference internal" href="micro_mlperftiny.html">Creating Your MLPerfTiny Submission with microTVM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_pytorch.html">microTVM PyTorch Tutorial</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_reference_vm.html">microTVM Reference Virtual Machines</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_tflite.html">microTVM with TFLite Models</a></li>
@@ -887,7 +888,7 @@ classified as ‘tabby’.</p>
 
     <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
       
-        <a href="micro_pytorch.html" class="btn btn-neutral float-right" title="microTVM PyTorch Tutorial" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
+        <a href="micro_mlperftiny.html" class="btn btn-neutral float-right" title="Creating Your MLPerfTiny Submission with microTVM" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
       
       
         <a href="micro_autotune.html" class="btn btn-neutral float-left" title="Autotuning with microTVM" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
diff --git a/docs/how_to/work_with_microtvm/micro_mlperftiny.html b/docs/how_to/work_with_microtvm/micro_mlperftiny.html
new file mode 100644
index 0000000000..fb3cded3c8
--- /dev/null
+++ b/docs/how_to/work_with_microtvm/micro_mlperftiny.html
@@ -0,0 +1,796 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>Creating Your MLPerfTiny Submission with microTVM &mdash; tvm 0.11.dev0 documentation</title>
+  <link rel="index" title="Index" href="../../genindex.html" />
+  <link rel="search" title="Search" href="../../search.html" />
+  <link rel="next" title="microTVM PyTorch Tutorial" href="micro_pytorch.html" />
+  <link rel="prev" title="Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN" href="micro_ethosu.html" />
+</head>
+
+<body class="wy-body-for-nav">
+
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side fixed">
+      <div class="wy-side-scroll">
+        <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
+<ul class="current">
+<li class="toctree-l3 current"><a class="current reference internal" href="#">Creating Your MLPerfTiny Submission with microTVM</a><ul>
+<li class="toctree-l4"><a class="reference internal" href="#install-microtvm-python-dependencies">Install microTVM Python dependencies</a></li>
+<li class="toctree-l4"><a class="reference internal" href="#install-zephyr">Install Zephyr</a></li>
+<li class="toctree-l4"><a class="reference internal" href="#install-cmsis-nn">Install CMSIS-NN</a></li>
+<li class="toctree-l4"><a class="reference internal" href="#import-python-dependencies">Import Python dependencies</a></li>
+<li class="toctree-l4"><a class="reference internal" href="#import-visual-wake-word-model">Import Visual Wake Word Model</a></li>
+<li class="toctree-l4"><a class="reference internal" href="#defining-target-runtime-and-executor">Defining Target, Runtime and Executor</a></li>
+<li class="toctree-l4"><a class="reference internal" href="#compile-the-model-and-export-model-library-format">Compile the model and export model library format</a></li>
+<li class="toctree-l4"><a class="reference internal" href="#generate-input-output-header-files">Generate input/output header files</a></li>
+<li class="toctree-l4"><a class="reference internal" href="#create-the-project-build-and-prepare-the-project-tar-file">Create the project, build and prepare the project tar file</a></li>
+<li class="toctree-l4"><a class="reference internal" href="#use-this-project-with-your-board">Use this project with your board</a></li>
+</ul>
+</li>
+<li class="toctree-l3"><a class="reference internal" href="micro_pytorch.html">microTVM PyTorch Tutorial</a></li>
+<li class="toctree-l3"><a class="reference internal" href="micro_reference_vm.html">microTVM Reference Virtual Machines</a></li>
+<li class="toctree-l3"><a class="reference internal" href="micro_tflite.html">microTVM with TFLite Models</a></li>
+<li class="toctree-l3"><a class="reference internal" href="micro_train.html">Training Vision Models for microTVM on Arduino</a></li>
+<li class="toctree-l3"><a class="reference internal" href="micro_tvmc.html">Executing a Tiny Model with TVMC Micro</a></li>
+</ul>
+</li>
+<li class="toctree-l2"><a class="reference internal" href="../extend_tvm/index.html">Extend TVM</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../profile/index.html">Profile Models</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../errors.html">Handle TVM Errors</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../faq.html">Frequently Asked Questions</a></li>
+</ul>
+</li>
+</ul>
+<p class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
+<ul>
+<li class="toctree-l1"><a class="reference internal" href="../../dev/tutorial/index.html">Developer Tutorial</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../dev/how_to/how_to.html">Developer How-To Guide</a></li>
+</ul>
+<p class="caption" role="heading"><span class="caption-text">Architecture  Guide</span></p>
+<ul>
+<li class="toctree-l1"><a class="reference internal" href="../../arch/index.html">Design and Architecture</a></li>
+</ul>
+<p class="caption" role="heading"><span class="caption-text">Topic Guides</span></p>
+<ul>
+<li class="toctree-l1"><a class="reference internal" href="../../topic/microtvm/index.html">microTVM: TVM on bare-metal</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../topic/vta/index.html">VTA: Versatile Tensor Accelerator</a></li>
+</ul>
+<p class="caption" role="heading"><span class="caption-text">Reference Guide</span></p>
+<ul>
+<li class="toctree-l1"><a class="reference internal" href="../../reference/langref/index.html">Language Reference</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../reference/api/python/index.html">Python API</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../reference/api/links.html">Other APIs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../reference/publications.html">Publications</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../genindex.html">Index</a></li>
+</ul>
+
+            
+          
+        </div>
+        
+      </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      
+      <nav class="wy-nav-top" aria-label="top navigation" data-toggle="wy-nav-top">
+        
+            <div class="togglemenu">
+
+            </div>
+            <div class="nav-content">
+              <!-- tvm -->
+              Table of Contents
+            </div>
+        
+      </nav>
+
+
+      <div class="wy-nav-content">
+        
+        <div class="rst-content">
+        
+
+          
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+<div role="navigation" aria-label="breadcrumbs navigation">
+
+  <ul class="wy-breadcrumbs">
+    
+      <li><a href="../../index.html">Docs</a> <span class="br-arrow">></span></li>
+        
+          <li><a href="../index.html">How To Guides</a> <span class="br-arrow">></span></li>
+        
+          <li><a href="index.html">Work With microTVM</a> <span class="br-arrow">></span></li>
+        
+      <li>Creating Your MLPerfTiny Submission with microTVM</li>
+    
+    
+      
+      
+        
+      
+    
+    
+      <li class="wy-breadcrumbs-aside">
+        
+            
+            
+              <a href="https://github.com/apache/tvm/edit/main/docs/how_to/work_with_microtvm/micro_mlperftiny.rst" class="fa fa-github"> Edit on GitHub</a>
+            
+          
+        
+      </li>
+    
+  </ul>
+
+  
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+           <div itemprop="articleBody">
+            
+  <div class="sphx-glr-download-link-note admonition note">
+<p class="admonition-title">Note</p>
+<p>This tutorial can be used interactively with Google Colab! You can also click
+<a class="reference internal" href="#sphx-glr-download-how-to-work-with-microtvm-micro-mlperftiny-py"><span class="std std-ref">here</span></a> to run the Jupyter notebook locally.</p>
+<a class="reference external image-reference" href="https://colab.research.google.com/github/apache/tvm-site/blob/asf-site/docs/_downloads/1fc1683d67bee4f26703504a58d42578/micro_mlperftiny.ipynb"><img alt="https://raw.githubusercontent.com/tlc-pack/web-data/main/images/utilities/colab_button.svg" class="align-center" src="https://raw.githubusercontent.com/tlc-pack/web-data/main/images/utilities/colab_button.svg" width="300px" /></a>
+</div>
+<div class="sphx-glr-example-title section" id="creating-your-mlperftiny-submission-with-microtvm">
+<span id="tutorial-micro-mlperftiny"></span><span id="sphx-glr-how-to-work-with-microtvm-micro-mlperftiny-py"></span><h1>Creating Your MLPerfTiny Submission with microTVM<a class="headerlink" href="#creating-your-mlperftiny-submission-with-microtvm" title="Permalink to this headline">¶</a></h1>
+<p><strong>Authors</strong>:
+<a class="reference external" href="https://github.com/mehrdadh">Mehrdad Hessar</a></p>
+<p>This tutorial showcases how to build an MLPerfTiny submission using microTVM. It
+walks through importing a TFLite model from the MLPerfTiny benchmark models,
+compiling it with TVM, and generating a Zephyr project that can be flashed to a
+Zephyr-supported board to benchmark the model with the EEMBC runner.</p>
+<div class="section" id="install-microtvm-python-dependencies">
+<h2>Install microTVM Python dependencies<a class="headerlink" href="#install-microtvm-python-dependencies" title="Permalink to this headline">¶</a></h2>
+<p>TVM does not include a package for Python serial communication, so
+we must install one before using microTVM. We will also need TFLite
+to load models.</p>
+<blockquote>
+<div><div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>%%shell
+pip install <span class="nv">pyserial</span><span class="o">==</span><span class="m">3</span>.5 <span class="nv">tflite</span><span class="o">==</span><span class="m">2</span>.1
+</pre></div>
+</div>
+</div></blockquote>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">os</span>
+<span class="kn">import</span> <span class="nn">pathlib</span>
+<span class="kn">import</span> <span class="nn">tarfile</span>
+<span class="kn">import</span> <span class="nn">tempfile</span>
+<span class="kn">import</span> <span class="nn">shutil</span>
+</pre></div>
+</div>
+</div>
+<div class="section" id="install-zephyr">
+<h2>Install Zephyr<a class="headerlink" href="#install-zephyr" title="Permalink to this headline">¶</a></h2>
+<blockquote>
+<div><div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>%%shell
+<span class="c1"># Install west and ninja</span>
+python3 -m pip install west
+apt-get install -y ninja-build
+
+<span class="c1"># Install ZephyrProject</span>
+<span class="nv">ZEPHYR_PROJECT_PATH</span><span class="o">=</span><span class="s2">&quot;/content/zephyrproject&quot;</span>
+<span class="nb">export</span> <span class="nv">ZEPHYR_BASE</span><span class="o">=</span><span class="si">${</span><span class="nv">ZEPHYR_PROJECT_PATH</span><span class="si">}</span>/zephyr
+west init <span class="si">${</span><span class="nv">ZEPHYR_PROJECT_PATH</span><span class="si">}</span>
+<span class="nb">cd</span> <span class="si">${</span><span class="nv">ZEPHYR_BASE</span><span class="si">}</span>
+git checkout v2.7-branch
+<span class="nb">cd</span> ..
+west update
+west zephyr-export
+chmod -R o+w <span class="si">${</span><span class="nv">ZEPHYR_PROJECT_PATH</span><span class="si">}</span>
+
+<span class="c1"># Install Zephyr SDK</span>
+<span class="nv">ZEPHYR_SDK_VERSION</span><span class="o">=</span><span class="m">0</span>.13.2
+<span class="nv">ZEPHYR_SDK_FILE</span><span class="o">=</span><span class="s2">&quot;/content/zephyr-sdk-linux-setup.run&quot;</span>
+wget --no-verbose -O <span class="nv">$ZEPHYR_SDK_FILE</span> <span class="se">\</span>
+    https://github.com/zephyrproject-rtos/sdk-ng/releases/download/v<span class="si">${</span><span class="nv">ZEPHYR_SDK_VERSION</span><span class="si">}</span>/zephyr-sdk-<span class="si">${</span><span class="nv">ZEPHYR_SDK_VERSION</span><span class="si">}</span>-linux-x86_64-setup.run
+chmod +x <span class="nv">$ZEPHYR_SDK_FILE</span>
+<span class="s2">&quot;</span><span class="nv">$ZEPHYR_SDK_FILE</span><span class="s2">&quot;</span> -- -d /content/zephyr-sdk --quiet
+
+<span class="c1"># Install python dependencies</span>
+python3 -m pip install -r <span class="s2">&quot;</span><span class="si">${</span><span class="nv">ZEPHYR_BASE</span><span class="si">}</span><span class="s2">/scripts/requirements.txt&quot;</span>
+</pre></div>
+</div>
+</div></blockquote>
+<p><strong>Note:</strong> Install CMSIS-NN only if you are interested in generating this submission
+using the CMSIS-NN code generator.</p>
+</div>
+<div class="section" id="install-cmsis-nn">
+<h2>Install CMSIS-NN<a class="headerlink" href="#install-cmsis-nn" title="Permalink to this headline">¶</a></h2>
+<blockquote>
+<div><div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>%%shell
+<span class="nv">CMSIS_SHA</span><span class="o">=</span><span class="s2">&quot;51263182d16c92649a48144ba56c0945f9fce60e&quot;</span>
+<span class="nv">CMSIS_URL</span><span class="o">=</span><span class="s2">&quot;http://github.com/ARM-software/CMSIS_5/archive/</span><span class="si">${</span><span class="nv">CMSIS_SHA</span><span class="si">}</span><span class="s2">.tar.gz&quot;</span>
+<span class="nb">export</span> <span class="nv">CMSIS_PATH</span><span class="o">=</span>/content/cmsis
+<span class="nv">DOWNLOAD_PATH</span><span class="o">=</span><span class="s2">&quot;/content/</span><span class="si">${</span><span class="nv">CMSIS_SHA</span><span class="si">}</span><span class="s2">.tar.gz&quot;</span>
+mkdir <span class="si">${</span><span class="nv">CMSIS_PATH</span><span class="si">}</span>
+wget <span class="si">${</span><span class="nv">CMSIS_URL</span><span class="si">}</span> -O <span class="s2">&quot;</span><span class="si">${</span><span class="nv">DOWNLOAD_PATH</span><span class="si">}</span><span class="s2">&quot;</span>
+tar -xf <span class="s2">&quot;</span><span class="si">${</span><span class="nv">DOWNLOAD_PATH</span><span class="si">}</span><span class="s2">&quot;</span> -C <span class="si">${</span><span class="nv">CMSIS_PATH</span><span class="si">}</span> --strip-components<span class="o">=</span><span class="m">1</span>
+rm <span class="si">${</span><span class="nv">DOWNLOAD_PATH</span><span class="si">}</span>
+
+<span class="nv">CMSIS_NN_TAG</span><span class="o">=</span><span class="s2">&quot;v4.0.0&quot;</span>
+<span class="nv">CMSIS_NN_URL</span><span class="o">=</span><span class="s2">&quot;https://github.com/ARM-software/CMSIS-NN.git&quot;</span>
+git clone <span class="si">${</span><span class="nv">CMSIS_NN_URL</span><span class="si">}</span> --branch <span class="si">${</span><span class="nv">CMSIS_NN_TAG</span><span class="si">}</span> --single-branch <span class="si">${</span><span class="nv">CMSIS_PATH</span><span class="si">}</span>/CMSIS-NN
+</pre></div>
+</div>
+</div></blockquote>
+</div>
+<div class="section" id="import-python-dependencies">
+<h2>Import Python dependencies<a class="headerlink" href="#import-python-dependencies" title="Permalink to this headline">¶</a></h2>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tensorflow</span> <span class="k">as</span> <span class="nn">tf</span>
+<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
+
+<span class="kn">import</span> <span class="nn">tvm</span>
+<span class="kn">from</span> <span class="nn">tvm</span> <span class="kn">import</span> <span class="n">relay</span>
+<span class="kn">from</span> <span class="nn">tvm.relay.backend</span> <span class="kn">import</span> <span class="n">Executor</span><span class="p">,</span> <span class="n">Runtime</span>
+<span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="kn">import</span> <span class="n">download_testdata</span>
+<span class="kn">from</span> <span class="nn">tvm.micro</span> <span class="kn">import</span> <a href="../../reference/api/python/micro.html#tvm.micro.export_model_library_format" title="tvm.micro.export_model_library_format" class="sphx-glr-backref-module-tvm-micro sphx-glr-backref-type-py-function"><span class="n">export_model_library_format</span></a>
+<span class="kn">from</span> <span class="nn">tvm.micro.model_library_format</span> <span class="kn">import</span> <span class="n">generate_c_interface_header</span>
+<span class="kn">from</span> <span class="nn">tvm.micro.testing.utils</span> <span class="kn">import</span> <span class="p">(</span>
+    <span class="n">create_header_file</span><span class="p">,</span>
+    <span class="n">mlf_extract_workspace_size_bytes</span><span class="p">,</span>
+<span class="p">)</span>
+</pre></div>
+</div>
+</div>
+<div class="section" id="import-visual-wake-word-model">
+<h2>Import Visual Wake Word Model<a class="headerlink" href="#import-visual-wake-word-model" title="Permalink to this headline">¶</a></h2>
+<p>To begin with, download and import the Visual Wake Word (VWW) TFLite model from MLPerfTiny.
+This model originally comes from the <a class="reference external" href="https://github.com/mlcommons/tiny">MLPerf Tiny repository</a>.
+We also capture metadata from the TFLite model, such as the input/output names and
+quantization parameters, which will be used in the following steps.</p>
+<p>We use an index for each model when building the submission. The indices are defined
+as follows; to build another model, update the model URL, the short name, and the index
+number (see the sketch after this list).</p>
+<blockquote>
+<div><ul class="simple">
+<li><p>Keyword Spotting(KWS) 1</p></li>
+<li><p>Visual Wake Word(VWW) 2</p></li>
+<li><p>Anomaly Detection(AD) 3</p></li>
+<li><p>Image Classification(IC) 4</p></li>
+</ul>
+</div></blockquote>
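+<p>For example, switching this tutorial to the Keyword Spotting model would only require
+changing these three values. This is a hedged sketch: the URL below is a placeholder, and
+you would point it at the corresponding <code class="docutils literal notranslate"><span class="pre">.tflite</span></code> file in the MLPerf Tiny repository.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span># Hypothetical values for a Keyword Spotting (KWS) submission. The URL is a
+# placeholder; use the matching trained model from the MLPerf Tiny repository.
+MODEL_URL = "https://github.com/mlcommons/tiny/raw/&lt;commit&gt;/benchmark/training/keyword_spotting/trained_models/kws_ref_model.tflite"
+MODEL_SHORT_NAME = "KWS"
+MODEL_INDEX = 1
+</pre></div>
+</div>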
+<p>If you would like to build the submission with CMSIS-NN, set the TVM_USE_CMSIS environment variable.</p>
+<blockquote>
+<div><div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">export</span> <span class="nv">USE_CMSIS</span><span class="o">=</span><span class="m">1</span>
+</pre></div>
+</div>
+</div></blockquote>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">MODEL_URL</span> <span class="o">=</span> <span class="s2">&quot;https://github.com/mlcommons/tiny/raw/bceb91c5ad2e2deb295547d81505721d3a87d578/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite&quot;</span>
+<span class="n">MODEL_PATH</span> <span class="o">=</span> <span class="n">download_testdata</span><span class="p">(</span><span class="n">MODEL_URL</span><span class="p">,</span> <span class="s2">&quot;vww_96_int8.tflite&quot;</span><span class="p">,</span> <span class="n">module</span><span class="o">=</span><span class="s2">&quot;model&quot;</span><span class="p">)</span>
+
+<span class="n">MODEL_SHORT_NAME</span> <span class="o">=</span> <span class="s2">&quot;VWW&quot;</span>
+<span class="n">MODEL_INDEX</span> <span class="o">=</span> <span class="mi">2</span>
+
+<span class="n">USE_CMSIS</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;TVM_USE_CMSIS&quot;</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span>
+
+<span class="n">tflite_model_buf</span> <span class="o">=</span> <span class="nb">open</span><span class="p">(</span><span class="n">MODEL_PATH</span><span class="p">,</span> <span class="s2">&quot;rb&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">read</span><span class="p">()</span>
+<span class="k">try</span><span class="p">:</span>
+    <span class="kn">import</span> <span class="nn">tflite</span>
+
+    <span class="n">tflite_model</span> <span class="o">=</span> <span class="n">tflite</span><span class="o">.</span><span class="n">Model</span><span class="o">.</span><span class="n">GetRootAsModel</span><span class="p">(</span><span class="n">tflite_model_buf</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span>
+<span class="k">except</span> <span class="ne">AttributeError</span><span class="p">:</span>
+    <span class="kn">import</span> <span class="nn">tflite.Model</span>
+
+    <span class="n">tflite_model</span> <span class="o">=</span> <span class="n">tflite</span><span class="o">.</span><span class="n">Model</span><span class="o">.</span><span class="n">Model</span><span class="o">.</span><span class="n">GetRootAsModel</span><span class="p">(</span><span class="n">tflite_model_buf</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span>
+
+<span class="n">interpreter</span> <span class="o">=</span> <span class="n">tf</span><span class="o">.</span><span class="n">lite</span><span class="o">.</span><span class="n">Interpreter</span><span class="p">(</span><span class="n">model_path</span><span class="o">=</span><span class="nb">str</span><span class="p">(</span><span class="n">MODEL_PATH</span><span class="p">))</span>
+<span class="n">interpreter</span><span class="o">.</span><span class="n">allocate_tensors</span><span class="p">()</span>
+<span class="n">input_details</span> <span class="o">=</span> <span class="n">interpreter</span><span class="o">.</span><span class="n">get_input_details</span><span class="p">()</span>
+<span class="n">output_details</span> <span class="o">=</span> <span class="n">interpreter</span><span class="o">.</span><span class="n">get_output_details</span><span class="p">()</span>
+
+<span class="n">input_name</span> <span class="o">=</span> <span class="n">input_details</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="s2">&quot;name&quot;</span><span class="p">]</span>
+<span class="n">input_shape</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">input_details</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="s2">&quot;shape&quot;</span><span class="p">])</span>
+<span class="n">input_dtype</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">dtype</span><span class="p">(</span><span class="n">input_details</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="s2">&quot;dtype&quot;</span><span class="p">])</span><span class="o">.</span><span class="n">name</span>
+<span class="n">output_name</span> <span class="o">=</span> <span class="n">output_details</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="s2">&quot;name&quot;</span><span class="p">]</span>
+<span class="n">output_shape</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">output_details</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="s2">&quot;shape&quot;</span><span class="p">])</span>
+<span class="n">output_dtype</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">dtype</span><span class="p">(</span><span class="n">output_details</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="s2">&quot;dtype&quot;</span><span class="p">])</span><span class="o">.</span><span class="n">name</span>
+
+<span class="c1"># We extract quantization information from TFLite model.</span>
+<span class="c1"># This is required for all models except Anomaly Detection,</span>
+<span class="c1"># because for other models we send quantized data to interpreter</span>
+<span class="c1"># from host, however, for AD model we send floating data and quantization</span>
+<span class="c1"># happens on the microcontroller.</span>
+<span class="k">if</span> <span class="n">MODEL_SHORT_NAME</span> <span class="o">!=</span> <span class="s2">&quot;AD&quot;</span><span class="p">:</span>
+    <span class="n">quant_output_scale</span> <span class="o">=</span> <span class="n">output_details</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="s2">&quot;quantization_parameters&quot;</span><span class="p">][</span><span class="s2">&quot;scales&quot;</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span>
+    <span class="n">quant_output_zero_point</span> <span class="o">=</span> <span class="n">output_details</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="s2">&quot;quantization_parameters&quot;</span><span class="p">][</span><span class="s2">&quot;zero_points&quot;</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span>
+
+<span class="n">relay_mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <a href="../../reference/api/python/relay/frontend.html#tvm.relay.frontend.from_tflite" title="tvm.relay.frontend.from_tflite" class="sphx-glr-backref-module-tvm-relay-frontend sphx-glr-backref-type-py-function"><span class="n">relay</span><span class="o">.</span><span class="n">frontend</span><span class="o">.</span><span class="n">from_tflite</span></a><span class="p">(</span>
+    <span class="n">tflite_model</span><span class="p">,</span> <span class="n">shape_dict</span><span class="o">=</span><span class="p">{</span><span class="n">input_name</span><span class="p">:</span> <span class="n">input_shape</span><span class="p">},</span> <span class="n">dtype_dict</span><span class="o">=</span><span class="p">{</span><span class="n">input_name</span><span class="p">:</span> <span class="n">input_dtype</span><span class="p">}</span>
+<span class="p">)</span>
+</pre></div>
+</div>
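+<p>As a quick sanity check (a sketch for illustration, not part of the generated project),
+the extracted parameters map a quantized output value <code class="docutils literal notranslate"><span class="pre">q</span></code>
+back to a real value via <code class="docutils literal notranslate"><span class="pre">scale * (q - zero_point)</span></code>,
+following the standard TFLite quantization scheme:</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span># Illustrative only: dequantize a hypothetical int8 output value using the
+# parameters extracted above.
+q = np.int8(-25)  # an example quantized logit
+real_value = quant_output_scale * (float(q) - quant_output_zero_point)
+print(real_value)
+</pre></div>
+</div>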
+</div>
+<div class="section" id="defining-target-runtime-and-executor">
+<h2>Defining Target, Runtime and Executor<a class="headerlink" href="#defining-target-runtime-and-executor" title="Permalink to this headline">¶</a></h2>
+<p>Now we need to define the target, runtime and executor to compile this model. In this tutorial,
+we use Ahead-of-Time (AoT) compilation and build a standalone project. This is different
+from using AoT in host-driven mode, where the target communicates with the host using the
+host-driven AoT executor to run inference.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># Use the C runtime (crt)</span>
+<span class="n">RUNTIME</span> <span class="o">=</span> <span class="n">Runtime</span><span class="p">(</span><span class="s2">&quot;crt&quot;</span><span class="p">)</span>
+
+<span class="c1"># Use the AoT executor with `unpacked-api=True` and `interface-api=c`. `interface-api=c` forces</span>
+<span class="c1"># the compiler to generate C type function APIs and `unpacked-api=True` forces the compiler</span>
+<span class="c1"># to generate minimal unpacked format inputs which reduces the stack memory usage on calling</span>
+<span class="c1"># inference layers of the model.</span>
+<span class="n">EXECUTOR</span> <span class="o">=</span> <span class="n">Executor</span><span class="p">(</span>
+    <span class="s2">&quot;aot&quot;</span><span class="p">,</span>
+    <span class="p">{</span><span class="s2">&quot;unpacked-api&quot;</span><span class="p">:</span> <span class="kc">True</span><span class="p">,</span> <span class="s2">&quot;interface-api&quot;</span><span class="p">:</span> <span class="s2">&quot;c&quot;</span><span class="p">,</span> <span class="s2">&quot;workspace-byte-alignment&quot;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span>
+<span class="p">)</span>
+
+<span class="c1"># Select a Zephyr board</span>
+<span class="n">BOARD</span> <span class="o">=</span> <a href="https://docs.python.org/3/library/os.html#os.getenv" title="os.getenv" class="sphx-glr-backref-module-os sphx-glr-backref-type-py-function"><span class="n">os</span><span class="o">.</span><span class="n">getenv</span></a><span class="p">(</span><span class="s2">&quot;TVM_MICRO_BOARD&quot;</span><span class="p">,</span> <span class="n">default</span><span class="o">=</span><span class="s2">&quot;nucleo_l4r5zi&quot;</span><spa [...]
+
+<span class="c1"># Get the the full target description using the BOARD</span>
+<span class="n">TARGET</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">micro</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">get_target</span><span class="p">(</span><span class="s2">&quot;zephyr&quot;</span><span class="p">,</span> <span class="n">BOARD</span><span class="p">)</span>
+</pre></div>
+</div>
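+<p>Before compiling, it can help to confirm which board and target were resolved
+(purely diagnostic; the exact target attributes depend on the selected board):</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span># Diagnostic only: show the resolved board name and target description.
+print("Board:", BOARD)
+print("Target:", TARGET)
+</pre></div>
+</div>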
+</div>
+<div class="section" id="compile-the-model-and-export-model-library-format">
+<h2>Compile the model and export model library format<a class="headerlink" href="#compile-the-model-and-export-model-library-format" title="Permalink to this headline">¶</a></h2>
+<p>Now, we compile the model for the target and export it in Model
+Library Format. We also calculate the workspace size that the
+compiled model requires.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">config</span> <span class="o">=</span> <span class="p">{</span><span class="s2">&quot;tir.disable_vectorize&quot;</span><span class="p">:</span> <span class="kc">True</span><span class="p">}</span>
+<span class="k">if</span> <span class="n">USE_CMSIS</span><span class="p">:</span>
+    <span class="kn">from</span> <span class="nn">tvm.relay.op.contrib</span> <span class="kn">import</span> <span class="n">cmsisnn</span>
+
+    <span class="n">config</span><span class="p">[</span><span class="s2">&quot;relay.ext.cmsisnn.options&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span><span class="s2">&quot;mcpu&quot;</span><span class="p">:</span> <span class="n">TARGET</span><span class="o">.</span><span class="n">mcpu</span><span class="p">}</span>
+    <span class="n">relay_mod</span> <span class="o">=</span> <span class="n">cmsisnn</span><span class="o">.</span><span class="n">partition_for_cmsisnn</span><span class="p">(</span><span class="n">relay_mod</span><span class="p">,</span> <span class="n">params</span><span class="p">,</span> <span class="n">mcpu</span><span class="o">=</span><span class="n">TARGET</span><span class="o">.</span><span class="n">mcpu</span><span class="p">)</span>
+
+<span class="k">with</span> <a href="../../reference/api/python/ir.html#tvm.transform.PassContext" title="tvm.transform.PassContext" class="sphx-glr-backref-module-tvm-transform sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">tvm</span><span class="o">.</span><span class="n">transform</span><span class="o">.</span><span class="n">PassContext</span></a><span class="p">(</span><span class="n">opt_level</span><span class="o">=</span><span class="mi">3</span><span c [...]
+    <span class="n">module</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">build</span><span class="p">(</span>
+        <span class="n">relay_mod</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="n">TARGET</span><span class="p">,</span> <span class="n">params</span><span class="o">=</span><span class="n">params</span><span class="p">,</span> <span class="n">runtime</span><span class="o">=</span><span class="n">RUNTIME</span><span class="p">,</span> <span class="n">executor</span><span class="o">=</span><span class="n">EXECUTOR</span>
+    <span class="p">)</span>
+
+<span class="n">temp_dir</span> <span class="o">=</span> <a href="../../reference/api/python/contrib.html#tvm.contrib.utils.tempdir" title="tvm.contrib.utils.tempdir" class="sphx-glr-backref-module-tvm-contrib-utils sphx-glr-backref-type-py-function"><span class="n">tvm</span><span class="o">.</span><span class="n">contrib</span><span class="o">.</span><span class="n">utils</span><span class="o">.</span><span class="n">tempdir</span></a><span class="p">()</span>
+<span class="n">model_tar_path</span> <span class="o">=</span> <span class="n">temp_dir</span> <span class="o">/</span> <span class="s2">&quot;model.tar&quot;</span>
+<a href="../../reference/api/python/micro.html#tvm.micro.export_model_library_format" title="tvm.micro.export_model_library_format" class="sphx-glr-backref-module-tvm-micro sphx-glr-backref-type-py-function"><span class="n">export_model_library_format</span></a><span class="p">(</span><span class="n">module</span><span class="p">,</span> <span class="n">model_tar_path</span><span class="p">)</span>
+<span class="n">workspace_size</span> <span class="o">=</span> <span class="n">mlf_extract_workspace_size_bytes</span><span class="p">(</span><span class="n">model_tar_path</span><span class="p">)</span>
+</pre></div>
+</div>
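+<p>It can be useful to print the computed workspace size before generating the project;
+this value feeds the <code class="docutils literal notranslate"><span class="pre">WORKSPACE_SIZE</span></code> compile definition used below (diagnostic only):</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span># Diagnostic only: the AoT workspace the generated project must reserve.
+print(f"Workspace size: {workspace_size} bytes")
+</pre></div>
+</div>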
+</div>
+<div class="section" id="generate-input-output-header-files">
+<h2>Generate input/output header files<a class="headerlink" href="#generate-input-output-header-files" title="Permalink to this headline">¶</a></h2>
+<p>To create a microTVM standalone project with AoT, we need to generate
+input and output header files. These header files connect the input and
+output API of the generated code to the rest of the standalone project.
+For this specific submission, we only need to generate the output header
+file, since the input API call is handled differently.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">extra_tar_dir</span> <span class="o">=</span> <a href="../../reference/api/python/contrib.html#tvm.contrib.utils.tempdir" title="tvm.contrib.utils.tempdir" class="sphx-glr-backref-module-tvm-contrib-utils sphx-glr-backref-type-py-function"><span class="n">tvm</span><span class="o">.</span><span class="n">contrib</span><span class="o">.</span><span class="n">utils</span><span class="o">.</s [...]
+<span class="n">extra_tar_file</span> <span class="o">=</span> <span class="n">extra_tar_dir</span> <span class="o">/</span> <span class="s2">&quot;extra.tar&quot;</span>
+
+<span class="k">with</span> <a href="https://docs.python.org/3/library/tarfile.html#tarfile.open" title="tarfile.open" class="sphx-glr-backref-module-tarfile sphx-glr-backref-type-py-function"><span class="n">tarfile</span><span class="o">.</span><span class="n">open</span></a><span class="p">(</span><span class="n">extra_tar_file</span><span class="p">,</span> <span class="s2">&quot;w:gz&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">tf</span><span class= [...]
+    <span class="k">with</span> <a href="https://docs.python.org/3/library/tempfile.html#tempfile.TemporaryDirectory" title="tempfile.TemporaryDirectory" class="sphx-glr-backref-module-tempfile sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">tempfile</span><span class="o">.</span><span class="n">TemporaryDirectory</span></a><span class="p">()</span> <span class="k">as</span> <span class="n">tar_temp_dir</span><span class="p">:</span>
+        <span class="n">model_files_path</span> <span class="o">=</span> <a href="https://docs.python.org/3/library/os.path.html#os.path.join" title="os.path.join" class="sphx-glr-backref-module-os-path sphx-glr-backref-type-py-function"><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span></a><span class="p">(</span><span class="n">tar_temp_dir</span><span class="p">,</span> <span class="s2">&quot;include&quot;</s [...]
+        <a href="https://docs.python.org/3/library/os.html#os.mkdir" title="os.mkdir" class="sphx-glr-backref-module-os sphx-glr-backref-type-py-function"><span class="n">os</span><span class="o">.</span><span class="n">mkdir</span></a><span class="p">(</span><span class="n">model_files_path</span><span class="p">)</span>
+        <span class="n">header_path</span> <span class="o">=</span> <span class="n">generate_c_interface_header</span><span class="p">(</span>
+            <span class="n">module</span><span class="o">.</span><span class="n">libmod_name</span><span class="p">,</span> <span class="p">[</span><span class="n">input_name</span><span class="p">],</span> <span class="p">[</span><span class="n">output_name</span><span class="p">],</span> <span class="p">[],</span> <span class="p">{},</span> <span class="p">[],</span> <span class="mi">0</span><span class="p">,</span> <span class="n">model_files_path</span><span class="p">,</span> <span  [...]
+        <span class="p">)</span>
+        <span class="n">tf</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">header_path</span><span class="p">,</span> <span class="n">arcname</span><span class="o">=</span><a href="https://docs.python.org/3/library/os.path.html#os.path.relpath" title="os.path.relpath" class="sphx-glr-backref-module-os-path sphx-glr-backref-type-py-function"><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span>< [...]
+
+    <span class="n">create_header_file</span><span class="p">(</span>
+        <span class="s2">&quot;output_data&quot;</span><span class="p">,</span>
+        <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span>
+            <span class="n">shape</span><span class="o">=</span><span class="n">output_shape</span><span class="p">,</span>
+            <span class="n">dtype</span><span class="o">=</span><span class="n">output_dtype</span><span class="p">,</span>
+        <span class="p">),</span>
+        <span class="s2">&quot;include&quot;</span><span class="p">,</span>
+        <span class="n">tf</span><span class="p">,</span>
+    <span class="p">)</span>
+</pre></div>
+</div>
+</div>
+<div class="section" id="create-the-project-build-and-prepare-the-project-tar-file">
+<h2>Create the project, build and prepare the project tar file<a class="headerlink" href="#create-the-project-build-and-prepare-the-project-tar-file" title="Permalink to this headline">¶</a></h2>
+<p>Now that we have the compiled model in Model Library Format,
+we can generate the full project using the Zephyr template project. First,
+we prepare the project options, then build the project. Finally, we
+clean up the temporary files and move the submission project to the
+current working directory, from where it can be downloaded and used on
+your development kit.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">input_total_size</span> <span class="o">=</span> <span class="mi">1</span>
+<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">input_shape</span><span class="p">)):</span>
+    <span class="n">input_total_size</span> <span class="o">*=</span> <span class="n">input_shape</span><span class="p">[</span><span class="n">i</span><span class="p">]</span>
+
+<span class="n">template_project_path</span> <span class="o">=</span> <a href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="pathlib.Path" class="sphx-glr-backref-module-pathlib sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">pathlib</span><span class="o">.</span><span class="n">Path</span></a><span class="p">(</span><a href="../../reference/api/python/micro.html#tvm.micro.get_microtvm_template_projects" title="tvm.micro.get_microtvm_templa [...]
+<span class="n">project_options</span> <span class="o">=</span> <span class="p">{</span>
+    <span class="s2">&quot;extra_files_tar&quot;</span><span class="p">:</span> <span class="nb">str</span><span class="p">(</span><span class="n">extra_tar_file</span><span class="p">),</span>
+    <span class="s2">&quot;project_type&quot;</span><span class="p">:</span> <span class="s2">&quot;mlperftiny&quot;</span><span class="p">,</span>
+    <span class="s2">&quot;board&quot;</span><span class="p">:</span> <span class="n">BOARD</span><span class="p">,</span>
+    <span class="s2">&quot;compile_definitions&quot;</span><span class="p">:</span> <span class="p">[</span>
+        <span class="sa">f</span><span class="s2">&quot;-DWORKSPACE_SIZE=</span><span class="si">{</span><span class="n">workspace_size</span> <span class="o">+</span> <span class="mi">512</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">,</span>  <span class="c1"># Memory workspace size, 512 is a temporary offset</span>
+        <span class="c1"># since the memory calculation is not accurate.</span>
+        <span class="sa">f</span><span class="s2">&quot;-DTARGET_MODEL=</span><span class="si">{</span><span class="n">MODEL_INDEX</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">,</span>  <span class="c1"># Sets the model index for project compilation.</span>
+        <span class="sa">f</span><span class="s2">&quot;-DTH_MODEL_VERSION=EE_MODEL_VERSION_</span><span class="si">{</span><span class="n">MODEL_SHORT_NAME</span><span class="si">}</span><span class="s2">01&quot;</span><span class="p">,</span>  <span class="c1"># Sets model version. This is required by MLPerfTiny API.</span>
+        <span class="sa">f</span><span class="s2">&quot;-DMAX_DB_INPUT_SIZE=</span><span class="si">{</span><span class="n">input_total_size</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">,</span>  <span class="c1"># Max size of the input data array.</span>
+    <span class="p">],</span>
+<span class="p">}</span>
+
+<span class="k">if</span> <span class="n">MODEL_SHORT_NAME</span> <span class="o">!=</span> <span class="s2">&quot;AD&quot;</span><span class="p">:</span>
+    <span class="n">project_options</span><span class="p">[</span><span class="s2">&quot;compile_definitions&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;-DOUT_QUANT_SCALE=</span><span class="si">{</span><span class="n">quant_output_scale</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+    <span class="n">project_options</span><span class="p">[</span><span class="s2">&quot;compile_definitions&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;-DOUT_QUANT_ZERO=</span><span class="si">{</span><span class="n">quant_output_zero_point</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+
+<span class="k">if</span> <span class="n">USE_CMSIS</span><span class="p">:</span>
+    <span class="n">project_options</span><span class="p">[</span><span class="s2">&quot;compile_definitions&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;-DCOMPILE_WITH_CMSISNN=1&quot;</span><span class="p">)</span>
+
+<span class="c1"># Note: You might need to adjust this based on the board that you are using.</span>
+<span class="n">project_options</span><span class="p">[</span><span class="s2">&quot;config_main_stack_size&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="mi">4000</span>
+
+<span class="k">if</span> <span class="n">USE_CMSIS</span><span class="p">:</span>
+    <span class="n">project_options</span><span class="p">[</span><span class="s2">&quot;cmsis_path&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;CMSIS_PATH&quot;</span><span class="p">,</span> <span class="s2">&quot;/content/cmsis&quot;</span><span class="p">)</span>
+
+<span class="n">generated_project_dir</span> <span class="o">=</span> <span class="n">temp_dir</span> <span class="o">/</span> <span class="s2">&quot;project&quot;</span>
+
+<span class="n">project</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">micro</span><span class="o">.</span><span class="n">project</span><span class="o">.</span><span class="n">generate_project_from_mlf</span><span class="p">(</span>
+    <span class="n">template_project_path</span><span class="p">,</span> <span class="n">generated_project_dir</span><span class="p">,</span> <span class="n">model_tar_path</span><span class="p">,</span> <span class="n">project_options</span>
+<span class="p">)</span>
+<span class="n">project</span><span class="o">.</span><span class="n">build</span><span class="p">()</span>
+
+<span class="c1"># Cleanup the build directory and extra artifacts</span>
+<a href="https://docs.python.org/3/library/shutil.html#shutil.rmtree" title="shutil.rmtree" class="sphx-glr-backref-module-shutil sphx-glr-backref-type-py-function"><span class="n">shutil</span><span class="o">.</span><span class="n">rmtree</span></a><span class="p">(</span><span class="n">generated_project_dir</span> <span class="o">/</span> <span class="s2">&quot;build&quot;</span><span class="p">)</span>
+<span class="p">(</span><span class="n">generated_project_dir</span> <span class="o">/</span> <span class="s2">&quot;model.tar&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">unlink</span><span class="p">()</span>
+
+<span class="n">project_tar_path</span> <span class="o">=</span> <a href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="pathlib.Path" class="sphx-glr-backref-module-pathlib sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">pathlib</span><span class="o">.</span><span class="n">Path</span></a><span class="p">(</span><a href="https://docs.python.org/3/library/os.html#os.getcwd" title="os.getcwd" class="sphx-glr-backref-module-os sphx-glr-backref [...]
+<span class="k">with</span> <a href="https://docs.python.org/3/library/tarfile.html#tarfile.open" title="tarfile.open" class="sphx-glr-backref-module-tarfile sphx-glr-backref-type-py-function"><span class="n">tarfile</span><span class="o">.</span><span class="n">open</span></a><span class="p">(</span><span class="n">project_tar_path</span><span class="p">,</span> <span class="s2">&quot;w:tar&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">tar</span><span cl [...]
+    <span class="n">tar</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">generated_project_dir</span><span class="p">,</span> <span class="n">arcname</span><span class="o">=</span><a href="https://docs.python.org/3/library/os.path.html#os.path.basename" title="os.path.basename" class="sphx-glr-backref-module-os-path sphx-glr-backref-type-py-function"><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o"> [...]
+
+<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;The generated project is located here: </span><span class="si">{</span><span class="n">project_tar_path</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</pre></div>
+</div>
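+<p>Before copying the archive to your development machine, you can optionally sanity-check
+its contents from Python (a convenience sketch; it only lists the first few entries):</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span># Optional sanity check: list the first few entries of the submission archive.
+with tarfile.open(project_tar_path) as tar:
+    for member in tar.getmembers()[:5]:
+        print(member.name)
+</pre></div>
+</div>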
+</div>
+<div class="section" id="use-this-project-with-your-board">
+<h2>Use this project with your board<a class="headerlink" href="#use-this-project-with-your-board" title="Permalink to this headline">¶</a></h2>
+<p>Now that we have the generated project, you can use it locally
+to flash your board and prepare it for the EEMBC runner software.
+To do this, follow these steps:</p>
+<blockquote>
+<div><div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>tar -xf project.tar
+<span class="nb">cd</span> project
+mkdir build
+<span class="nb">cd</span> build
+cmake ..
+make -j2
+west flash
+</pre></div>
+</div>
+</div></blockquote>
+<p>Now you can connect your board to the EEMBC runner following these
+<a class="reference external" href="https://github.com/eembc/energyrunner">instructions</a>
+and benchmark this model on your board.</p>
+<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-mlperftiny-py">
+<div class="sphx-glr-download sphx-glr-download-python docutils container">
+<p><a class="reference download internal" download="" href="../../_downloads/4577847d31fa9ec38d0a6dda3e1d178d/micro_mlperftiny.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">micro_mlperftiny.py</span></code></a></p>
+</div>
+<div class="sphx-glr-download sphx-glr-download-jupyter docutils container">
+<p><a class="reference download internal" download="" href="../../_downloads/1fc1683d67bee4f26703504a58d42578/micro_mlperftiny.ipynb"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Jupyter</span> <span class="pre">notebook:</span> <span class="pre">micro_mlperftiny.ipynb</span></code></a></p>
+</div>
+</div>
+<p class="sphx-glr-signature"><a class="reference external" href="https://sphinx-gallery.github.io">Gallery generated by Sphinx-Gallery</a></p>
+</div>
+</div>
+
+
+           </div>
+           
+          </div>
+          
+
+<footer>
+
+    <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
+      
+        <a href="micro_pytorch.html" class="btn btn-neutral float-right" title="microTVM PyTorch Tutorial" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
+      
+      
+        <a href="micro_ethosu.html" class="btn btn-neutral float-left" title="Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
+      
+    </div>
+
+<div id="button" class="backtop"><img src="../../_static/img/right.svg" alt="backtop"/> </div>
+<section class="footerSec">
+    <div class="footerHeader">
+      <div class="d-flex align-md-items-center justify-content-between flex-column flex-md-row">
+        <div class="copywrite d-flex align-items-center">
+          <h5 id="copy-right-info">© 2022 Apache Software Foundation | All rights reserved</h5>
+        </div>
+      </div>
+
+    </div>
+
+    <div>
+      <div class="footernote">Copyright © 2022 The Apache Software Foundation. Apache TVM, Apache, the Apache feather, and the Apache TVM project logo are either trademarks or registered trademarks of the Apache Software Foundation.</div>
+    </div>
+
+</section>
+</footer>
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+  
+
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.12.9/umd/popper.min.js" integrity="sha384-ApNbgh9B+Y1QKtv3Rn7W3mgPxhU9K/ScQsAP7hUibX39j7fakFPskvXusvfa0b4Q" crossorigin="anonymous"></script>
+    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/js/bootstrap.min.js" integrity="sha384-JZR6Spejh4U02d8jOt6vLEHfe/JQGiRRSQQxSfFWpi1MquVdAyjUar5+76PVCmYl" crossorigin="anonymous"></script>
+
+  </body>
+  <script type="text/javascript">
+      jQuery(function () {
+          SphinxRtdTheme.Navigation.enable(true);
+      });
+  </script>
+
+  
+  
+    
+    <!-- Theme Analytics -->
+    <script>
+    (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+      (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+      m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+    })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
+
+    ga('create', 'UA-75982049-2', 'auto');
+    ga('send', 'pageview');
+    </script>
+
+    
+   
+
+</body>
+</html>
\ No newline at end of file
diff --git a/docs/how_to/work_with_microtvm/micro_pytorch.html b/docs/how_to/work_with_microtvm/micro_pytorch.html
index 2f9af81144..216499fa2f 100644
--- a/docs/how_to/work_with_microtvm/micro_pytorch.html
+++ b/docs/how_to/work_with_microtvm/micro_pytorch.html
@@ -49,7 +49,7 @@
     <link rel="index" title="Index" href="../../genindex.html" />
     <link rel="search" title="Search" href="../../search.html" />
     <link rel="next" title="microTVM Reference Virtual Machines" href="micro_reference_vm.html" />
-    <link rel="prev" title="Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN" href="micro_ethosu.html" /> 
+    <link rel="prev" title="Creating Your MLPerfTiny Submission with microTVM" href="micro_mlperftiny.html" /> 
 </head>
 
 <body class="wy-body-for-nav">
@@ -246,6 +246,7 @@
 <li class="toctree-l3"><a class="reference internal" href="micro_aot.html">microTVM Host-Driven AoT</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_autotune.html">Autotuning with microTVM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_ethosu.html">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</a></li>
+<li class="toctree-l3"><a class="reference internal" href="micro_mlperftiny.html">Creating Your MLPerfTiny Submission with microTVM</a></li>
 <li class="toctree-l3 current"><a class="current reference internal" href="#">microTVM PyTorch Tutorial</a><ul>
 <li class="toctree-l4"><a class="reference internal" href="#install-microtvm-python-dependencies">Install microTVM Python dependencies</a></li>
 <li class="toctree-l4"><a class="reference internal" href="#load-a-pre-trained-pytorch-model">Load a pre-trained PyTorch model</a></li>
@@ -453,7 +454,8 @@ download a cat image and preprocess it to use as the model input.</p>
 Downloading: &quot;https://download.pytorch.org/models/quantized/mobilenet_v2_qnnpack_37f702c5.pth&quot; to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2_qnnpack_37f702c5.pth
 
   0%|          | 0.00/3.42M [00:00&lt;?, ?B/s]
-100%|##########| 3.42M/3.42M [00:00&lt;00:00, 49.5MB/s]
+ 61%|######    | 2.09M/3.42M [00:00&lt;00:00, 19.9MB/s]
+100%|##########| 3.42M/3.42M [00:00&lt;00:00, 30.4MB/s]
 /workspace/python/tvm/relay/frontend/pytorch_utils.py:47: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
   return LooseVersion(torch_ver) &gt; ver
 /venv/apache-tvm-py3.7/lib/python3.7/site-packages/setuptools/_distutils/version.py:346: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
@@ -577,7 +579,7 @@ via the host <cite>main.cc`</cite> or if a Zephyr emulated board is selected as
 Torch top-1 id: 282, class name: tiger cat
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  8.520 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  9.130 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-pytorch-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/12b9ecc04c41abaa12022061771821d1/micro_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">micro_pytorch.py</span></code></a></p>
@@ -603,7 +605,7 @@ Torch top-1 id: 282, class name: tiger cat
         <a href="micro_reference_vm.html" class="btn btn-neutral float-right" title="microTVM Reference Virtual Machines" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
       
       
-        <a href="micro_ethosu.html" class="btn btn-neutral float-left" title="Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
+        <a href="micro_mlperftiny.html" class="btn btn-neutral float-left" title="Creating Your MLPerfTiny Submission with microTVM" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
       
     </div>
 
diff --git a/docs/how_to/work_with_microtvm/micro_reference_vm.html b/docs/how_to/work_with_microtvm/micro_reference_vm.html
index 0589cb016d..74697a166e 100644
--- a/docs/how_to/work_with_microtvm/micro_reference_vm.html
+++ b/docs/how_to/work_with_microtvm/micro_reference_vm.html
@@ -246,6 +246,7 @@
 <li class="toctree-l3"><a class="reference internal" href="micro_aot.html">microTVM Host-Driven AoT</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_autotune.html">Autotuning with microTVM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_ethosu.html">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</a></li>
+<li class="toctree-l3"><a class="reference internal" href="micro_mlperftiny.html">Creating Your MLPerfTiny Submission with microTVM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_pytorch.html">microTVM PyTorch Tutorial</a></li>
 <li class="toctree-l3 current"><a class="current reference internal" href="#">microTVM Reference Virtual Machines</a><ul>
 <li class="toctree-l4"><a class="reference internal" href="#how-it-works">How it works</a></li>
diff --git a/docs/how_to/work_with_microtvm/micro_tflite.html b/docs/how_to/work_with_microtvm/micro_tflite.html
index 195c704ce2..80fc2152d0 100644
--- a/docs/how_to/work_with_microtvm/micro_tflite.html
+++ b/docs/how_to/work_with_microtvm/micro_tflite.html
@@ -246,6 +246,7 @@
 <li class="toctree-l3"><a class="reference internal" href="micro_aot.html">microTVM Host-Driven AoT</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_autotune.html">Autotuning with microTVM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_ethosu.html">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</a></li>
+<li class="toctree-l3"><a class="reference internal" href="micro_mlperftiny.html">Creating Your MLPerfTiny Submission with microTVM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_pytorch.html">microTVM PyTorch Tutorial</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_reference_vm.html">microTVM Reference Virtual Machines</a></li>
 <li class="toctree-l3 current"><a class="current reference internal" href="#">microTVM with TFLite Models</a><ul>
diff --git a/docs/how_to/work_with_microtvm/micro_train.html b/docs/how_to/work_with_microtvm/micro_train.html
index c55bfe99c8..1f9459f4d6 100644
--- a/docs/how_to/work_with_microtvm/micro_train.html
+++ b/docs/how_to/work_with_microtvm/micro_train.html
@@ -246,6 +246,7 @@
 <li class="toctree-l3"><a class="reference internal" href="micro_aot.html">microTVM Host-Driven AoT</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_autotune.html">Autotuning with microTVM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_ethosu.html">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</a></li>
+<li class="toctree-l3"><a class="reference internal" href="micro_mlperftiny.html">Creating Your MLPerfTiny Submission with microTVM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_pytorch.html">microTVM PyTorch Tutorial</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_reference_vm.html">microTVM Reference Virtual Machines</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_tflite.html">microTVM with TFLite Models</a></li>
@@ -523,7 +524,7 @@ take about <strong>2 minutes</strong> to download the Stanford Cars, while COCO
 <a href="https://docs.python.org/3/library/shutil.html#shutil.move" title="shutil.move" class="sphx-glr-backref-module-shutil sphx-glr-backref-type-py-function"><span class="n">shutil</span><span class="o">.</span><span class="n">move</span></a><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><a href="https://docs.python.org/3/library/stdtypes.html#str" title="builtins.str" class="sphx-glr-backref-module-builtins sphx-glr-backref-typ [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>&#39;/tmp/tmp9vmiouqx/images/random&#39;
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>&#39;/tmp/tmp9frj0wj1/images/random&#39;
 </pre></div>
 </div>
 </div>
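(Editorial sketch for the hunk above.) The cell shown works because shutil.move returns the destination path, which is exactly the quoted string echoed in the output. A minimal, runnable sketch of the same layout step, with a hypothetical temporary directory standing in for the tutorial's download location:

    import os
    import shutil
    import tempfile

    # Hypothetical stand-in for the tutorial's working directory.
    data_root = tempfile.mkdtemp()
    os.makedirs(os.path.join(data_root, "random"), exist_ok=True)
    os.makedirs(os.path.join(data_root, "images"), exist_ok=True)

    # shutil.move returns the destination path, which is the quoted
    # string seen in the cell output above.
    dest = shutil.move(os.path.join(data_root, "random"),
                       os.path.join(data_root, "images", "random"))
    print(dest)  # e.g. '/tmp/tmpXXXXXXXX/images/random'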
@@ -583,8 +584,8 @@ objects to other stuff? We can display some examples from our datasets using <co
     <span class="n">plt</span><span class="o">.</span><span class="n">axis</span><span class="p">(</span><span class="s2">&quot;off&quot;</span><span class="p">)</span>
 </pre></div>
 </div>
-<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmp9vmiouqx/images/target contains 8144 images
-/tmp/tmp9vmiouqx/images/random contains 5000 images
+<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmp9frj0wj1/images/target contains 8144 images
+/tmp/tmp9frj0wj1/images/random contains 5000 images
 </pre></div>
 </div>
 </div>
@@ -696,13 +697,13 @@ the time on our validation set).</p>
 </pre></div>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Epoch 1/3
-328/328 - 47s - loss: 0.2139 - accuracy: 0.9263 - val_loss: 0.1829 - val_accuracy: 0.9305 - 47s/epoch - 143ms/step
+328/328 - 47s - loss: 0.2199 - accuracy: 0.9239 - val_loss: 0.1185 - val_accuracy: 0.9581 - 47s/epoch - 142ms/step
 Epoch 2/3
-328/328 - 43s - loss: 0.0909 - accuracy: 0.9648 - val_loss: 0.1100 - val_accuracy: 0.9668 - 43s/epoch - 132ms/step
+328/328 - 43s - loss: 0.1003 - accuracy: 0.9617 - val_loss: 0.1516 - val_accuracy: 0.9486 - 43s/epoch - 132ms/step
 Epoch 3/3
-328/328 - 43s - loss: 0.0661 - accuracy: 0.9757 - val_loss: 0.0951 - val_accuracy: 0.9645 - 43s/epoch - 131ms/step
+328/328 - 43s - loss: 0.0737 - accuracy: 0.9728 - val_loss: 0.1310 - val_accuracy: 0.9615 - 43s/epoch - 132ms/step
 
-&lt;keras.callbacks.History object at 0x7f6790261dd0&gt;
+&lt;keras.callbacks.History object at 0x7fe689e4a810&gt;
 </pre></div>
 </div>
 </div>
@@ -962,7 +963,7 @@ as intended.</p>
 <p>From here, we could modify the model to read live images from the camera - we have another
 Arduino tutorial showing how to do that <a class="reference external" href="https://github.com/guberti/tvm-arduino-demos/tree/master/examples/person_detection">on GitHub</a>. Alternatively, we could also
 <a class="reference external" href="https://tvm.apache.org/docs/how_to/work_with_microtvm/micro_autotune.html">use TVM’s autotuning capabilities</a> to dramatically improve the model’s performance.</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 4 minutes  21.567 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 4 minutes  29.197 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-train-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/b52cec46baf4f78d6bcd94cbe269c8a6/micro_train.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">micro_train.py</span></code></a></p>
diff --git a/docs/how_to/work_with_microtvm/micro_tvmc.html b/docs/how_to/work_with_microtvm/micro_tvmc.html
index 2cf0e06293..40fca76442 100644
--- a/docs/how_to/work_with_microtvm/micro_tvmc.html
+++ b/docs/how_to/work_with_microtvm/micro_tvmc.html
@@ -246,6 +246,7 @@
 <li class="toctree-l3"><a class="reference internal" href="micro_aot.html">microTVM Host-Driven AoT</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_autotune.html">Autotuning with microTVM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_ethosu.html">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</a></li>
+<li class="toctree-l3"><a class="reference internal" href="micro_mlperftiny.html">Creating Your MLPerfTiny Submission with microTVM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_pytorch.html">microTVM PyTorch Tutorial</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_reference_vm.html">microTVM Reference Virtual Machines</a></li>
 <li class="toctree-l3"><a class="reference internal" href="micro_tflite.html">microTVM with TFLite Models</a></li>
diff --git a/docs/how_to/work_with_microtvm/sg_execution_times.html b/docs/how_to/work_with_microtvm/sg_execution_times.html
index fe73471278..fad165773d 100644
--- a/docs/how_to/work_with_microtvm/sg_execution_times.html
+++ b/docs/how_to/work_with_microtvm/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-work-with-microtvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>06:34.382</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
+<p><strong>06:42.755</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 83%" />
@@ -349,23 +349,23 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="micro_train.html#sphx-glr-how-to-work-with-microtvm-micro-train-py"><span class="std std-ref">Training Vision Models for microTVM on Arduino</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_train.py</span></code>)</p></td>
-<td><p>04:21.567</p></td>
+<td><p>04:29.197</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="micro_pytorch.html#sphx-glr-how-to-work-with-microtvm-micro-pytorch-py"><span class="std std-ref">microTVM PyTorch Tutorial</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_pytorch.py</span></code>)</p></td>
-<td><p>01:08.520</p></td>
+<td><p>01:09.130</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="micro_autotune.html#sphx-glr-how-to-work-with-microtvm-micro-autotune-py"><span class="std std-ref">Autotuning with microTVM</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_autotune.py</span></code>)</p></td>
-<td><p>00:51.553</p></td>
+<td><p>00:51.744</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="micro_aot.html#sphx-glr-how-to-work-with-microtvm-micro-aot-py"><span class="std std-ref">microTVM Host-Driven AoT</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_aot.py</span></code>)</p></td>
-<td><p>00:08.921</p></td>
+<td><p>00:08.855</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="micro_tflite.html#sphx-glr-how-to-work-with-microtvm-micro-tflite-py"><span class="std std-ref">microTVM with TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_tflite.py</span></code>)</p></td>
-<td><p>00:03.821</p></td>
+<td><p>00:03.827</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="micro_tvmc.html#sphx-glr-how-to-work-with-microtvm-micro-tvmc-py"><span class="std std-ref">Executing a Tiny Model with TVMC Micro</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_tvmc.py</span></code>)</p></td>
@@ -380,6 +380,10 @@
 <td><p>00:00.000</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
+<tr class="row-odd"><td><p><a class="reference internal" href="micro_mlperftiny.html#sphx-glr-how-to-work-with-microtvm-micro-mlperftiny-py"><span class="std std-ref">Creating Your MLPerfTiny Submission with microTVM</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_mlperftiny.py</span></code>)</p></td>
+<td><p>00:00.000</p></td>
+<td><p>0.0 MB</p></td>
+</tr>
 </tbody>
 </table>
 </div>
diff --git a/docs/how_to/work_with_relay/sg_execution_times.html b/docs/how_to/work_with_relay/sg_execution_times.html
index 036a08090c..c0c1660efb 100644
--- a/docs/how_to/work_with_relay/sg_execution_times.html
+++ b/docs/how_to/work_with_relay/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-work-with-relay-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:44.389</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
+<p><strong>00:45.175</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 84%" />
@@ -349,15 +349,15 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="using_pipeline_executor.html#sphx-glr-how-to-work-with-relay-using-pipeline-executor-py"><span class="std std-ref">Using Pipeline Executor in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_pipeline_executor.py</span></code>)</p></td>
-<td><p>00:32.546</p></td>
+<td><p>00:32.926</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="using_external_lib.html#sphx-glr-how-to-work-with-relay-using-external-lib-py"><span class="std std-ref">Using External Libraries in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_external_lib.py</span></code>)</p></td>
-<td><p>00:10.301</p></td>
+<td><p>00:10.433</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="build_gcn.html#sphx-glr-how-to-work-with-relay-build-gcn-py"><span class="std std-ref">Building a Graph Convolutional Network</span></a> (<code class="docutils literal notranslate"><span class="pre">build_gcn.py</span></code>)</p></td>
-<td><p>00:01.536</p></td>
+<td><p>00:01.810</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="using_relay_viz.html#sphx-glr-how-to-work-with-relay-using-relay-viz-py"><span class="std std-ref">Use Relay Visualizer to Visualize Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_relay_viz.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_schedules/intrin_math.html b/docs/how_to/work_with_schedules/intrin_math.html
index 9941165884..a6be60d150 100644
--- a/docs/how_to/work_with_schedules/intrin_math.html
+++ b/docs/how_to/work_with_schedules/intrin_math.html
@@ -535,7 +535,7 @@ The following example customizes CUDA lowering rule for <code class="code docuti
 <a href="../../reference/api/python/ir.html#tvm.ir.register_intrin_lowering" title="tvm.ir.register_intrin_lowering" class="sphx-glr-backref-module-tvm-ir sphx-glr-backref-type-py-function"><span class="n">register_intrin_lowering</span></a><span class="p">(</span><span class="s2">&quot;tir.exp&quot;</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">&quot;cuda&quot;</span><span class="p">,</span> <span class="n">f</span><span class="o">= [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>&lt;function my_cuda_math_rule at 0x7f65bd33a9e0&gt;
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>&lt;function my_cuda_math_rule at 0x7fe68a33d290&gt;
 </pre></div>
 </div>
 <p>Register the rule to TVM with the override option to override the existing rule.
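(Editorial sketch for the hunk above.) The diff elides most of the lowering function itself, so a self-contained sketch is given below, modeled on TVM's stock CUDA lowering rules; the float64 branch and the level=99 priority are assumptions, not necessarily the tutorial's exact code:

    import tvm
    from tvm.ir import register_intrin_lowering

    def my_cuda_math_rule(op):
        """Customized CUDA lowering rule for tir.* math intrinsics."""
        assert isinstance(op, tvm.tir.Call)
        name = op.op.name            # e.g. "tir.exp"
        dispatch_name = name[4:]     # strip the "tir." prefix -> "exp"
        if op.dtype == "float32":
            # Dispatch to the float32 CUDA math function ("expf").
            return tvm.tir.call_pure_extern("float32", "%sf" % dispatch_name, op.args[0])
        elif op.dtype == "float64":
            # Dispatch to the double-precision variant.
            return tvm.tir.call_pure_extern("float64", dispatch_name, op.args[0])
        else:
            # No translation available; keep the call unchanged.
            return op

    # A higher level is assumed to outrank the default rule, so this
    # registration replaces the built-in lowering for tir.exp on CUDA.
    register_intrin_lowering("tir.exp", target="cuda", f=my_cuda_math_rule, level=99)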
diff --git a/docs/how_to/work_with_schedules/sg_execution_times.html b/docs/how_to/work_with_schedules/sg_execution_times.html
index 375d333163..4828b5a431 100644
--- a/docs/how_to/work_with_schedules/sg_execution_times.html
+++ b/docs/how_to/work_with_schedules/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-work-with-schedules-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:07.835</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
+<p><strong>00:07.592</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 83%" />
@@ -349,23 +349,23 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="intrin_math.html#sphx-glr-how-to-work-with-schedules-intrin-math-py"><span class="std std-ref">Intrinsics and Math Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">intrin_math.py</span></code>)</p></td>
-<td><p>00:05.271</p></td>
+<td><p>00:05.077</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tensorize.html#sphx-glr-how-to-work-with-schedules-tensorize-py"><span class="std std-ref">Use Tensorize to Leverage Hardware Intrinsics</span></a> (<code class="docutils literal notranslate"><span class="pre">tensorize.py</span></code>)</p></td>
-<td><p>00:01.193</p></td>
+<td><p>00:01.155</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="reduction.html#sphx-glr-how-to-work-with-schedules-reduction-py"><span class="std std-ref">Reduction</span></a> (<code class="docutils literal notranslate"><span class="pre">reduction.py</span></code>)</p></td>
-<td><p>00:00.585</p></td>
+<td><p>00:00.577</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="scan.html#sphx-glr-how-to-work-with-schedules-scan-py"><span class="std std-ref">Scan and Recurrent Kernel</span></a> (<code class="docutils literal notranslate"><span class="pre">scan.py</span></code>)</p></td>
-<td><p>00:00.566</p></td>
+<td><p>00:00.562</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="extern_op.html#sphx-glr-how-to-work-with-schedules-extern-op-py"><span class="std std-ref">External Tensor Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">extern_op.py</span></code>)</p></td>
-<td><p>00:00.115</p></td>
+<td><p>00:00.116</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="schedule_primitives.html#sphx-glr-how-to-work-with-schedules-schedule-primitives-py"><span class="std std-ref">Schedule Primitives in TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">schedule_primitives.py</span></code>)</p></td>
@@ -373,7 +373,7 @@
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="tedd.html#sphx-glr-how-to-work-with-schedules-tedd-py"><span class="std std-ref">Use Tensor Expression Debug Display (TEDD) for Visualization</span></a> (<code class="docutils literal notranslate"><span class="pre">tedd.py</span></code>)</p></td>
-<td><p>00:00.031</p></td>
+<td><p>00:00.032</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tuple_inputs.html#sphx-glr-how-to-work-with-schedules-tuple-inputs-py"><span class="std std-ref">Compute and Reduce with Tuple Inputs</span></a> (<code class="docutils literal notranslate"><span class="pre">tuple_inputs.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_schedules/tensorize.html b/docs/how_to/work_with_schedules/tensorize.html
index 9ad14914cd..cc92110685 100644
--- a/docs/how_to/work_with_schedules/tensorize.html
+++ b/docs/how_to/work_with_schedules/tensorize.html
@@ -586,7 +586,7 @@ class Module:
         B_1 = T.match_buffer(B, (512, 64))
         C_1 = T.match_buffer(C, (1024, 512))
         i = T.var(&quot;int32&quot;)
-        T.attr(T.iter_var(i, None, &quot;DataPar&quot;, &quot;&quot;), &quot;pragma_import_llvm&quot;, &quot;; ModuleID = &#39;/tmp/tmpuyepqa17/input0.cc&#39;\nsource_filename = \&quot;/tmp/tmpuyepqa17/input0.cc\&quot;\ntarget datalayout = \&quot;e-m:e-i64:64-f80:128-n8:16:32:64-S128\&quot;\ntarget triple = \&quot;x86_64-pc-linux-gnu\&quot;\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = alloca  [...]
+        T.attr(T.iter_var(i, None, &quot;DataPar&quot;, &quot;&quot;), &quot;pragma_import_llvm&quot;, &quot;; ModuleID = &#39;/tmp/tmpgb2rg8u1/input0.cc&#39;\nsource_filename = \&quot;/tmp/tmpgb2rg8u1/input0.cc\&quot;\ntarget datalayout = \&quot;e-m:e-i64:64-f80:128-n8:16:32:64-S128\&quot;\ntarget triple = \&quot;x86_64-pc-linux-gnu\&quot;\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = alloca  [...]
         for i, j_outer in T.grid(1024, 32):
             T.call_extern(&quot;int32&quot;, &quot;gemv_update&quot;, T.tvm_access_ptr(T.type_annotation(&quot;float32&quot;), C_1.data, i * 512 + j_outer * 16, 16, 2), T.tvm_access_ptr(T.type_annotation(&quot;float32&quot;), A_1.data, i * 64, 64, 1), T.tvm_access_ptr(T.type_annotation(&quot;float32&quot;), B_1.data, j_outer * 1024, 1024, 1), 16, 64, 64)
 </pre></div>
diff --git a/docs/install/nnpack.html b/docs/install/nnpack.html
index 23d2181e9d..1ef28de467 100644
--- a/docs/install/nnpack.html
+++ b/docs/install/nnpack.html
@@ -229,17 +229,7 @@
               <p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
 <ul class="current">
 <li class="toctree-l1 current"><a class="reference internal" href="index.html">Installing TVM</a><ul class="current">
-<li class="toctree-l2 current"><a class="reference internal" href="from_source.html">Install from Source</a><ul class="current">
-<li class="toctree-l3"><a class="reference internal" href="from_source.html#developers-get-source-from-github">Developers: Get Source from Github</a></li>
-<li class="toctree-l3"><a class="reference internal" href="from_source.html#build-the-shared-library">Build the Shared Library</a></li>
-<li class="toctree-l3"><a class="reference internal" href="from_source.html#python-package-installation">Python Package Installation</a></li>
-<li class="toctree-l3 current"><a class="reference internal" href="from_source.html#install-contrib-libraries">Install Contrib Libraries</a><ul class="current">
-<li class="toctree-l4 current"><a class="current reference internal" href="#">NNPACK Contrib Installation</a></li>
-</ul>
-</li>
-<li class="toctree-l3"><a class="reference internal" href="from_source.html#enable-c-tests">Enable C++ Tests</a></li>
-</ul>
-</li>
+<li class="toctree-l2"><a class="reference internal" href="from_source.html">Install from Source</a></li>
 <li class="toctree-l2"><a class="reference internal" href="docker.html">Docker Images</a></li>
 <li class="toctree-l2 current"><a class="current reference internal" href="#">NNPACK Contrib Installation</a><ul>
 <li class="toctree-l3"><a class="reference internal" href="#conditions">Conditions</a></li>
diff --git a/docs/objects.inv b/docs/objects.inv
index 1a9184f6c2..acfb066919 100644
Binary files a/docs/objects.inv and b/docs/objects.inv differ
diff --git a/docs/reference/api/python/auto_scheduler.html b/docs/reference/api/python/auto_scheduler.html
index 9589f83df6..ea853c881c 100644
--- a/docs/reference/api/python/auto_scheduler.html
+++ b/docs/reference/api/python/auto_scheduler.html
@@ -1615,7 +1615,7 @@ history states as starting point to perform Evolutionary Search).</p></li>
 
 <dl class="py class">
 <dt class="sig sig-object py" id="tvm.auto_scheduler.SketchPolicy">
-<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">tvm.auto_scheduler.</span></span><span class="sig-name descname"><span class="pre">SketchPolicy</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">task</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">program_cost_model</span></span><span class="o"><span class="pre">=</span></span><span class="defau [...]
+<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">tvm.auto_scheduler.</span></span><span class="sig-name descname"><span class="pre">SketchPolicy</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">task</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">program_cost_model</span></span><span class="o"><span class="pre">=</span></span><span class="defau [...]
 <dd><p>The search policy that searches in a hierarchical search space defined by sketches.
 The policy randomly samples programs from the space defined by sketches and uses evolutionary
 search to fine-tune them.</p>
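(Editorial sketch for the class documented above.) A minimal sketch of wiring SketchPolicy into a tuning run; the matmul workload, the XGBModel cost model, and the trial count are illustrative assumptions, not part of the documented signature:

    import tvm
    from tvm import auto_scheduler, te

    @auto_scheduler.register_workload
    def matmul(N, M, K):
        A = te.placeholder((N, K), name="A")
        B = te.placeholder((K, M), name="B")
        k = te.reduce_axis((0, K), name="k")
        C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
        return [A, B, C]

    task = auto_scheduler.SearchTask(func=matmul, args=(128, 128, 128),
                                     target=tvm.target.Target("llvm"))
    # SketchPolicy samples candidate programs from the sketch-defined
    # space and refines them with evolutionary search, guided by the
    # given cost model.
    policy = auto_scheduler.SketchPolicy(task,
                                         program_cost_model=auto_scheduler.XGBModel())
    task.tune(auto_scheduler.TuningOptions(num_measure_trials=8),
              search_policy=policy)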
@@ -1899,7 +1899,7 @@ Candidates:
 
 <dl class="py function">
 <dt class="sig sig-object py" id="tvm.auto_scheduler.auto_schedule">
-<span class="sig-prename descclassname"><span class="pre">tvm.auto_scheduler.</span></span><span class="sig-name descname"><span class="pre">auto_schedule</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">task</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">search_policy</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em clas [...]
+<span class="sig-prename descclassname"><span class="pre">tvm.auto_scheduler.</span></span><span class="sig-name descname"><span class="pre">auto_schedule</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">task</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">search_policy</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em clas [...]
 <dd><p>THIS API IS DEPRECATED.</p>
 <p>Run auto scheduling search for a task.</p>
 <dl class="field-list simple">
diff --git a/docs/reference/api/typedoc/classes/bytestreamreader.html b/docs/reference/api/typedoc/classes/bytestreamreader.html
index ea0713005d..280a098e18 100644
--- a/docs/reference/api/typedoc/classes/bytestreamreader.html
+++ b/docs/reference/api/typedoc/classes/bytestreamreader.html
@@ -119,7 +119,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/c2eee01e6/web/src/rpc_server.ts#L43">rpc_server.ts:43</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/cfa65b26c/web/src/rpc_server.ts#L43">rpc_server.ts:43</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-parameters-title">Parameters</h4>
@@ -141,7 +141,7 @@
 					<div class="tsd-signature tsd-kind-icon">bytes<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Uint8Array</span></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/c2eee01e6/web/src/rpc_server.ts#L43">rpc_server.ts:43</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/cfa65b26c/web/src/rpc_server.ts#L43">rpc_server.ts:43</a></li>
 						</ul>
 					</aside>
 				</section>
@@ -151,7 +151,7 @@
 					<div class="tsd-signature tsd-kind-icon">offset<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">number</span><span class="tsd-signature-symbol"> = 0</span></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/c2eee01e6/web/src/rpc_server.ts#L42">rpc_server.ts:42</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/cfa65b26c/web/src/rpc_server.ts#L42">rpc_server.ts:42</a></li>
 						</ul>
 					</aside>
 				</section>
@@ -168,7 +168,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/c2eee01e6/web/src/rpc_server.ts#L63">rpc_server.ts:63</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/cfa65b26c/web/src/rpc_server.ts#L63">rpc_server.ts:63</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">Uint8Array</span></h4>
@@ -185,7 +185,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/c2eee01e6/web/src/rpc_server.ts#L49">rpc_server.ts:49</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/cfa65b26c/web/src/rpc_server.ts#L49">rpc_server.ts:49</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">number</span></h4>
@@ -202,7 +202,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/c2eee01e6/web/src/rpc_server.ts#L57">rpc_server.ts:57</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/cfa65b26c/web/src/rpc_server.ts#L57">rpc_server.ts:57</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">number</span></h4>
diff --git a/docs/reference/api/typedoc/classes/cachedcallstack.html b/docs/reference/api/typedoc/classes/cachedcallstack.html
index c97ac2a90d..ca7755865b 100644
--- a/docs/reference/api/typedoc/classes/cachedcallstack.html
+++ b/docs/reference/api/typedoc/classes/cachedcallstack.html
@@ -144,7 +144,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/c2eee01e6/web/src/memory.ts#L223">memory.ts:223</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/cfa65b26c/web/src/memory.ts#L223">memory.ts:223</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-parameters-title">Parameters</h4>
@@ -172,7 +172,7 @@
 					<div class="tsd-signature tsd-kind-icon">temp<wbr>Args<span class="tsd-signature-symbol">:</span> <span class="tsd-signature-type">Array</span><span class="tsd-signature-symbol">&lt;</span><a href="../interfaces/disposable.html" class="tsd-signature-type">Disposable</a><span class="tsd-signature-symbol">&gt;</span><span class="tsd-signature-symbol"> = []</span></div>
 					<aside class="tsd-sources">
 						<ul>
-							<li>Defined in <a href="https://github.com/apache/tvm/blob/c2eee01e6/web/src/memory.ts#L208">memory.ts:208</a></li>
+							<li>Defined in <a href="https://github.com/apache/tvm/blob/cfa65b26c/web/src/memory.ts#L208">memory.ts:208</a></li>
 						</ul>
 					</aside>
 					<div class="tsd-comment tsd-typography">
@@ -194,7 +194,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/c2eee01e6/web/src/memory.ts#L312">memory.ts:312</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/cfa65b26c/web/src/memory.ts#L312">memory.ts:312</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -226,7 +226,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/c2eee01e6/web/src/memory.ts#L284">memory.ts:284</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/cfa65b26c/web/src/memory.ts#L284">memory.ts:284</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -262,7 +262,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/c2eee01e6/web/src/memory.ts#L388">memory.ts:388</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/cfa65b26c/web/src/memory.ts#L388">memory.ts:388</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -300,7 +300,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/c2eee01e6/web/src/memory.ts#L376">memory.ts:376</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/cfa65b26c/web/src/memory.ts#L376">memory.ts:376</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -340,7 +340,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/c2eee01e6/web/src/memory.ts#L267">memory.ts:267</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/cfa65b26c/web/src/memory.ts#L267">memory.ts:267</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -373,7 +373,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/c2eee01e6/web/src/memory.ts#L243">memory.ts:243</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/cfa65b26c/web/src/memory.ts#L243">memory.ts:243</a></li>
 								</ul>
 							</aside>
 							<h4 class="tsd-returns-title">Returns <span class="tsd-signature-type">void</span></h4>
@@ -390,7 +390,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/c2eee01e6/web/src/memory.ts#L321">memory.ts:321</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/cfa65b26c/web/src/memory.ts#L321">memory.ts:321</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -422,7 +422,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/c2eee01e6/web/src/memory.ts#L252">memory.ts:252</a></li>
+									<li>Defined in <a href="https://github.com/apache/tvm/blob/cfa65b26c/web/src/memory.ts#L252">memory.ts:252</a></li>
 								</ul>
 							</aside>
 							<div class="tsd-comment tsd-typography">
@@ -444,7 +444,7 @@
 						<li class="tsd-description">
 							<aside class="tsd-sources">
 								<ul>
-									<li>Defined in <a href="https://github.com/apache/tvm/blob/c2eee01e6/web/src/memory.ts#L359">memory.ts:359</a></li>
... 2784 lines suppressed ...