You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by tq...@apache.org on 2023/01/28 00:13:49 UTC

[tvm-site] branch asf-site updated: deploying docs (apache/tvm@c2cc01910c1b88ad2b593a138c8b984385d47db8)

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/tvm-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new 326f5b9a23 deploying docs (apache/tvm@c2cc01910c1b88ad2b593a138c8b984385d47db8)
326f5b9a23 is described below

commit 326f5b9a23c303dea9c5a1127734da0a49d299ea
Author: tvm-bot <95...@users.noreply.github.com>
AuthorDate: Sat Jan 28 00:13:43 2023 +0000

    deploying docs (apache/tvm@c2cc01910c1b88ad2b593a138c8b984385d47db8)
---
 .../micro_pytorch.ipynb                            |    8 +-
 .../micro_pytorch.py                               |   18 +-
 .../micro_mlperftiny.ipynb                         |    4 +-
 .../micro_tflite.py                                |   72 +-
 .../micro_mlperftiny.py                            |    7 +-
 .../micro_ethosu.ipynb                             |    2 +-
 .../micro_tflite.ipynb                             |   48 +-
 .../micro_tvmc.ipynb                               |   52 +-
 .../micro_reference_vm.py                          |  159 -
 .../micro_reference_vm.ipynb                       |   43 -
 .../micro_autotune.py                              |   13 +-
 .../micro_train.ipynb                              |    4 +-
 .../micro_ethosu.py                                |    6 +-
 .../micro_train.py                                 |    9 +-
 .../micro_aot.ipynb                                |    8 +-
 .../eb483c672b88006c331115968e0ffd9b/micro_tvmc.py |   43 +-
 .../micro_autotune.ipynb                           |    6 +-
 .../f8a7209a0e66b246185bfc41bbc82f54/micro_aot.py  |   17 +-
 docs/_images/sphx_glr_micro_reference_vm_thumb.png |  Bin 26794 -> 0 bytes
 docs/_images/sphx_glr_micro_train_001.png          |  Bin 298784 -> 348666 bytes
 docs/_images/sphx_glr_micro_train_thumb.png        |  Bin 22856 -> 24386 bytes
 .../how_to/compile_models/from_darknet.rst.txt     |    2 +-
 .../how_to/compile_models/from_keras.rst.txt       |    2 +-
 .../how_to/compile_models/from_mxnet.rst.txt       |    2 +-
 .../how_to/compile_models/from_oneflow.rst.txt     |    2 +-
 .../how_to/compile_models/from_pytorch.rst.txt     |    2 +-
 .../how_to/compile_models/from_tensorflow.rst.txt  |    2 +-
 .../compile_models/sg_execution_times.rst.txt      |   22 +-
 .../deploy_models/deploy_model_on_adreno.rst.txt   |    4 +-
 .../deploy_models/deploy_model_on_android.rst.txt  |    2 +-
 .../deploy_object_detection_pytorch.rst.txt        |    4 +-
 .../deploy_models/deploy_prequantized.rst.txt      |    6 +-
 .../deploy_prequantized_tflite.rst.txt             |    4 +-
 .../how_to/deploy_models/deploy_quantized.rst.txt  |    2 +-
 .../deploy_models/deploy_ssd_gluoncv.rst.txt       |    4 +-
 .../deploy_models/sg_execution_times.rst.txt       |   20 +-
 .../extend_tvm/bring_your_own_datatypes.rst.txt    |    2 +-
 .../how_to/extend_tvm/sg_execution_times.rst.txt   |   10 +-
 .../how_to/extend_tvm/use_pass_instrument.rst.txt  |   16 +-
 .../optimize_operators/opt_conv_cuda.rst.txt       |    2 +-
 .../optimize_operators/opt_conv_tensorcore.rst.txt |    2 +-
 .../how_to/optimize_operators/opt_gemm.rst.txt     |   16 +-
 .../optimize_operators/sg_execution_times.rst.txt  |    6 +-
 .../sg_execution_times.rst.txt                     |   14 +-
 .../tune_conv2d_layer_cuda.rst.txt                 | 3660 ++++++++++++--------
 .../tune_network_cuda.rst.txt                      |    4 +-
 .../tune_network_x86.rst.txt                       |    4 +-
 .../tune_sparse_x86.rst.txt                        |   88 +-
 .../tune_with_autotvm/sg_execution_times.rst.txt   |   12 +-
 .../tune_with_autotvm/tune_conv2d_cuda.rst.txt     |  569 ++-
 .../how_to/work_with_microtvm/index.rst.txt        |  100 +-
 .../how_to/work_with_microtvm/micro_aot.rst.txt    |   39 +-
 .../work_with_microtvm/micro_autotune.rst.txt      |   59 +-
 .../how_to/work_with_microtvm/micro_ethosu.rst.txt |   44 +-
 .../work_with_microtvm/micro_mlperftiny.rst.txt    |   31 +-
 .../work_with_microtvm/micro_pytorch.rst.txt       |   48 +-
 .../work_with_microtvm/micro_reference_vm.rst.txt  |  185 -
 .../how_to/work_with_microtvm/micro_tflite.rst.txt |  171 +-
 .../how_to/work_with_microtvm/micro_train.rst.txt  |   41 +-
 .../how_to/work_with_microtvm/micro_tvmc.rst.txt   |   45 +-
 .../work_with_microtvm/sg_execution_times.rst.txt  |   38 +-
 .../work_with_relay/sg_execution_times.rst.txt     |    8 +-
 .../how_to/work_with_schedules/intrin_math.rst.txt |    2 +-
 .../work_with_schedules/sg_execution_times.rst.txt |   18 +-
 .../how_to/work_with_schedules/tensorize.rst.txt   |    2 +-
 docs/_sources/topic/microtvm/index.rst.txt         |   11 +-
 .../tutorials/autotvm/sg_execution_times.rst.txt   |    6 +-
 .../frontend/deploy_classification.rst.txt         |    2 +-
 .../tutorials/frontend/deploy_detection.rst.txt    |    2 +-
 .../tutorials/frontend/sg_execution_times.rst.txt  |    6 +-
 .../tutorials/optimize/sg_execution_times.rst.txt  |    6 +-
 .../topic/vta/tutorials/sg_execution_times.rst.txt |    6 +-
 .../tutorial/auto_scheduler_matmul_x86.rst.txt     |   13 +-
 docs/_sources/tutorial/autotvm_matmul_x86.rst.txt  |   20 +-
 docs/_sources/tutorial/autotvm_relay_x86.rst.txt   |   53 +-
 .../tutorial/cross_compilation_and_rpc.rst.txt     |    2 +-
 docs/_sources/tutorial/intro_topi.rst.txt          |    2 +-
 docs/_sources/tutorial/sg_execution_times.rst.txt  |   18 +-
 .../tutorial/tensor_expr_get_started.rst.txt       |   46 +-
 docs/commit_hash                                   |    2 +-
 docs/how_to/compile_models/from_darknet.html       |    2 +-
 docs/how_to/compile_models/from_keras.html         |    2 +-
 docs/how_to/compile_models/from_mxnet.html         |    2 +-
 docs/how_to/compile_models/from_oneflow.html       |   14 +-
 docs/how_to/compile_models/from_pytorch.html       |    9 +-
 docs/how_to/compile_models/from_tensorflow.html    |    2 +-
 docs/how_to/compile_models/sg_execution_times.html |   22 +-
 .../deploy_models/deploy_model_on_adreno.html      |    4 +-
 .../deploy_models/deploy_model_on_android.html     |    2 +-
 .../deploy_object_detection_pytorch.html           |   42 +-
 docs/how_to/deploy_models/deploy_prequantized.html |    9 +-
 .../deploy_models/deploy_prequantized_tflite.html  |    4 +-
 docs/how_to/deploy_models/deploy_quantized.html    |    2 +-
 docs/how_to/deploy_models/deploy_ssd_gluoncv.html  |   38 +-
 docs/how_to/deploy_models/sg_execution_times.html  |   24 +-
 .../extend_tvm/bring_your_own_datatypes.html       |    2 +-
 docs/how_to/extend_tvm/index.html                  |    4 +-
 docs/how_to/extend_tvm/sg_execution_times.html     |   10 +-
 docs/how_to/extend_tvm/use_pass_instrument.html    |   16 +-
 docs/how_to/optimize_operators/opt_conv_cuda.html  |    2 +-
 .../optimize_operators/opt_conv_tensorcore.html    |    2 +-
 docs/how_to/optimize_operators/opt_gemm.html       |   16 +-
 .../optimize_operators/sg_execution_times.html     |    6 +-
 .../sg_execution_times.html                        |   14 +-
 .../tune_conv2d_layer_cuda.html                    | 3660 ++++++++++++--------
 .../tune_with_autoscheduler/tune_network_cuda.html |    4 +-
 .../tune_with_autoscheduler/tune_network_x86.html  |    4 +-
 .../tune_with_autoscheduler/tune_sparse_x86.html   |   88 +-
 .../tune_with_autotvm/sg_execution_times.html      |   16 +-
 .../how_to/tune_with_autotvm/tune_conv2d_cuda.html |  569 ++-
 docs/how_to/work_with_microtvm/index.html          |   72 +-
 docs/how_to/work_with_microtvm/micro_aot.html      |   44 +-
 docs/how_to/work_with_microtvm/micro_autotune.html |   58 +-
 docs/how_to/work_with_microtvm/micro_ethosu.html   |   31 +-
 .../work_with_microtvm/micro_mlperftiny.html       |   32 +-
 docs/how_to/work_with_microtvm/micro_pytorch.html  |   49 +-
 .../work_with_microtvm/micro_reference_vm.html     |  578 ----
 docs/how_to/work_with_microtvm/micro_tflite.html   |  186 +-
 docs/how_to/work_with_microtvm/micro_train.html    |   50 +-
 docs/how_to/work_with_microtvm/micro_tvmc.html     |  132 +-
 .../work_with_microtvm/sg_execution_times.html     |   34 +-
 .../how_to/work_with_relay/sg_execution_times.html |    8 +-
 docs/how_to/work_with_schedules/intrin_math.html   |    2 +-
 .../work_with_schedules/sg_execution_times.html    |   18 +-
 docs/how_to/work_with_schedules/tensorize.html     |    2 +-
 docs/objects.inv                                   |  Bin 24316 -> 24231 bytes
 docs/reference/api/python/auto_scheduler.html      |    4 +-
 .../api/typedoc/classes/bytestreamreader.html      |   12 +-
 .../api/typedoc/classes/cachedcallstack.html       |   34 +-
 docs/reference/api/typedoc/classes/dldatatype.html |   12 +-
 docs/reference/api/typedoc/classes/dldevice.html   |   10 +-
 .../reference/api/typedoc/classes/environment.html |   12 +-
 docs/reference/api/typedoc/classes/ffilibrary.html |   20 +-
 .../api/typedoc/classes/graphexecutor.html         |   16 +-
 docs/reference/api/typedoc/classes/instance.html   |   40 +-
 docs/reference/api/typedoc/classes/memory.html     |   34 +-
 docs/reference/api/typedoc/classes/module.html     |   10 +-
 docs/reference/api/typedoc/classes/ndarray.html    |   22 +-
 .../api/typedoc/classes/packedfunccell.html        |    6 +-
 docs/reference/api/typedoc/classes/rpcserver.html  |   14 +-
 docs/reference/api/typedoc/classes/scalar.html     |    6 +-
 .../api/typedoc/classes/webgpucontext.html         |   12 +-
 docs/reference/api/typedoc/enums/argtypecode.html  |   30 +-
 .../api/typedoc/enums/aynccallbackcode.html        |    4 +-
 .../api/typedoc/enums/dldatatypecode.html          |    8 +-
 .../api/typedoc/enums/rpcserverstate.html          |   12 +-
 docs/reference/api/typedoc/enums/sizeof.html       |   18 +-
 docs/reference/api/typedoc/index.html              |  112 +-
 .../api/typedoc/interfaces/disposable.html         |    2 +-
 .../api/typedoc/interfaces/functioninfo.html       |    6 +-
 .../api/typedoc/interfaces/libraryprovider.html    |    4 +-
 docs/searchindex.js                                |    2 +-
 docs/topic/microtvm/index.html                     |   11 +-
 .../vta/tutorials/autotvm/sg_execution_times.html  |    6 +-
 .../tutorials/frontend/deploy_classification.html  |    2 +-
 .../vta/tutorials/frontend/deploy_detection.html   |    2 +-
 .../vta/tutorials/frontend/sg_execution_times.html |   10 +-
 .../vta/tutorials/optimize/sg_execution_times.html |    6 +-
 docs/topic/vta/tutorials/sg_execution_times.html   |    6 +-
 docs/tutorial/auto_scheduler_matmul_x86.html       |    8 +-
 docs/tutorial/autotvm_matmul_x86.html              |   20 +-
 docs/tutorial/autotvm_relay_x86.html               |  274 +-
 docs/tutorial/cross_compilation_and_rpc.html       |    2 +-
 docs/tutorial/intro_topi.html                      |    2 +-
 docs/tutorial/sg_execution_times.html              |   22 +-
 docs/tutorial/tensor_expr_get_started.html         |   46 +-
 166 files changed, 7181 insertions(+), 5615 deletions(-)

diff --git a/docs/_downloads/09df7d9b9c90a2a1bdd570520693fd9f/micro_pytorch.ipynb b/docs/_downloads/09df7d9b9c90a2a1bdd570520693fd9f/micro_pytorch.ipynb
index 5a3642bdf3..e49c84bd1c 100644
--- a/docs/_downloads/09df7d9b9c90a2a1bdd570520693fd9f/micro_pytorch.ipynb
+++ b/docs/_downloads/09df7d9b9c90a2a1bdd570520693fd9f/micro_pytorch.ipynb
@@ -15,7 +15,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n\n# microTVM PyTorch Tutorial\n**Authors**:\n[Mehrdad Hessar](https://github.com/mehrdadh)\n\nThis tutorial is showcasing microTVM host-driven AoT compilation with\na PyTorch model. This tutorial can be executed on a x86 CPU using C runtime (CRT).\n\n**Note:** This tutorial only runs on x86 CPU using CRT and does not run on Zephyr\nsince the model would not fit on our current supported Zephyr boards.\n"
+        "\n\n# 4. microTVM PyTorch Tutorial\n**Authors**:\n[Mehrdad Hessar](https://github.com/mehrdadh)\n\nThis tutorial is showcasing microTVM host-driven AoT compilation with\na PyTorch model. This tutorial can be executed on a x86 CPU using C runtime (CRT).\n\n**Note:** This tutorial only runs on x86 CPU using CRT and does not run on Zephyr\nsince the model would not fit on our current supported Zephyr boards.\n"
       ]
     },
     {
@@ -44,7 +44,7 @@
       },
       "outputs": [],
       "source": [
-        "import pathlib\nimport torch\nimport torchvision\nfrom torchvision import transforms\nimport numpy as np\nfrom PIL import Image\n\nimport tvm\nfrom tvm import relay\nfrom tvm.contrib.download import download_testdata\nfrom tvm.relay.backend import Executor"
+        "import pathlib\nimport torch\nimport torchvision\nfrom torchvision import transforms\nimport numpy as np\nfrom PIL import Image\n\nimport tvm\nfrom tvm import relay\nfrom tvm.contrib.download import download_testdata\nfrom tvm.relay.backend import Executor\nimport tvm.micro.testing"
       ]
     },
     {
@@ -69,7 +69,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Define Target, Runtime and Executor\n\nIn this tutorial we use AOT host-driven executor. To compile the model\nfor an emulated embedded environment on an x86 machine we use C runtime (CRT)\nand we use `host` micro target. Using this setup, TVM compiles the model\nfor C runtime which can run on a x86 CPU machine with the same flow that\nwould run on a physical microcontroller.\n\n\n"
+        "## Define Target, Runtime and Executor\n\nIn this tutorial we use AOT host-driven executor. To compile the model\nfor an emulated embedded environment on an x86 machine we use C runtime (CRT)\nand we use `host` micro target. Using this setup, TVM compiles the model\nfor C runtime which can run on a x86 CPU machine with the same flow that\nwould run on a physical microcontroller.\nCRT Uses the main() from `src/runtime/crt/host/main.cc`\nTo use physical hardware, replace `board` w [...]
       ]
     },
     {
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "# Simulate a microcontroller on the host machine. Uses the main() from `src/runtime/crt/host/main.cc`\n# To use physical hardware, replace \"host\" with another physical micro target, e.g. `nrf52840`\n# or `mps2_an521`. See more more target examples in micro_train.py and micro_tflite.py tutorials.\ntarget = tvm.target.target.micro(\"host\")\n\n# Use the C runtime (crt) and enable static linking by setting system-lib to True\nruntime = tvm.relay.backend.Runtime(\"crt\", {\"system [...]
+        "target = tvm.micro.testing.get_target(platform=\"crt\", board=None)\n\n# Use the C runtime (crt) and enable static linking by setting system-lib to True\nruntime = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": True})\n\n# Use the AOT executor rather than graph or vm executors. Don't use unpacked API or C calling style.\nexecutor = Executor(\"aot\")"
       ]
     },
     {
diff --git a/docs/_downloads/12b9ecc04c41abaa12022061771821d1/micro_pytorch.py b/docs/_downloads/12b9ecc04c41abaa12022061771821d1/micro_pytorch.py
index 370e4d7e80..a7f5f10280 100644
--- a/docs/_downloads/12b9ecc04c41abaa12022061771821d1/micro_pytorch.py
+++ b/docs/_downloads/12b9ecc04c41abaa12022061771821d1/micro_pytorch.py
@@ -15,10 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 """
-.. _tutorial-micro-Pytorch:
+.. _tutorial-micro-pytorch:
 
-microTVM PyTorch Tutorial
-===========================
+4. microTVM PyTorch Tutorial
+============================
 **Authors**:
 `Mehrdad Hessar <https://github.com/mehrdadh>`_
 
@@ -46,6 +46,7 @@ import tvm
 from tvm import relay
 from tvm.contrib.download import download_testdata
 from tvm.relay.backend import Executor
+import tvm.micro.testing
 
 ##################################
 # Load a pre-trained PyTorch model
@@ -91,13 +92,14 @@ relay_mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)
 # and we use `host` micro target. Using this setup, TVM compiles the model
 # for C runtime which can run on a x86 CPU machine with the same flow that
 # would run on a physical microcontroller.
+# CRT Uses the main() from `src/runtime/crt/host/main.cc`
+# To use physical hardware, replace `board` with another physical micro target, e.g. `nrf5340dk_nrf5340_cpuapp`
+# or `mps2_an521` and change the platform type to Zephyr.
+# See more target examples in :ref:`Training Vision Models for microTVM on Arduino <tutorial-micro-train-arduino>`
+# and :ref:`microTVM TFLite Tutorial<tutorial_micro_tflite>`.
 #
 
-
-# Simulate a microcontroller on the host machine. Uses the main() from `src/runtime/crt/host/main.cc`
-# To use physical hardware, replace "host" with another physical micro target, e.g. `nrf52840`
-# or `mps2_an521`. See more more target examples in micro_train.py and micro_tflite.py tutorials.
-target = tvm.target.target.micro("host")
+target = tvm.micro.testing.get_target(platform="crt", board=None)
 
 # Use the C runtime (crt) and enable static linking by setting system-lib to True
 runtime = tvm.relay.backend.Runtime("crt", {"system-lib": True})
diff --git a/docs/_downloads/1fc1683d67bee4f26703504a58d42578/micro_mlperftiny.ipynb b/docs/_downloads/1fc1683d67bee4f26703504a58d42578/micro_mlperftiny.ipynb
index e9dd7442c4..3bd38786c0 100644
--- a/docs/_downloads/1fc1683d67bee4f26703504a58d42578/micro_mlperftiny.ipynb
+++ b/docs/_downloads/1fc1683d67bee4f26703504a58d42578/micro_mlperftiny.ipynb
@@ -15,7 +15,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n\n# Creating Your MLPerfTiny Submission with microTVM\n**Authors**:\n[Mehrdad Hessar](https://github.com/mehrdadh)\n\nThis tutorial is showcasing building an MLPerfTiny submission using microTVM. This\ntutorial shows the steps to import a TFLite model from MLPerfTiny benchmark models,\ncompile it with TVM and generate a Zephyr project which can be flashed to a Zephyr\nsupported board to benchmark the model using EEMBC runner.\n"
+        "\n\n# 8. Creating Your MLPerfTiny Submission with microTVM\n**Authors**:\n[Mehrdad Hessar](https://github.com/mehrdadh)\n\nThis tutorial is showcasing building an MLPerfTiny submission using microTVM. This\ntutorial shows the steps to import a TFLite model from MLPerfTiny benchmark models,\ncompile it with TVM and generate a Zephyr project which can be flashed to a Zephyr\nsupported board to benchmark the model using EEMBC runner.\n"
       ]
     },
     {
@@ -105,7 +105,7 @@
       },
       "outputs": [],
       "source": [
-        "import tensorflow as tf\nimport numpy as np\n\nimport tvm\nfrom tvm import relay\nfrom tvm.relay.backend import Executor, Runtime\nfrom tvm.contrib.download import download_testdata\nfrom tvm.micro import export_model_library_format\nfrom tvm.micro.model_library_format import generate_c_interface_header\nfrom tvm.micro.testing.utils import (\n    create_header_file,\n    mlf_extract_workspace_size_bytes,\n)"
+        "import tensorflow as tf\nimport numpy as np\n\nimport tvm\nfrom tvm import relay\nfrom tvm.relay.backend import Executor, Runtime\nfrom tvm.contrib.download import download_testdata\nfrom tvm.micro import export_model_library_format\nfrom tvm.micro.model_library_format import generate_c_interface_header\nimport tvm.micro.testing\nfrom tvm.micro.testing.utils import (\n    create_header_file,\n    mlf_extract_workspace_size_bytes,\n)"
       ]
     },
     {
diff --git a/docs/_downloads/2fb9ae7bf124f72614a43137cf2919cb/micro_tflite.py b/docs/_downloads/2fb9ae7bf124f72614a43137cf2919cb/micro_tflite.py
index 0770d472c9..67b3e66e33 100644
--- a/docs/_downloads/2fb9ae7bf124f72614a43137cf2919cb/micro_tflite.py
+++ b/docs/_downloads/2fb9ae7bf124f72614a43137cf2919cb/micro_tflite.py
@@ -15,9 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
 """
-.. _microTVM-with-TFLite:
+.. _tutorial_micro_tflite:
 
-microTVM with TFLite Models
+2. microTVM TFLite Tutorial
 ===========================
 **Author**: `Tom Gall <https://github.com/tom-gall>`_
 
@@ -55,11 +55,16 @@ import tempfile
 import numpy as np
 
 import tvm
+import tvm.micro
+import tvm.micro.testing
 from tvm import relay
 import tvm.contrib.utils
+from tvm.micro import export_model_library_format
 from tvm.contrib.download import download_testdata
 
-model_url = "https://people.linaro.org/~tom.gall/sine_model.tflite"
+model_url = (
+    "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/model/sine_model.tflite"
+)
 model_file = "sine_model.tflite"
 model_path = download_testdata(model_url, model_file, module="data")
 
@@ -105,55 +110,44 @@ mod, params = relay.frontend.from_tflite(
 #
 # Now we create a build config for relay, turning off two options and then calling relay.build which
 # will result in a C source file for the selected TARGET. When running on a simulated target of the
-# same architecture as the host (where this Python script is executed) choose "host" below for the
+# same architecture as the host (where this Python script is executed) choose "crt" below for the
 # TARGET, the C Runtime as the RUNTIME and a proper board/VM to run it (Zephyr will create the right
 # QEMU VM based on BOARD. In the example below the x86 arch is selected and a x86 VM is picked up accordingly:
 #
 RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": True})
-TARGET = tvm.target.target.micro("host")
+TARGET = tvm.micro.testing.get_target("crt")
 
-#
-# Compiling for physical hardware
-#  When running on physical hardware, choose a TARGET and a BOARD that describe the hardware. The
-#  STM32F746 Nucleo target and board is chosen in the example below. Another option would be to
-#  choose the STM32F746 Discovery board instead. Since that board has the same MCU as the Nucleo
-#  board but a couple of wirings and configs differ, it's necessary to select the "stm32f746g_disco"
-#  board to generated the right firmware image.
-#
+# When running on physical hardware, choose a TARGET and a BOARD that describe the hardware. The
+# STM32L4R5ZI Nucleo target and board is chosen in the example below. You could change the testing
+# board by simply exporting `TVM_MICRO_BOARD` variable with a different Zephyr supported board.
 
 if use_physical_hw:
-    boards_file = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) / "boards.json"
-    with open(boards_file) as f:
-        boards = json.load(f)
     BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_l4r5zi")
     SERIAL = os.getenv("TVM_MICRO_SERIAL", default=None)
-    TARGET = tvm.target.target.micro(boards[BOARD]["model"])
+    TARGET = tvm.micro.testing.get_target("zephyr", BOARD)
 
+# For some boards, Zephyr runs them emulated by default, using QEMU. For example, below is the
+# TARGET and BOARD used to build a microTVM firmware for the mps2-an521 board.
 #
-#  For some boards, Zephyr runs them emulated by default, using QEMU. For example, below is the
-#  TARGET and BOARD used to build a microTVM firmware for the mps2-an521 board. Since that board
-#  runs emulated by default on Zephyr the suffix "-qemu" is added to the board name to inform
-#  microTVM that the QEMU transporter must be used to communicate with the board. If the board name
-#  already has the prefix "qemu_", like "qemu_x86", then it's not necessary to add that suffix.
-#
-#  TARGET = tvm.target.target.micro("mps2_an521")
-#  BOARD = "mps2_an521-qemu"
+# `mps2_an521 = "mps2_an521"`
+# `TARGET = tvm.micro.testing.get_target("zephyr", BOARD)`
 
 ######################################################################
-# Now, compile the model for the target:
+# Now, compile the model for the target. If you do not specify Executor,
+# by default it uses GraphExecutor.
 
-with tvm.transform.PassContext(
-    opt_level=3, config={"tir.disable_vectorize": True}, disabled_pass=["AlterOpLayout"]
-):
+with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
     module = relay.build(mod, target=TARGET, runtime=RUNTIME, params=params)
 
 
+######################################################################
 # Inspecting the compilation output
 # ---------------------------------
 #
 # The compilation process has produced some C code implementing the operators in this graph. We
 # can inspect it by printing the CSourceModule contents (for the purposes of this tutorial, let's
 # just print the first 10 lines):
+#
 
 c_source_module = module.get_lib().imported_modules[0]
 assert c_source_module.type_key == "c", "tutorial is broken"
@@ -166,27 +160,23 @@ assert any(
 print("\n".join(first_few_lines))
 
 
+######################################################################
 # Compiling the generated code
 # ----------------------------
 #
 # Now we need to incorporate the generated C code into a project that allows us to run inference on the
 # device. The simplest way to do this is to integrate it yourself, using microTVM's standard output format
-# (:doc:`Model Library Format` </dev/model_library_format>`). This is a tarball with a standard layout:
+# model library format. This is a tarball with a standard layout.
 
 # Get a temporary path where we can store the tarball (since this is running as a tutorial).
 
-fd, model_library_format_tar_path = tempfile.mkstemp()
-os.close(fd)
-os.unlink(model_library_format_tar_path)
-tvm.micro.export_model_library_format(module, model_library_format_tar_path)
+temp_dir = tvm.contrib.utils.tempdir()
+model_tar_path = temp_dir / "model.tar"
+export_model_library_format(module, model_tar_path)
 
-with tarfile.open(model_library_format_tar_path, "r:*") as tar_f:
+with tarfile.open(model_tar_path, "r:*") as tar_f:
     print("\n".join(f" - {m.name}" for m in tar_f.getmembers()))
 
-# Cleanup for tutorial:
-os.unlink(model_library_format_tar_path)
-
-
 # TVM also provides a standard way for embedded platforms to automatically generate a standalone
 # project, compile and flash it to a target, and communicate with it using the standard TVM RPC
 # protocol. The Model Library Format serves as the model input to this process. When embedded
@@ -201,11 +191,8 @@ os.unlink(model_library_format_tar_path)
 template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("crt"))
 project_options = {}  # You can use options to provide platform-specific options through TVM.
 
-# Compiling for physical hardware (or an emulated board, like the mps_an521)
-# --------------------------------------------------------------------------
 #  For physical hardware, you can try out the Zephyr platform by using a different template project
 #  and options:
-#
 
 if use_physical_hw:
     template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr"))
@@ -218,7 +205,6 @@ if use_physical_hw:
     }
 
 # Create a temporary directory
-
 temp_dir = tvm.contrib.utils.tempdir()
 generated_project_dir = temp_dir / "generated-project"
 generated_project = tvm.micro.generate_project(
diff --git a/docs/_downloads/4577847d31fa9ec38d0a6dda3e1d178d/micro_mlperftiny.py b/docs/_downloads/4577847d31fa9ec38d0a6dda3e1d178d/micro_mlperftiny.py
index 79308e0723..e8c6a253ad 100644
--- a/docs/_downloads/4577847d31fa9ec38d0a6dda3e1d178d/micro_mlperftiny.py
+++ b/docs/_downloads/4577847d31fa9ec38d0a6dda3e1d178d/micro_mlperftiny.py
@@ -15,10 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 """
-.. _tutorial-micro-MLPerfTiny:
+.. _tutorial-micro-mlperftiny:
 
-Creating Your MLPerfTiny Submission with microTVM
-=================================================
+8. Creating Your MLPerfTiny Submission with microTVM
+====================================================
 **Authors**:
 `Mehrdad Hessar <https://github.com/mehrdadh>`_
 
@@ -69,6 +69,7 @@ from tvm.relay.backend import Executor, Runtime
 from tvm.contrib.download import download_testdata
 from tvm.micro import export_model_library_format
 from tvm.micro.model_library_format import generate_c_interface_header
+import tvm.micro.testing
 from tvm.micro.testing.utils import (
     create_header_file,
     mlf_extract_workspace_size_bytes,
diff --git a/docs/_downloads/55a9eff88b1303e525d53269eeb16897/micro_ethosu.ipynb b/docs/_downloads/55a9eff88b1303e525d53269eeb16897/micro_ethosu.ipynb
index d553c22355..3ae36341ae 100644
--- a/docs/_downloads/55a9eff88b1303e525d53269eeb16897/micro_ethosu.ipynb
+++ b/docs/_downloads/55a9eff88b1303e525d53269eeb16897/micro_ethosu.ipynb
@@ -15,7 +15,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n# Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN\n**Author**:\n[Grant Watson](https://github.com/grant-arm)\n\nThis section contains an example of how to use TVM to run a model\non an Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN, using bare metal.\nThe Cortex(R)-M55 is a small, low-power CPU designed for use in embedded\ndevices. CMSIS-NN is a collection of kernels optimized for Arm(R) Cortex(R)-M CPUs.\nThe Ethos(TM)-U55 [...]
+        "\n\n# 7. Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN\n**Author**:\n[Grant Watson](https://github.com/grant-arm)\n\nThis section contains an example of how to use TVM to run a model\non an Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN, using bare metal.\nThe Cortex(R)-M55 is a small, low-power CPU designed for use in embedded\ndevices. CMSIS-NN is a collection of kernels optimized for Arm(R) Cortex(R)-M CPUs.\nThe Ethos(TM [...]
       ]
     },
     {
diff --git a/docs/_downloads/5b279d8a8718816263fa65b0eef1a5c0/micro_tflite.ipynb b/docs/_downloads/5b279d8a8718816263fa65b0eef1a5c0/micro_tflite.ipynb
index a2233b9b63..7a5b6824c3 100644
--- a/docs/_downloads/5b279d8a8718816263fa65b0eef1a5c0/micro_tflite.ipynb
+++ b/docs/_downloads/5b279d8a8718816263fa65b0eef1a5c0/micro_tflite.ipynb
@@ -15,7 +15,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n\n# microTVM with TFLite Models\n**Author**: [Tom Gall](https://github.com/tom-gall)\n\nThis tutorial is an introduction to working with microTVM and a TFLite\nmodel with Relay.\n"
+        "\n\n# 2. microTVM TFLite Tutorial\n**Author**: [Tom Gall](https://github.com/tom-gall)\n\nThis tutorial is an introduction to working with microTVM and a TFLite\nmodel with Relay.\n"
       ]
     },
     {
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "import json\nimport tarfile\nimport pathlib\nimport tempfile\nimport numpy as np\n\nimport tvm\nfrom tvm import relay\nimport tvm.contrib.utils\nfrom tvm.contrib.download import download_testdata\n\nmodel_url = \"https://people.linaro.org/~tom.gall/sine_model.tflite\"\nmodel_file = \"sine_model.tflite\"\nmodel_path = download_testdata(model_url, model_file, module=\"data\")\n\ntflite_model_buf = open(model_path, \"rb\").read()"
+        "import json\nimport tarfile\nimport pathlib\nimport tempfile\nimport numpy as np\n\nimport tvm\nimport tvm.micro\nimport tvm.micro.testing\nfrom tvm import relay\nimport tvm.contrib.utils\nfrom tvm.micro import export_model_library_format\nfrom tvm.contrib.download import download_testdata\n\nmodel_url = (\n    \"https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/model/sine_model.tflite\"\n)\nmodel_file = \"sine_model.tflite\"\nmodel_path = download_testdata(model_u [...]
       ]
     },
     {
@@ -141,7 +141,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Defining the target\n\nNow we create a build config for relay, turning off two options and then calling relay.build which\nwill result in a C source file for the selected TARGET. When running on a simulated target of the\nsame architecture as the host (where this Python script is executed) choose \"host\" below for the\nTARGET, the C Runtime as the RUNTIME and a proper board/VM to run it (Zephyr will create the right\nQEMU VM based on BOARD. In the example below the x86 arch  [...]
+        "## Defining the target\n\nNow we create a build config for relay, turning off two options and then calling relay.build which\nwill result in a C source file for the selected TARGET. When running on a simulated target of the\nsame architecture as the host (where this Python script is executed) choose \"crt\" below for the\nTARGET, the C Runtime as the RUNTIME and a proper board/VM to run it (Zephyr will create the right\nQEMU VM based on BOARD. In the example below the x86 arch i [...]
       ]
     },
     {
@@ -152,14 +152,14 @@
       },
       "outputs": [],
       "source": [
-        "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": True})\nTARGET = tvm.target.target.micro(\"host\")\n\n#\n# Compiling for physical hardware\n#  When running on physical hardware, choose a TARGET and a BOARD that describe the hardware. The\n#  STM32F746 Nucleo target and board is chosen in the example below. Another option would be to\n#  choose the STM32F746 Discovery board instead. Since that board has the same MCU as the Nucleo\n#  board but a couple of wirings an [...]
+        "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": True})\nTARGET = tvm.micro.testing.get_target(\"crt\")\n\n# When running on physical hardware, choose a TARGET and a BOARD that describe the hardware. The\n# STM32L4R5ZI Nucleo target and board is chosen in the example below. You could change the testing\n# board by simply exporting `TVM_MICRO_BOARD` variable with a different Zephyr supported board.\n\nif use_physical_hw:\n    BOARD = os.getenv(\"TVM_MICRO_BOARD\", de [...]
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Now, compile the model for the target:\n\n"
+        "Now, compile the model for the target. If you do not specify Executor,\nby default it uses GraphExecutor.\n\n"
       ]
     },
     {
@@ -170,7 +170,43 @@
       },
       "outputs": [],
       "source": [
-        "with tvm.transform.PassContext(\n    opt_level=3, config={\"tir.disable_vectorize\": True}, disabled_pass=[\"AlterOpLayout\"]\n):\n    module = relay.build(mod, target=TARGET, runtime=RUNTIME, params=params)\n\n\n# Inspecting the compilation output\n# ---------------------------------\n#\n# The compilation process has produced some C code implementing the operators in this graph. We\n# can inspect it by printing the CSourceModule contents (for the purposes of this tutorial, let' [...]
+        "with tvm.transform.PassContext(opt_level=3, config={\"tir.disable_vectorize\": True}):\n    module = relay.build(mod, target=TARGET, runtime=RUNTIME, params=params)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Inspecting the compilation output\n\nThe compilation process has produced some C code implementing the operators in this graph. We\ncan inspect it by printing the CSourceModule contents (for the purposes of this tutorial, let's\njust print the first 10 lines):\n\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "c_source_module = module.get_lib().imported_modules[0]\nassert c_source_module.type_key == \"c\", \"tutorial is broken\"\n\nc_source_code = c_source_module.get_source()\nfirst_few_lines = c_source_code.split(\"\\n\")[:10]\nassert any(\n    l.startswith(\"TVM_DLL int32_t tvmgen_default_\") for l in first_few_lines\n), f\"tutorial is broken: {first_few_lines!r}\"\nprint(\"\\n\".join(first_few_lines))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Compiling the generated code\n\nNow we need to incorporate the generated C code into a project that allows us to run inference on the\ndevice. The simplest way to do this is to integrate it yourself, using microTVM's standard output format\nmodel library format. This is a tarball with a standard layout.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Get a temporary path where we can store the tarball (since this is running as a tutorial).\n\ntemp_dir = tvm.contrib.utils.tempdir()\nmodel_tar_path = temp_dir / \"model.tar\"\nexport_model_library_format(module, model_tar_path)\n\nwith tarfile.open(model_tar_path, \"r:*\") as tar_f:\n    print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))\n\n# TVM also provides a standard way for embedded platforms to automatically generate a standalone\n# project, compile and f [...]
       ]
     },
     {
diff --git a/docs/_downloads/6e511f5a8ddbf12f2fca2dfadc0cc4a9/micro_tvmc.ipynb b/docs/_downloads/6e511f5a8ddbf12f2fca2dfadc0cc4a9/micro_tvmc.ipynb
index f1519a2f1a..4f944bfe6a 100644
--- a/docs/_downloads/6e511f5a8ddbf12f2fca2dfadc0cc4a9/micro_tvmc.ipynb
+++ b/docs/_downloads/6e511f5a8ddbf12f2fca2dfadc0cc4a9/micro_tvmc.ipynb
@@ -15,56 +15,92 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n\n# Executing a Tiny Model with TVMC Micro\n**Author**: [Mehrdad Hessar](https://github.com/mehrdadh)\n\nThis tutorial explains how to compile a tiny model for a micro device,\nbuild a program on Zephyr platform to execute this model, flash the program\nand run the model all using `tvmc micro` command.\n"
+        "\n\n# 1. microTVM CLI Tool\n**Author**: [Mehrdad Hessar](https://github.com/mehrdadh)\n\nThis tutorial explains how to compile a tiny model for a micro device,\nbuild a program on Zephyr platform to execute this model, flash the program\nand run the model all using `tvmc micro` command.\nYou need to install python and Zephyr dependencies before processing with this tutorial.\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "<div class=\"alert alert-info\"><h4>Note</h4><p>This tutorial is explaining using TVMC Mirco on Zephyr platform. You need\n    to install Zephyr dependencies before processing with this tutorial. Alternatively,\n    you can run this tutorial in one of the following ways which has Zephyr depencencies already installed.\n\n    * Use [microTVM Reference Virtual Machines](https://tvm.apache.org/docs/how_to/work_with_microtvm/micro_reference_vm.html#sphx-glr-how-to-work-with-microtvm [...]
+        "## Install microTVM Python dependencies\n\nTVM does not include a package for Python serial communication, so\nwe must install one before using microTVM. We will also need TFLite\nto load models.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%%shell\npip install pyserial==3.5 tflite==2.1"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Install Zephyr\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%%shell\n# Install west and ninja\npython3 -m pip install west\napt-get install -y ninja-build\n\n# Install ZephyrProject\nZEPHYR_PROJECT_PATH=\"/content/zephyrproject\"\nexport ZEPHYR_BASE=${ZEPHYR_PROJECT_PATH}/zephyr\nwest init ${ZEPHYR_PROJECT_PATH}\ncd ${ZEPHYR_BASE}\ngit checkout v2.7-branch\ncd ..\nwest update\nwest zephyr-export\nchmod -R o+w ${ZEPHYR_PROJECT_PATH}\n\n# Install Zephyr SDK\nZEPHYR_SDK_VERSION=0.13.2\nZEPHYR_SDK_FILE=\"/content/zephyr-sdk-linux-setup.run\" [...]
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### Using TVMC Micro\n\n TVMC is a command-line tool which is installed as a part of TVM Python packages. Accessing this\n package varies based on your machine setup. In many cases, you can use the ``tvmc`` command directly.\n Alternatively, if you have TVM as a Python module on your ``$PYTHONPATH``, you can access this\n driver with ``python -m tvm.driver.tvmc`` command. This tutorial will use TVMC command as\n ``tvmc`` for simplicity.\n\n To check if you have TVMC command inst [...]
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Using TVMC Micro\n\n TVMC is a command-line tool which is installed as a part of TVM Python packages. Accessing this\n package varies based on your machine setup. In many cases, you can use the ``tvmc`` command directly.\n Alternatively, if you have TVM as a Python module on your ``$PYTHONPATH``, you can access this\n driver with ``python -m tvm.driver.tvmc`` command. This tutorial will use TVMC command as\n ``tvmc`` for simplicity.\n\n To check if you have TVMC command insta [...]
+        "### Obtain a Tiny Model\n\n For this tutorial, we will use Magic Wand model from tflite micro. Magic Wand is a\n Depthwise Convolution Layer model which recognizes gestures with an accelerometer.\n\n For this tutorial we will be using the model in tflite format.\n\n```bash\nwget https://github.com/tensorflow/tflite-micro/raw/main/tensorflow/lite/micro/examples/magic_wand/magic_wand.tflite\n```\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Obtain a Tiny Model\n\n For this tutorial, we will use Magic Wand model from tflite micro. Magic Wand is a\n Depthwise Convolution Layer model which recognizes gestures with an accelerometer.\n\n For this tutorial we will be using the model in tflite format.\n\n```bash\nwget https://github.com/tensorflow/tflite-micro/raw/main/tensorflow/lite/micro/examples/magic_wand/magic_wand.tflite\n```\n"
+        "### Compiling a TFLite model to a Model Library Format\n\n Model Library Format (MLF) is an output format that TVM provides for micro targets. MLF is a tarball\n containing a file for each piece of the TVM compiler output which can be used on micro targets outside\n TVM environment. Read more about `Model Library Format <model_library_format>`.\n\n Here, we generate a MLF file for ``qemu_x86`` Zephyr board. To generate MLF output for the ``magic_wand`` tflite model:\n\n```bash\n [...]
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Compiling a TFLite model to a Model Library Format\n\n Model Library Format (MLF) is an output format that TVM provides for micro targets. MLF is a tarball\n containing a file for each piece of the TVM compiler output which can be used on micro targets outside\n TVM environment. Read more about [Model Library Format](https://tvm.apache.org/docs//arch/model_library_format.html).\n\n Here, we generate a MLF file for ``qemu_x86`` Zephyr board. To generate MLF output for the ``ma [...]
+        "### Create a Zephyr Project Using Model Library Format\n\n To generate a Zephyr project we use TVM Micro subcommand ``create``. We pass the MLF format and the path\n for the project to ``create`` subcommand along with project options. Project options for each\n platform (Zephyr/Arduino) are defined in their Project API server file. To build\n Zephyr project for a different Zephyr board, change ``zephyr_board`` project option.\n To generate Zephyr project, run:\n\n```bash\ntvmc m [...]
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Create a Zephyr Project Using Model Library Format\n\n To generate a Zephyr project we use TVM Micro subcommand ``create``. We pass the MLF format and the path\n for the project to ``create`` subcommand along with project options. Project options for each\n platform (Zephyr/Arduino) are defined in their Project API server file. To build\n Zephyr project for a different Zephyr board, change ``zephyr_board`` project option.\n To generate Zephyr project, run:\n\n```bash\ntvmc mi [...]
+        "### Build and Flash Zephyr Project Using TVMC Micro\n\n Next step is to build the Zephyr project which includes TVM generated code for running the tiny model, Zephyr\n template code to run a model in Host-Driven mode and TVM runtime source/header files. To build the project:\n\n```bash\ntvmc micro build \\\n    project \\\n    zephyr\n```\n This will build the project in ``project`` directory and generates binary files under ``project/build``.\n\n Next, we flash the Zephyr binar [...]
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Build and Flash Zephyr Project Using TVMC Micro\n\n Next step is to build the Zephyr project which includes TVM generated code for running the tiny model, Zephyr\n template code to run a model in Host-Driven mode and TVM runtime source/header files. To build the project:\n\n```bash\ntvmc micro build \\\n    project \\\n    zephyr\n```\n This will build the project in ``project`` directory and generates binary files under ``project/build``.\n\n Next, we flash the Zephyr binary [...]
+        "### Run Tiny Model on Micro Target\n\n After flashing the device, the compiled model and TVM RPC server are programmed on the device.\n The Zephyr board is waiting for host to open a communication channel. MicroTVM devices typicall communicate\n using a serial communication (UART). To run the flashed model on the device using TVMC, we use ``tvmc run`` subcommand\n and pass ``--device micro`` to specify the device type. This command will open a communication channel, set input\n  [...]
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Run Tiny Model on Micro Target\n\n After flashing the device, the compiled model and TVM RPC server are programmed on the device.\n The Zephyr board is waiting for host to open a communication channel. MicroTVM devices typicall communicate\n using a serial communication (UART). To run the flashed model on the device using TVMC, we use ``tvmc run`` subcommand\n and pass ``--device micro`` to specify the device type. This command will open a communication channel, set input\n v [...]
+        "Specifically, this command sets the input of the model\nto all ones and shows the four values of the output with their indices.\n\n```bash\n# Output:\n# INFO:__main__:b'[100%] [QEMU] CPU: qemu32,+nx,+pae\\n'\n# remote: microTVM Zephyr runtime - running\n# INFO:__main__:b'[100%] Built target run\\n'\n# [[3.         1.         2.         0.        ]\n# [0.47213247 0.41364592 0.07525456 0.03896701]]\n```\n"
       ]
     }
   ],
diff --git a/docs/_downloads/79027b28c061178b7ea56e3f047eeef1/micro_reference_vm.py b/docs/_downloads/79027b28c061178b7ea56e3f047eeef1/micro_reference_vm.py
deleted file mode 100644
index 3121bca353..0000000000
--- a/docs/_downloads/79027b28c061178b7ea56e3f047eeef1/micro_reference_vm.py
+++ /dev/null
@@ -1,159 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-.. _tutorial-micro-reference-vm:
-
-===================================
-microTVM Reference Virtual Machines
-===================================
-**Author**: `Andrew Reusch <ar...@octoml.ai>`_
-
-This tutorial explains how to launch microTVM Reference Virtual Machines. You can use these to
-develop on real physical hardware without needing to individually install the microTVM
-dependencies. These are also particularly useful when trying to reproduce behavior with
-microTVM, such as when filing bug reports.
-
-microTVM is the effort to allow TVM to build and execute models on bare-metal microcontrollers.
-microTVM aims to be compatible with a wide variety of SoCs and runtime environments (i.e. bare metal,
-RTOS, etc). However, some stable software environment is needed to allow developers to share and
-reproduce bugs and results. The microTVM Reference Virtual Machines are intended to provide that
-environment.
-
-How it works
-============
-
-No Virtual Machines are stored in the TVM repository--instead, the files stored in
-``apps/microtvm/reference-vm`` describe how to build VMs to the Vagrant_ VM builder tool.
-
-The Reference VMs are split into two parts:
-
-1. A Vagrant Base Box, which contains all of the stable dependencies for that platform. Build
-   scripts are stored in ``apps/microtvm/reference-vm/<platform>/base-box``. TVM committers run
-   these when a platform's "stable" dependencies change, and the generated base boxes are stored in
-   `Vagrant Cloud`_.
-2. A per-workspace VM, which users normally build using the Base Box as a starting point. Build
-   scripts are stored in ``apps/microtvm/reference-vm/<platform>`` (everything except ``base-box``).
-
-.. _Vagrant: https://vagrantup.com
-.. _Vagrant Cloud: https://app.vagrantup.com/tlcpack
-
-Setting up the VM
-=================
-
-Installing prerequisites
-------------------------
-
-A minimal set of prerequisites are needed:
-
-1. `Vagrant <https://vagrantup.com>`__
-2. A supported Virtual Machine hypervisor (**VirtualBox**, **Parallels**, or **VMWare Fusion/Workstation**).
-   `VirtualBox <https://www.virtualbox.org>`__ is a suggested free hypervisor, but please note
-   that the `VirtualBox Extension Pack`_ is required for proper USB forwarding. If using VirtualBox,
-   also consider installing the `vbguest <https://github.com/dotless-de/vagrant-vbguest>`_ plugin.
-
-.. _VirtualBox Extension Pack: https://www.virtualbox.org/wiki/Downloads#VirtualBox6.1.16OracleVMVirtualBoxExtensionPack
-
-3. If required for your hypervisor, the
-   `Vagrant provider plugin <https://github.com/hashicorp/vagrant/wiki/Available-Vagrant-Plugins#providers>`__ (or see `here <https://www.vagrantup.com/vmware>`__ for VMWare).
-
-First boot
-----------
-
-The first time you use a reference VM, you need to create the box locally and then provision it.
-
-.. code-block:: bash
-
-    # Replace zephyr with the name of a different platform, if you are not using Zephyr.
-    ~/.../tvm $ cd apps/microtvm/reference-vm/zephyr
-    # Replace <provider_name> with the name of the hypervisor you wish to use (i.e. virtualbox, parallels, vmware_desktop).
-    ~/.../tvm/apps/microtvm/reference-vm/zephyr $ vagrant up --provider=<provider_name>
-
-
-This command will take a couple of minutes to run and will require 4 to 5GB of storage on your
-machine. It does the following:
-
-1. Downloads the `microTVM base box`_ and clones it to form a new VM specific to this TVM directory.
-2. Mounts your TVM directory (and, if using ``git-subtree``, the original ``.git`` repo) into the
-   VM.
-3. Builds TVM and installs a Python virtualenv with the dependencies corresponding with your TVM
-   build.
-
-.. _microTVM base box: https://app.vagrantup.com/tlcpack/boxes/microtvm
-
-Connect Hardware to the VM
---------------------------
-
-Next, you need to configure USB passthrough to attach your physical development board to the virtual
-machine (rather than directly to your laptop's host OS).
-
-It's suggested you setup a device filter, rather than doing a one-time forward, because often the
-device may reboot during the programming process and you may, at that time, need to enable
-forwarding again. It may not be obvious to the end user when this occurs. Instructions to do that:
-
- * `VirtualBox <https://www.virtualbox.org/manual/ch03.html#usb-support>`__
- * `Parallels <https://kb.parallels.com/122993>`__
- * `VMWare Workstation <https://docs.vmware.com/en/VMware-Workstation-Pro/15.0/com.vmware.ws.using.doc/GUID-E003456F-EB94-4B53-9082-293D9617CB5A.html>`__
-
-Rebuilding TVM inside the Reference VM
---------------------------------------
-
-After the first boot, you'll need to ensure you keep the build, in ``$TVM_HOME/build-microtvm-zephyr``,
-up-to-date when you modify the C++ runtime or checkout a different revision. You can either
-re-provision the machine (``vagrant provision`` in the same directory you ran ``vagrant up`` before)
-or manually rebuild TVM yourself.
-
-Remember: the TVM ``.so`` built inside the VM is different from the one you may use on your host
-machine. This is why it's built inside the special directory ``build-microtvm-zephyr``.
-
-Logging in to the VM
---------------------
-
-The VM should be available to your host only with the hostname ``microtvm``. You can SSH to the VM
-as follows:
-
-.. code-block:: bash
-
-    $ vagrant ssh
-
-Then ``cd`` to the same path used on your host machine for TVM. For example, on Mac:
-
-.. code-block:: bash
-
-    $ cd /Users/yourusername/path/to/tvm
-
-Running tests
-=============
-
-Once the VM has been provisioned, tests can be executed using ``poetry``:
-
-.. code-block:: bash
-
-    $ cd apps/microtvm/reference-vm/zephyr
-    $ poetry run python3 ../../../../tests/micro/zephyr/test_zephyr.py --board=stm32f746g_disco
-
-If you do not have physical hardware attached, but wish to run the tests using the
-local QEMU emulator running within the VM, run the following commands instead:
-
-.. code-block:: bash
-
-    $ cd /Users/yourusername/path/to/tvm
-    $ cd apps/microtvm/reference-vm/zephyr/
-    $ poetry run pytest ../../../../tests/micro/zephyr/test_zephyr.py --board=qemu_x86
-
-
-
-"""
diff --git a/docs/_downloads/7ef06253b3d2676eb50e20a5f81ef8f9/micro_reference_vm.ipynb b/docs/_downloads/7ef06253b3d2676eb50e20a5f81ef8f9/micro_reference_vm.ipynb
deleted file mode 100644
index 188456cb0c..0000000000
--- a/docs/_downloads/7ef06253b3d2676eb50e20a5f81ef8f9/micro_reference_vm.ipynb
+++ /dev/null
@@ -1,43 +0,0 @@
-{
-  "cells": [
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "%%shell\n# Installs the latest dev build of TVM from PyPI. If you wish to build\n# from source, see https://tvm.apache.org/docs/install/from_source.html\npip install apache-tvm --pre"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "\n\n# microTVM Reference Virtual Machines\n**Author**: [Andrew Reusch](areusch@octoml.ai)\n\nThis tutorial explains how to launch microTVM Reference Virtual Machines. You can use these to\ndevelop on real physical hardware without needing to individually install the microTVM\ndependencies. These are also particularly useful when trying to reproduce behavior with\nmicroTVM, such as when filing bug reports.\n\nmicroTVM is the effort to allow TVM to build and execute models on bare [...]
-      ]
-    }
-  ],
-  "metadata": {
-    "kernelspec": {
-      "display_name": "Python 3",
-      "language": "python",
-      "name": "python3"
-    },
-    "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.7.5"
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
-}
\ No newline at end of file
diff --git a/docs/_downloads/9ccca8fd489a1486ac71b55a55c320c5/micro_autotune.py b/docs/_downloads/9ccca8fd489a1486ac71b55a55c320c5/micro_autotune.py
index 9be257a57a..e8c032b70e 100644
--- a/docs/_downloads/9ccca8fd489a1486ac71b55a55c320c5/micro_autotune.py
+++ b/docs/_downloads/9ccca8fd489a1486ac71b55a55c320c5/micro_autotune.py
@@ -18,8 +18,8 @@
 """
 .. _tutorial-micro-autotune:
 
-Autotuning with microTVM
-=========================
+6. Model Tuning with microTVM
+=============================
 **Authors**:
 `Andrew Reusch <https://github.com/areusch>`_,
 `Mehrdad Hessar <https://github.com/mehrdadh>`_
@@ -55,6 +55,7 @@ import pathlib
 
 import tvm
 from tvm.relay.backend import Runtime
+import tvm.micro.testing
 
 ####################
 # Defining the model
@@ -102,20 +103,16 @@ params = {"weight": weight_sample}
 #
 
 RUNTIME = Runtime("crt", {"system-lib": True})
-TARGET = tvm.target.target.micro("host")
+TARGET = tvm.micro.testing.get_target("crt")
 
 # Compiling for physical hardware
 # --------------------------------------------------------------------------
 #  When running on physical hardware, choose a TARGET and a BOARD that describe the hardware. The
 #  STM32L4R5ZI Nucleo target and board is chosen in the example below.
 if use_physical_hw:
-    boards_file = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) / "boards.json"
-    with open(boards_file) as f:
-        boards = json.load(f)
-
     BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_l4r5zi")
     SERIAL = os.getenv("TVM_MICRO_SERIAL", default=None)
-    TARGET = tvm.target.target.micro(boards[BOARD]["model"])
+    TARGET = tvm.micro.testing.get_target("zephyr", BOARD)
 
 
 #########################
diff --git a/docs/_downloads/a7c7ea4b5017ae70db1f51dd8e6dcd82/micro_train.ipynb b/docs/_downloads/a7c7ea4b5017ae70db1f51dd8e6dcd82/micro_train.ipynb
index af36cf8c0b..6dab841627 100644
--- a/docs/_downloads/a7c7ea4b5017ae70db1f51dd8e6dcd82/micro_train.ipynb
+++ b/docs/_downloads/a7c7ea4b5017ae70db1f51dd8e6dcd82/micro_train.ipynb
@@ -15,7 +15,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n\n# Training Vision Models for microTVM on Arduino\n**Author**: [Gavin Uberti](https://github.com/guberti)\n\nThis tutorial shows how MobileNetV1 models can be trained\nto fit on embedded devices, and how those models can be\ndeployed to Arduino using TVM.\n"
+        "\n\n# 5. Training Vision Models for microTVM on Arduino\n**Author**: [Gavin Uberti](https://github.com/guberti)\n\nThis tutorial shows how MobileNetV1 models can be trained\nto fit on embedded devices, and how those models can be\ndeployed to Arduino using TVM.\n"
       ]
     },
     {
@@ -249,7 +249,7 @@
       },
       "outputs": [],
       "source": [
-        "import shutil\nimport tflite\nimport tvm\n\n# Method to load model is different in TFLite 1 vs 2\ntry:  # TFLite 2.1 and above\n    tflite_model = tflite.Model.GetRootAsModel(quantized_model, 0)\nexcept AttributeError:  # Fall back to TFLite 1.14 method\n    tflite_model = tflite.Model.Model.GetRootAsModel(quantized_model, 0)\n\n# Convert to the Relay intermediate representation\nmod, params = tvm.relay.frontend.from_tflite(tflite_model)\n\n# Set configuration flags to improve p [...]
+        "import shutil\nimport tflite\nimport tvm\nimport tvm.micro.testing\n\n# Method to load model is different in TFLite 1 vs 2\ntry:  # TFLite 2.1 and above\n    tflite_model = tflite.Model.GetRootAsModel(quantized_model, 0)\nexcept AttributeError:  # Fall back to TFLite 1.14 method\n    tflite_model = tflite.Model.Model.GetRootAsModel(quantized_model, 0)\n\n# Convert to the Relay intermediate representation\nmod, params = tvm.relay.frontend.from_tflite(tflite_model)\n\n# Set config [...]
       ]
     },
     {
diff --git a/docs/_downloads/ab2eef18d10188532645b1d60fc7dd68/micro_ethosu.py b/docs/_downloads/ab2eef18d10188532645b1d60fc7dd68/micro_ethosu.py
index 74a9d59d77..e6f47321c8 100644
--- a/docs/_downloads/ab2eef18d10188532645b1d60fc7dd68/micro_ethosu.py
+++ b/docs/_downloads/ab2eef18d10188532645b1d60fc7dd68/micro_ethosu.py
@@ -15,8 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 """
-Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN
-======================================================================================
+.. _tutorial-micro-ethosu:
+
+7. Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN
+=========================================================================================
 **Author**:
 `Grant Watson <https://github.com/grant-arm>`_
 
diff --git a/docs/_downloads/b52cec46baf4f78d6bcd94cbe269c8a6/micro_train.py b/docs/_downloads/b52cec46baf4f78d6bcd94cbe269c8a6/micro_train.py
index 9b8a9a68dd..56ff54616f 100644
--- a/docs/_downloads/b52cec46baf4f78d6bcd94cbe269c8a6/micro_train.py
+++ b/docs/_downloads/b52cec46baf4f78d6bcd94cbe269c8a6/micro_train.py
@@ -15,10 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 """
-.. _microtvm-train-arduino:
+.. _tutorial-micro-train-arduino:
 
-Training Vision Models for microTVM on Arduino
-==============================================
+5. Training Vision Models for microTVM on Arduino
+=================================================
 **Author**: `Gavin Uberti <https://github.com/guberti>`_
 
 This tutorial shows how MobileNetV1 models can be trained
@@ -441,6 +441,7 @@ with open(QUANTIZED_MODEL_PATH, "wb") as f:
 import shutil
 import tflite
 import tvm
+import tvm.micro.testing
 
 # Method to load model is different in TFLite 1 vs 2
 try:  # TFLite 2.1 and above
@@ -452,7 +453,7 @@ except AttributeError:  # Fall back to TFLite 1.14 method
 mod, params = tvm.relay.frontend.from_tflite(tflite_model)
 
 # Set configuration flags to improve performance
-target = tvm.target.target.micro("nrf52840")
+target = tvm.micro.testing.get_target("zephyr", "nrf5340dk_nrf5340_cpuapp")
 runtime = tvm.relay.backend.Runtime("crt")
 executor = tvm.relay.backend.Executor("aot", {"unpacked-api": True})
 
diff --git a/docs/_downloads/c00933f3fbcf90c4f584d54607b33805/micro_aot.ipynb b/docs/_downloads/c00933f3fbcf90c4f584d54607b33805/micro_aot.ipynb
index 1a0e3f2787..7686c6ec1a 100644
--- a/docs/_downloads/c00933f3fbcf90c4f584d54607b33805/micro_aot.ipynb
+++ b/docs/_downloads/c00933f3fbcf90c4f584d54607b33805/micro_aot.ipynb
@@ -15,7 +15,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n\n# microTVM Host-Driven AoT\n**Authors**:\n[Mehrdad Hessar](https://github.com/mehrdadh),\n[Alan MacDonald](https://github.com/alanmacd)\n\nThis tutorial is showcasing microTVM host-driven AoT compilation with\na TFLite model. AoTExecutor reduces the overhead of parsing graph at runtime\ncompared to GraphExecutor. Also, we can have better memory management using ahead\nof time compilation. This tutorial can be executed on a x86 CPU using C runtime (CRT)\nor on Zephyr platform [...]
+        "\n\n# 3. microTVM Ahead-of-Time (AOT) Compilation\n**Authors**:\n[Mehrdad Hessar](https://github.com/mehrdadh),\n[Alan MacDonald](https://github.com/alanmacd)\n\nThis tutorial is showcasing microTVM host-driven AoT compilation with\na TFLite model. AoTExecutor reduces the overhead of parsing graph at runtime\ncompared to GraphExecutor. Also, we can have better memory management using ahead\nof time compilation. This tutorial can be executed on a x86 CPU using C runtime (CRT)\nor [...]
       ]
     },
     {
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "import numpy as np\nimport pathlib\nimport json\n\nimport tvm\nfrom tvm import relay\nfrom tvm.relay.backend import Executor, Runtime\nfrom tvm.contrib.download import download_testdata"
+        "import numpy as np\nimport pathlib\nimport json\n\nimport tvm\nfrom tvm import relay\nimport tvm.micro.testing\nfrom tvm.relay.backend import Executor, Runtime\nfrom tvm.contrib.download import download_testdata"
       ]
     },
     {
@@ -105,7 +105,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Defining the target\n\nNow we need to define the target, runtime and executor. In this tutorial, we focused on\nusing AOT host driven executor. We use the host micro target which is for running a model\non x86 CPU using CRT runtime or running a model with Zephyr platform on qemu_x86 simulator\nboard. In the case of a physical microcontroller, we get the target model for the physical\nboard (E.g. nucleo_l4r5zi) and pass it to `tvm.target.target.micro` to create a full\nmicro t [...]
+        "## Defining the target\n\nNow we need to define the target, runtime and executor. In this tutorial, we focused on\nusing AOT host driven executor. We use the host micro target which is for running a model\non x86 CPU using CRT runtime or running a model with Zephyr platform on qemu_x86 simulator\nboard. In the case of a physical microcontroller, we get the target model for the physical\nboard (E.g. nucleo_l4r5zi) and change `BOARD` to supported Zephyr board.\n\n\n"
       ]
     },
     {
@@ -116,7 +116,7 @@
       },
       "outputs": [],
       "source": [
-        "# Use the C runtime (crt) and enable static linking by setting system-lib to True\nRUNTIME = Runtime(\"crt\", {\"system-lib\": True})\n\n# Simulate a microcontroller on the host machine. Uses the main() from `src/runtime/crt/host/main.cc`.\n# To use physical hardware, replace \"host\" with something matching your hardware.\nTARGET = tvm.target.target.micro(\"host\")\n\n# Use the AOT executor rather than graph or vm executors. Don't use unpacked API or C calling style.\nEXECUTOR  [...]
+        "# Use the C runtime (crt) and enable static linking by setting system-lib to True\nRUNTIME = Runtime(\"crt\", {\"system-lib\": True})\n\n# Simulate a microcontroller on the host machine. Uses the main() from `src/runtime/crt/host/main.cc`.\n# To use physical hardware, replace \"host\" with something matching your hardware.\nTARGET = tvm.micro.testing.get_target(\"crt\")\n\n# Use the AOT executor rather than graph or vm executors. Don't use unpacked API or C calling style.\nEXECU [...]
       ]
     },
     {
diff --git a/docs/_downloads/eb483c672b88006c331115968e0ffd9b/micro_tvmc.py b/docs/_downloads/eb483c672b88006c331115968e0ffd9b/micro_tvmc.py
index fa9b1ece2b..da3698d1b2 100644
--- a/docs/_downloads/eb483c672b88006c331115968e0ffd9b/micro_tvmc.py
+++ b/docs/_downloads/eb483c672b88006c331115968e0ffd9b/micro_tvmc.py
@@ -16,30 +16,26 @@
 # under the License.
 
 """
-.. _tutorial-micro-tvmc:
+.. _tutorial-micro-cli-tool:
 
-Executing a Tiny Model with TVMC Micro
-======================================
+1. microTVM CLI Tool
+====================
 **Author**: `Mehrdad Hessar <https://github.com/mehrdadh>`_
 
 This tutorial explains how to compile a tiny model for a micro device,
 build a program on Zephyr platform to execute this model, flash the program
 and run the model all using `tvmc micro` command.
+You need to install python and Zephyr dependencies before processing with this tutorial.
 """
 
 ######################################################################
-# .. note::
-#     This tutorial is explaining using TVMC Mirco on Zephyr platform. You need
-#     to install Zephyr dependencies before processing with this tutorial. Alternatively,
-#     you can run this tutorial in one of the following ways which has Zephyr depencencies already installed.
 #
-#     * Use `microTVM Reference Virtual Machines <https://tvm.apache.org/docs/how_to/work_with_microtvm/micro_reference_vm.html#sphx-glr-how-to-work-with-microtvm-micro-reference-vm-py>`_.
-#     * Use QEMU docker image provided by TVM. Following these you will download and login to the docker image:
+#     .. include:: ../../../../gallery/how_to/work_with_microtvm/install_dependencies.rst
 #
-#     .. code-block:: bash
+
+######################################################################
 #
-#       cd tvm
-#       ./docker/bash.sh tlcpack/ci-qemu
+#     .. include:: ../../../../gallery/how_to/work_with_microtvm/install_zephyr.rst
 #
 
 
@@ -92,7 +88,7 @@ and run the model all using `tvmc micro` command.
 #
 # Model Library Format (MLF) is an output format that TVM provides for micro targets. MLF is a tarball
 # containing a file for each piece of the TVM compiler output which can be used on micro targets outside
-# TVM environment. Read more about `Model Library Format <https://tvm.apache.org/docs//arch/model_library_format.html>`_.
+# TVM environment. Read more about :ref:`Model Library Format <model_library_format>`.
 #
 # Here, we generate a MLF file for ``qemu_x86`` Zephyr board. To generate MLF output for the ``magic_wand`` tflite model:
 #
@@ -187,12 +183,17 @@ and run the model all using `tvmc micro` command.
 #	      --fill-mode ones \
 #	      --print-top 4
 #
-#     # Output:
-#     #
-#     # INFO:__main__:b'[100%] [QEMU] CPU: qemu32,+nx,+pae\n'
-#     # remote: microTVM Zephyr runtime - running
-#     # INFO:__main__:b'[100%] Built target run\n'
-#     # [[3.         1.         2.         0.        ]
-#     # [0.47213247 0.41364592 0.07525456 0.03896701]]
+
+############################################################
+# Specifically, this command sets the input of the model
+# to all ones and shows the four values of the output with their indices.
+#
+# .. code-block:: bash
+#
+#      # Output:
+#      # INFO:__main__:b'[100%] [QEMU] CPU: qemu32,+nx,+pae\n'
+#      # remote: microTVM Zephyr runtime - running
+#      # INFO:__main__:b'[100%] Built target run\n'
+#      # [[3.         1.         2.         0.        ]
+#      # [0.47213247 0.41364592 0.07525456 0.03896701]]
 #
-# Specifically, this command sets the input of the model to all ones and shows the four values of the output with their indices.
diff --git a/docs/_downloads/f83ba3df2d52f9b54cf141114359481a/micro_autotune.ipynb b/docs/_downloads/f83ba3df2d52f9b54cf141114359481a/micro_autotune.ipynb
index d7bea7ffba..a87493cb2a 100644
--- a/docs/_downloads/f83ba3df2d52f9b54cf141114359481a/micro_autotune.ipynb
+++ b/docs/_downloads/f83ba3df2d52f9b54cf141114359481a/micro_autotune.ipynb
@@ -15,7 +15,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n\n# Autotuning with microTVM\n**Authors**:\n[Andrew Reusch](https://github.com/areusch),\n[Mehrdad Hessar](https://github.com/mehrdadh)\n\nThis tutorial explains how to autotune a model using the C runtime.\n"
+        "\n\n# 6. Model Tuning with microTVM\n**Authors**:\n[Andrew Reusch](https://github.com/areusch),\n[Mehrdad Hessar](https://github.com/mehrdadh)\n\nThis tutorial explains how to autotune a model using the C runtime.\n"
       ]
     },
     {
@@ -80,7 +80,7 @@
       },
       "outputs": [],
       "source": [
-        "import json\nimport numpy as np\nimport pathlib\n\nimport tvm\nfrom tvm.relay.backend import Runtime"
+        "import json\nimport numpy as np\nimport pathlib\n\nimport tvm\nfrom tvm.relay.backend import Runtime\nimport tvm.micro.testing"
       ]
     },
     {
@@ -116,7 +116,7 @@
       },
       "outputs": [],
       "source": [
-        "RUNTIME = Runtime(\"crt\", {\"system-lib\": True})\nTARGET = tvm.target.target.micro(\"host\")\n\n# Compiling for physical hardware\n# --------------------------------------------------------------------------\n#  When running on physical hardware, choose a TARGET and a BOARD that describe the hardware. The\n#  STM32L4R5ZI Nucleo target and board is chosen in the example below.\nif use_physical_hw:\n    boards_file = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"zephyr [...]
+        "RUNTIME = Runtime(\"crt\", {\"system-lib\": True})\nTARGET = tvm.micro.testing.get_target(\"crt\")\n\n# Compiling for physical hardware\n# --------------------------------------------------------------------------\n#  When running on physical hardware, choose a TARGET and a BOARD that describe the hardware. The\n#  STM32L4R5ZI Nucleo target and board is chosen in the example below.\nif use_physical_hw:\n    BOARD = os.getenv(\"TVM_MICRO_BOARD\", default=\"nucleo_l4r5zi\")\n    S [...]
       ]
     },
     {
diff --git a/docs/_downloads/f8a7209a0e66b246185bfc41bbc82f54/micro_aot.py b/docs/_downloads/f8a7209a0e66b246185bfc41bbc82f54/micro_aot.py
index c1b29ba5c5..f31ffa1570 100644
--- a/docs/_downloads/f8a7209a0e66b246185bfc41bbc82f54/micro_aot.py
+++ b/docs/_downloads/f8a7209a0e66b246185bfc41bbc82f54/micro_aot.py
@@ -15,10 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 """
-.. _tutorial-micro-AoT:
+.. _tutorial-micro-aot:
 
-microTVM Host-Driven AoT
-===========================
+3. microTVM Ahead-of-Time (AOT) Compilation
+===========================================
 **Authors**:
 `Mehrdad Hessar <https://github.com/mehrdadh>`_,
 `Alan MacDonald <https://github.com/alanmacd>`_
@@ -59,6 +59,7 @@ import json
 
 import tvm
 from tvm import relay
+import tvm.micro.testing
 from tvm.relay.backend import Executor, Runtime
 from tvm.contrib.download import download_testdata
 
@@ -102,8 +103,7 @@ relay_mod, params = relay.frontend.from_tflite(
 # using AOT host driven executor. We use the host micro target which is for running a model
 # on x86 CPU using CRT runtime or running a model with Zephyr platform on qemu_x86 simulator
 # board. In the case of a physical microcontroller, we get the target model for the physical
-# board (E.g. nucleo_l4r5zi) and pass it to `tvm.target.target.micro` to create a full
-# micro target.
+# board (E.g. nucleo_l4r5zi) and change `BOARD` to supported Zephyr board.
 #
 
 # Use the C runtime (crt) and enable static linking by setting system-lib to True
@@ -111,18 +111,15 @@ RUNTIME = Runtime("crt", {"system-lib": True})
 
 # Simulate a microcontroller on the host machine. Uses the main() from `src/runtime/crt/host/main.cc`.
 # To use physical hardware, replace "host" with something matching your hardware.
-TARGET = tvm.target.target.micro("host")
+TARGET = tvm.micro.testing.get_target("crt")
 
 # Use the AOT executor rather than graph or vm executors. Don't use unpacked API or C calling style.
 EXECUTOR = Executor("aot")
 
 if use_physical_hw:
-    boards_file = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) / "boards.json"
-    with open(boards_file) as f:
-        boards = json.load(f)
     BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_l4r5zi")
     SERIAL = os.getenv("TVM_MICRO_SERIAL", default=None)
-    TARGET = tvm.target.target.micro(boards[BOARD]["model"])
+    TARGET = tvm.micro.testing.get_target("zephyr", BOARD)
 
 ######################################################################
 # Compile the model
diff --git a/docs/_images/sphx_glr_micro_reference_vm_thumb.png b/docs/_images/sphx_glr_micro_reference_vm_thumb.png
deleted file mode 100644
index 8a5fed589d..0000000000
Binary files a/docs/_images/sphx_glr_micro_reference_vm_thumb.png and /dev/null differ
diff --git a/docs/_images/sphx_glr_micro_train_001.png b/docs/_images/sphx_glr_micro_train_001.png
index fb3c2850a3..b353dd858a 100644
Binary files a/docs/_images/sphx_glr_micro_train_001.png and b/docs/_images/sphx_glr_micro_train_001.png differ
diff --git a/docs/_images/sphx_glr_micro_train_thumb.png b/docs/_images/sphx_glr_micro_train_thumb.png
index 86defffe09..280b5c012d 100644
Binary files a/docs/_images/sphx_glr_micro_train_thumb.png and b/docs/_images/sphx_glr_micro_train_thumb.png differ
diff --git a/docs/_sources/how_to/compile_models/from_darknet.rst.txt b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
index 21f83283cf..39758c05da 100644
--- a/docs/_sources/how_to/compile_models/from_darknet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
@@ -318,7 +318,7 @@ The process is no different from other examples.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  20.114 seconds)
+   **Total running time of the script:** ( 1 minutes  16.820 seconds)
 
 
 .. _sphx_glr_download_how_to_compile_models_from_darknet.py:
diff --git a/docs/_sources/how_to/compile_models/from_keras.rst.txt b/docs/_sources/how_to/compile_models/from_keras.rst.txt
index ae2f4536e9..a45508a1f4 100644
--- a/docs/_sources/how_to/compile_models/from_keras.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_keras.rst.txt
@@ -232,7 +232,7 @@ Look up prediction top 1 index in 1000 class synset.
  .. code-block:: none
 
     Relay top-1 id: 285, class name: Egyptian cat
-
    1/1 [==============================] - ETA: 0s
    1/1 [==============================] - 1s 997ms/step
+
    1/1 [==============================] - ETA: 0s
    1/1 [==============================] - 1s 944ms/step
     Keras top-1 id: 285, class name: Egyptian cat
 
 
diff --git a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
index 0e6f5e4eb5..973d7b8498 100644
--- a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
@@ -116,7 +116,7 @@ In this section, we download a pretrained imagenet model and classify an image.
 
  .. code-block:: none
 
-    Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipffcd1a4f-2ae9-47b0-a873-3c22a873fc38 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+    Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipe52823d6-67b8-4970-a66d-061c7030a3da from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
     x (1, 3, 224, 224)
 
 
diff --git a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
index b165253c74..11db524d82 100644
--- a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
@@ -121,7 +121,7 @@ Load a pretrained OneFlow model and save model
  .. code-block:: none
 
     Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
-
      0%|          | 0.00/41.5M [00:00<?, ?B/s]
     15%|#5        | 6.33M/41.5M [00:00<00:00, 49.9MB/s]
     27%|##6       | 11.1M/41.5M [00:00<00:00, 39.5MB/s]
     44%|####4     | 18.3M/41.5M [00:00<00:00, 52.7MB/s]
     58%|#####7    | 24.0M/41.5M [00:00<00:00, 53.6MB/s]
     77%|#######7  | 32.0M/41.5M [00:00<00:00, 56.1MB/s]
     96%|#########6| 40.0M/41.5M [00:00<00:00, 60.5MB/s]
    100%|##########| 41.5M/41.5M [00:00<00:00, 57.1MB/s]
+
      0%|          | 0.00/41.5M [00:00<?, ?B/s]
     15%|#5        | 6.33M/41.5M [00:00<00:00, 57.4MB/s]
     28%|##8       | 11.8M/41.5M [00:00<00:00, 50.0MB/s]
     40%|####      | 16.6M/41.5M [00:00<00:00, 42.4MB/s]
     54%|#####3    | 22.3M/41.5M [00:00<00:00, 46.1MB/s]
     65%|######4   | 26.8M/41.5M [00:00<00:00, 41.4MB/s]
     77%|#######7  | 32.0M/41.5M [00:00<00:00, 39.2MB/s]
    100%|##########| 41.5M/41.5M [00:00<00:00, 48.8MB/s]
 
 
 
diff --git a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
index 88d993eff3..dc7213c6cd 100644
--- a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
@@ -101,7 +101,7 @@ Load a pretrained PyTorch model
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=ResNet18_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet18_Weights.DEFAULT` to get the most up-to-date weights.
       warnings.warn(msg)
     Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
-
      0%|          | 0.00/44.7M [00:00<?, ?B/s]
     25%|##4       | 11.0M/44.7M [00:00<00:00, 116MB/s]
     51%|#####     | 22.6M/44.7M [00:00<00:00, 119MB/s]
     76%|#######6  | 34.0M/44.7M [00:00<00:00, 110MB/s]
    100%|#########9| 44.5M/44.7M [00:00<00:00, 106MB/s]
    100%|##########| 44.7M/44.7M [00:00<00:00, 109MB/s]
+
      0%|          | 0.00/44.7M [00:00<?, ?B/s]
     18%|#8        | 8.12M/44.7M [00:00<00:00, 71.4MB/s]
     42%|####2     | 18.9M/44.7M [00:00<00:00, 94.2MB/s]
     72%|#######1  | 32.0M/44.7M [00:00<00:00, 95.5MB/s]
    100%|##########| 44.7M/44.7M [00:00<00:00, 104MB/s] 
 
 
 
diff --git a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
index 6044af7dbb..159267ec28 100644
--- a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
@@ -424,7 +424,7 @@ Run the corresponding model on tensorflow
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  26.367 seconds)
+   **Total running time of the script:** ( 1 minutes  20.392 seconds)
 
 
 .. _sphx_glr_download_how_to_compile_models_from_tensorflow.py:
diff --git a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
index 83d6a63e16..9ddab8bdff 100644
--- a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
@@ -5,26 +5,26 @@
 
 Computation times
 =================
-**06:42.659** total execution time for **how_to_compile_models** files:
+**06:19.372** total execution time for **how_to_compile_models** files:
 
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:26.367 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:20.392 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``)       | 01:20.114 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``)       | 01:16.820 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``)         | 00:56.607 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``)         | 00:51.752 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``)       | 00:37.207 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``)       | 00:35.713 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``)         | 00:32.225 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``)         | 00:30.081 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``)           | 00:31.423 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``)           | 00:29.950 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``)         | 00:27.907 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``)         | 00:27.477 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``)       | 00:25.628 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``)       | 00:24.333 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``)           | 00:22.457 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``)           | 00:20.219 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``)             | 00:02.724 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``)             | 00:02.635 | 0.0 MB |
 +-----------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt
index 3569f57df4..5e83d4977f 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt
@@ -727,7 +727,7 @@ well as provides information about the model's performance
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-     3345.3108    3345.1042    3347.3534    3344.2457      0.9430   
+     3339.0634    3338.9141    3341.0578    3337.0339      1.2823   
                
 
 
@@ -736,7 +736,7 @@ well as provides information about the model's performance
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  4.549 seconds)
+   **Total running time of the script:** ( 1 minutes  2.822 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_model_on_adreno.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
index 8b29b978c7..7c8684c3ad 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
@@ -437,7 +437,7 @@ Execute on TVM
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      16.3370      16.3538      16.5308      16.1540       0.1121   
+      15.7339      15.7292      15.8864      15.6243       0.0913   
                
 
 
diff --git a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
index 78792b34b6..60fa2b1041 100644
--- a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
@@ -130,7 +130,7 @@ Load pre-trained maskrcnn from torchvision and do tracing
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=MaskRCNN_ResNet50_FPN_Weights.COCO_V1`. You can also use `weights=MaskRCNN_ResNet50_FPN_Weights.DEFAULT` to get the most up-to-date weights.
       warnings.warn(msg)
     Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
-
      0%|          | 0.00/170M [00:00<?, ?B/s]
      5%|5         | 9.27M/170M [00:00<00:01, 97.2MB/s]
     11%|#         | 18.5M/170M [00:00<00:02, 73.6MB/s]
     15%|#5        | 25.9M/170M [00:00<00:02, 65.8MB/s]
     20%|##        | 34.1M/170M [00:00<00:02, 64.2MB/s]
     25%|##4       | 42.1M/170M [00:00<00:02, 63.9MB/s]
     33%|###2      | 56.0M/170M [00:00<00:01, 72.6MB/s]
     41%|####1     | 69.9M/170M [00:00<00:01, 90.8MB/s]
     47%|####6     | 79.1M/170M [00:01<00:01, 86.7MB/s]
     52%|#####1    | 88.0M/170M [00:01<00:00, 88.7MB/s]
     57%|#####6    | 96.8M/170M [00:01<00:01, 76.5MB/s]
     64%|######3   | 108M/170M [00:01<00:00, 86.7MB/s] 
     71%|#######   | 120M/170M [00:01<00:00, 86.4MB/s]
     77%|#######6  | 130M/170M [00:01<00:00, 90.4MB/s]
     82%|########1 | 139M/170M [00:01<00:00, 69.0MB/s]
     87%|########6 | 147M/170M [00:01<00:00, 73.1MB/s]
     93%|#########3| 158M/170M [00:02<00:00, 82.6MB/s]
     98%|#########8| 167M/170M [00:02<00:00, 81.9MB/s]
   
  100%|##########| 170M/170M [00:02<00:00, 78.5MB/s]
+
      0%|          | 0.00/170M [00:00<?, ?B/s]
      4%|3         | 6.30M/170M [00:00<00:02, 66.0MB/s]
      7%|7         | 12.6M/170M [00:00<00:02, 56.4MB/s]
     11%|#         | 18.1M/170M [00:00<00:03, 43.0MB/s]
     14%|#4        | 24.0M/170M [00:00<00:03, 44.6MB/s]
     19%|#8        | 32.0M/170M [00:00<00:02, 52.8MB/s]
     24%|##3       | 40.0M/170M [00:00<00:02, 47.4MB/s]
     28%|##8       | 48.0M/170M [00:01<00:02, 48.2MB/s]
     36%|###6      | 61.5M/170M [00:01<00:01, 69.1MB/s]
     42%|####2     | 72.0M/170M [00:01<00:01, 70.4MB/s]
     47%|####7     | 80.0M/170M [00:01<00:01, 72.1MB/s]
     52%|#####1    | 88.0M/170M [00:01<00:01, 71.8MB/s]
     56%|#####6    | 95.2M/170M [00:01<00:01, 64.8MB/s]
     60%|#####9    | 102M/170M [00:01<00:01, 64.8MB/s] 
     64%|######3   | 108M/170M [00:01<00:01, 50.6MB/s]
     68%|######8   | 116M/170M [00:02<00:00, 57.4MB/s]
     72%|#######1  | 122M/170M [00:02<00:00, 56.3MB/s]
     75%|#######5  | 128M/170M [00:02<00:00, 50.2MB/s]
 
     80%|########  | 136M/170M [00:02<00:00, 58.2MB/s]
     89%|########8 | 150M/170M [00:02<00:00, 81.4MB/s]
     94%|#########3| 159M/170M [00:02<00:00, 74.5MB/s]
     98%|#########8| 167M/170M [00:02<00:00, 62.8MB/s]
    100%|##########| 170M/170M [00:02<00:00, 60.7MB/s]
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/nn/functional.py:3897: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
       for i in range(dim)
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/detection/anchor_utils.py:124: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -299,7 +299,7 @@ Get boxes with score larger than 0.9
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 3 minutes  43.340 seconds)
+   **Total running time of the script:** ( 3 minutes  22.809 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_object_detection_pytorch.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
index f3454570b1..3b2ad3dabf 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
@@ -227,7 +227,7 @@ training. Other models require a full post training calibration.
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=MobileNet_V2_Weights.IMAGENET1K_V1`. You can also use `weights=MobileNet_V2_Weights.DEFAULT` to get the most up-to-date weights.
       warnings.warn(msg)
     Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
-
      0%|          | 0.00/13.6M [00:00<?, ?B/s]
     59%|#####8    | 7.99M/13.6M [00:00<00:00, 47.4MB/s]
     92%|#########2| 12.5M/13.6M [00:00<00:00, 45.0MB/s]
    100%|##########| 13.6M/13.6M [00:00<00:00, 48.3MB/s]
+
      0%|          | 0.00/13.6M [00:00<?, ?B/s]
     59%|#####8    | 7.99M/13.6M [00:00<00:00, 72.2MB/s]
    100%|##########| 13.6M/13.6M [00:00<00:00, 86.3MB/s]
 
 
 
@@ -409,7 +409,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      90.5697      90.4384      95.3423      90.2346       0.6346   
+      90.0964      89.9840      95.3425      89.8630       0.5670   
                
 
 
@@ -458,7 +458,7 @@ TODO
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  17.620 seconds)
+   **Total running time of the script:** ( 1 minutes  12.482 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_prequantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
index 5fc270448f..5f8f018154 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
@@ -423,7 +423,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      121.5602     121.5118     123.0774     120.6707      0.4584   
+      118.2268     118.0605     120.5108     116.8578      0.7670   
                
 
 
@@ -460,7 +460,7 @@ Here we give an example of how to measure performance of TVM compiled models.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 2 minutes  36.672 seconds)
+   **Total running time of the script:** ( 2 minutes  36.205 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_prequantized_tflite.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
index 82441d0612..e006e4baa6 100644
--- a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
@@ -257,7 +257,7 @@ We create a Relay VM to build and execute the model.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  39.242 seconds)
+   **Total running time of the script:** ( 1 minutes  36.839 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_quantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
index ff3f8b182b..f95b89ff19 100644
--- a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
@@ -170,7 +170,7 @@ Convert and compile model for CPU.
             data: None
       input_sym_arg_type = in_param.infer_type()[0]
     Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
-
      0%|          | 0/132723 [00:00<?, ?KB/s]
      4%|4         | 5406/132723 [00:00<00:02, 54053.61KB/s]
      9%|9         | 12568/132723 [00:00<00:01, 64303.17KB/s]
     15%|#5        | 19913/132723 [00:00<00:01, 68474.63KB/s]
     20%|##        | 27147/132723 [00:00<00:01, 69997.59KB/s]
     26%|##5       | 34473/132723 [00:00<00:01, 71127.78KB/s]
     31%|###1      | 41630/132723 [00:00<00:01, 71276.62KB/s]
     37%|###6      | 48911/132723 [00:00<00:01, 71771.12KB/s]
     42%|####2     | 56125/132723 [00:00<00:01, 71885.15KB/s]
     48%|####7     | 63353/132723 [00:00<00:00, 72006.43KB/s]
     53%|#####3    | 70562/132723 [00:01<00:00, 72028.11KB/s]
     59%|#####8    | 77765/132723 [00:01<00:00, 71671.32KB/s]
     64%|######4   | 85525/132723 [00:01<00:00, 73465.63KB/s]
     70%|#######   | 93176/132723 [00:01<00:00, 74383.10KB/s]
     76%|#######6  | 100953/132723 [00:01<00:00, 75402.56KB/s]
     82%|########1 | 108602/132723 [00:01<00:00, 75728.05KB/s]
     88%|########7
  | 116261/132723 [00:01<00:00, 75984.24KB/s]
     93%|#########3| 124018/132723 [00:01<00:00, 76312.12KB/s]
    100%|#########9| 132220/132723 [00:01<00:00, 78022.00KB/s]
    100%|##########| 132723/132723 [00:01<00:00, 73286.92KB/s]
+
      0%|          | 0/132723 [00:00<?, ?KB/s]
      5%|4         | 6432/132723 [00:00<00:01, 64315.00KB/s]
     11%|#         | 14292/132723 [00:00<00:01, 72714.86KB/s]
     17%|#6        | 22172/132723 [00:00<00:01, 75485.50KB/s]
     23%|##2       | 30013/132723 [00:00<00:01, 76634.87KB/s]
     28%|##8       | 37677/132723 [00:00<00:01, 75788.39KB/s]
     34%|###4      | 45625/132723 [00:00<00:01, 77028.58KB/s]
     40%|####      | 53568/132723 [00:00<00:01, 77807.97KB/s]
     46%|####6     | 61505/132723 [00:00<00:00, 78300.89KB/s]
     52%|#####2    | 69457/132723 [00:00<00:00, 78677.13KB/s]
     58%|#####8    | 77480/132723 [00:01<00:00, 79153.48KB/s]
     64%|######4   | 85408/132723 [00:01<00:00, 79190.85KB/s]
     70%|#######   | 93328/132723 [00:01<00:00, 79153.56KB/s]
     76%|#######6  | 101256/132723 [00:01<00:00, 79190.82KB/s]
     82%|########2 | 109236/132723 [00:01<00:00, 79371.81KB/s]
     88%|########8 | 117211/132723 [00:01<00:00, 79483.38KB/s]
     94%|########
 #4| 125228/132723 [00:01<00:00, 79688.28KB/s]
    100%|##########| 132723/132723 [00:01<00:00, 78195.11KB/s]
 
 
 
@@ -246,7 +246,7 @@ Display result
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 3 minutes  42.908 seconds)
+   **Total running time of the script:** ( 3 minutes  30.899 seconds)
 
 
 .. _sphx_glr_download_how_to_deploy_models_deploy_ssd_gluoncv.py:
diff --git a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
index 6ee6bd8d08..3c629af22d 100644
--- a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
@@ -5,26 +5,26 @@
 
 Computation times
 =================
-**15:45.104** total execution time for **how_to_deploy_models** files:
+**14:56.331** total execution time for **how_to_deploy_models** files:
 
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 03:43.340 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)                           | 03:30.899 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``)                           | 03:42.908 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 03:22.809 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)           | 02:36.672 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``)           | 02:36.205 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``)                               | 01:39.242 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``)                               | 01:36.839 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``)                         | 01:17.620 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``)                         | 01:12.482 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_adreno.py` (``deploy_model_on_adreno.py``)                   | 01:04.549 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_adreno.py` (``deploy_model_on_adreno.py``)                   | 01:02.822 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``)                 | 00:43.076 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``)                 | 00:39.952 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``)                       | 00:29.035 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``)                       | 00:27.314 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)                       | 00:28.657 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``)                       | 00:27.002 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``)                                     | 00:00.006 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
index 21160598b4..8da6092f67 100644
--- a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
@@ -463,7 +463,7 @@ First let us define two helper functions to get the mobilenet model and a cat im
 
  .. code-block:: none
 
-    Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip6daa7b75-ff03-48fa-9986-16dd6f921aee from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+    Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip4e80f2d3-e9d7-4777-a3fc-910c963a2b33 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
 
 
 
diff --git a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
index c4560db149..ce003a1751 100644
--- a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
 
 Computation times
 =================
-**00:56.270** total execution time for **how_to_extend_tvm** files:
+**00:51.788** total execution time for **how_to_extend_tvm** files:
 
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:52.287 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:48.088 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``)           | 00:02.848 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``)           | 00:02.643 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``)                     | 00:01.128 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``)                     | 00:01.049 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``)       | 00:00.007 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``)       | 00:00.008 | 0.0 MB |
 +-------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
index 70e0a47329..cacf10793f 100644
--- a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
@@ -220,10 +220,10 @@ profile the execution time of each passes.
  .. code-block:: none
 
     Printing results of timing profile...
-    InferType: 21703us [21703us] (47.86%; 47.86%)
-    FoldScaleAxis: 23641us [10us] (52.14%; 52.14%)
-            FoldConstant: 23631us [1853us] (52.11%; 99.96%)
-                    InferType: 21778us [21778us] (48.03%; 92.16%)
+    InferType: 21097us [21097us] (48.81%; 48.81%)
+    FoldScaleAxis: 22125us [6us] (51.19%; 51.19%)
+            FoldConstant: 22118us [1665us] (51.17%; 99.97%)
+                    InferType: 20454us [20454us] (47.32%; 92.47%)
 
 
 
@@ -262,10 +262,10 @@ Refer to following sections and :py:func:`tvm.instrument.pass_instrument` for th
  .. code-block:: none
 
     Printing results of timing profile...
-    InferType: 21797us [21797us] (48.65%; 48.65%)
-    FoldScaleAxis: 23003us [8us] (51.35%; 51.35%)
-            FoldConstant: 22995us [1841us] (51.33%; 99.97%)
-                    InferType: 21155us [21155us] (47.22%; 92.00%)
+    InferType: 20549us [20549us] (48.45%; 48.45%)
+    FoldScaleAxis: 21861us [5us] (51.55%; 51.55%)
+            FoldConstant: 21856us [1662us] (51.54%; 99.98%)
+                    InferType: 20194us [20194us] (47.62%; 92.40%)
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
index 92fa0fae6b..1560fc39e8 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
@@ -331,7 +331,7 @@ latency of convolution.
 
  .. code-block:: none
 
-    Convolution: 52.344928 ms
+    Convolution: 49.075649 ms
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
index d9123773fe..5a2f814874 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
@@ -608,7 +608,7 @@ be able to run on our build server
 
  .. code-block:: none
 
-    conv2d with tensor core: 7.066883 ms
+    conv2d with tensor core: 10.195248 ms
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
index b2f6031a4f..56c0ef31c0 100644
--- a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
@@ -134,8 +134,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
 
  .. code-block:: none
 
-    Numpy running time: 0.018969
-    Baseline: 3.226941
+    Numpy running time: 0.017991
+    Baseline: 3.421825
 
 
 
@@ -227,7 +227,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
 
  .. code-block:: none
 
-    Opt1: 0.314088
+    Opt1: 0.298870
 
 
 
@@ -318,7 +318,7 @@ In this tutorial, we chose to vectorize the inner loop row data since it is cach
 
  .. code-block:: none
 
-    Opt2: 0.343279
+    Opt2: 0.335195
 
 
 
@@ -406,7 +406,7 @@ the access pattern for A matrix is more cache friendly.
 
  .. code-block:: none
 
-    Opt3: 0.132868
+    Opt3: 0.114142
 
 
 
@@ -523,7 +523,7 @@ flattening.
 
  .. code-block:: none
 
-    Opt4: 0.109901
+    Opt4: 0.110151
 
 
 
@@ -635,7 +635,7 @@ write to C when all the block results are ready.
 
  .. code-block:: none
 
-    Opt5: 0.111493
+    Opt5: 0.111765
 
 
 
@@ -748,7 +748,7 @@ Furthermore, we can also utilize multi-core processors to do the thread-level pa
 
  .. code-block:: none
 
-    Opt6: 0.148603
+    Opt6: 0.146912
 
 
 
diff --git a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
index c452b152a1..7d085b360f 100644
--- a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
 
 Computation times
 =================
-**00:35.066** total execution time for **how_to_optimize_operators** files:
+**00:34.796** total execution time for **how_to_optimize_operators** files:
 
 +-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``)                       | 00:32.429 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``)                       | 00:32.247 | 0.0 MB |
 +-----------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.484 | 0.0 MB |
 +-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``)             | 00:01.153 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``)             | 00:01.065 | 0.0 MB |
 +-----------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
index a549040038..86cd480cb2 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
@@ -5,18 +5,18 @@
 
 Computation times
 =================
-**09:49.275** total execution time for **how_to_tune_with_autoscheduler** files:
+**09:15.573** total execution time for **how_to_tune_with_autoscheduler** files:
 
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 05:49.007 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 05:35.959 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``)             | 01:42.967 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``)             | 01:38.441 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``)           | 01:07.441 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``)           | 01:05.335 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``)               | 00:41.045 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``)               | 00:29.087 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``)             | 00:15.132 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``)             | 00:13.906 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``)           | 00:13.683 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``)           | 00:12.845 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
index 89100f9483..959d2688a1 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
@@ -244,787 +244,1119 @@ cooperative fetching, unrolling and operator fusion.
         def main(data: T.Buffer((1, 512, 7, 7), "float32"), kernel: T.Buffer((512, 512, 3, 3), "float32"), bias: T.Buffer((1, 512, 1, 1), "float32"), compute: T.Buffer((1, 512, 7, 7), "float32")):
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
             blockIdx_x = T.env_thread("blockIdx.x")
-            T.launch_thread(blockIdx_x, 28)
-            conv2d_nchw = T.allocate([8], "float32", "local")
-            pad_temp_shared = T.allocate([216], "float32", "shared")
-            kernel_shared = T.allocate([9216], "float32", "shared")
+            T.launch_thread(blockIdx_x, 32)
+            conv2d_nchw = T.allocate([14], "float32", "local")
+            pad_temp_shared = T.allocate([648], "float32", "shared")
+            kernel_shared = T.allocate([1152], "float32", "shared")
             threadIdx_x = T.env_thread("threadIdx.x")
-            T.launch_thread(threadIdx_x, 112)
-            conv2d_nchw_1 = T.Buffer((1,), data=conv2d_nchw, scope="local", align=4)
+            T.launch_thread(threadIdx_x, 56)
+            conv2d_nchw_1 = T.Buffer((14,), data=conv2d_nchw, scope="local", align=32)
             conv2d_nchw_1[0] = T.float32(0)
+            conv2d_nchw_1[7] = T.float32(0)
             conv2d_nchw_1[1] = T.float32(0)
+            conv2d_nchw_1[8] = T.float32(0)
             conv2d_nchw_1[2] = T.float32(0)
+            conv2d_nchw_1[9] = T.float32(0)
             conv2d_nchw_1[3] = T.float32(0)
+            conv2d_nchw_1[10] = T.float32(0)
             conv2d_nchw_1[4] = T.float32(0)
+            conv2d_nchw_1[11] = T.float32(0)
             conv2d_nchw_1[5] = T.float32(0)
+            conv2d_nchw_1[12] = T.float32(0)
             conv2d_nchw_1[6] = T.float32(0)
-            conv2d_nchw_1[7] = T.float32(0)
+            conv2d_nchw_1[13] = T.float32(0)
             for rc_outer_outer in range(64):
                 cse_var_1: T.int32 = rc_outer_outer * 72
                 threadIdx_x_1 = T.env_thread("threadIdx.x")
-                pad_temp_shared_1 = T.Buffer((216,), data=pad_temp_shared, scope="shared")
-                data_1 = T.Buffer((25088,), data=data.data)
-                with T.launch_thread(threadIdx_x_1, 112):
-                    pad_temp_shared_1[threadIdx_x_1] = T.if_then_else(3 <= threadIdx_x_1 % 27 and threadIdx_x_1 % 27 < 24 and 1 <= blockIdx_x % 7 + threadIdx_x_1 % 3 and blockIdx_x % 7 + threadIdx_x_1 % 3 < 8, data_1[rc_outer_outer * 392 + threadIdx_x_1 // 27 * 49 + threadIdx_x_1 % 27 // 3 * 7 + blockIdx_x % 7 + threadIdx_x_1 % 3 - 8], T.float32(0))
-                with T.launch_thread(threadIdx_x_1, 112):
-                    if T.likely(threadIdx_x_1 < 104):
-                        pad_temp_shared_1[threadIdx_x_1 + 112] = T.if_then_else(3 <= (threadIdx_x_1 + 4) % 27 and (threadIdx_x_1 + 4) % 27 < 24 and 1 <= blockIdx_x % 7 + (threadIdx_x_1 + 1) % 3 and blockIdx_x % 7 + (threadIdx_x_1 + 1) % 3 < 8, data_1[rc_outer_outer * 392 + (threadIdx_x_1 + 112) // 27 * 49 + (threadIdx_x_1 + 4) % 27 // 3 * 7 + blockIdx_x % 7 + (threadIdx_x_1 + 1) % 3 - 8], T.float32(0))
+                pad_temp_shared_1 = T.Buffer((648,), data=pad_temp_shared, scope="shared")
+                with T.launch_thread(threadIdx_x_1, 56):
+                    if T.likely(threadIdx_x_1 < 24):
+                        pad_temp_shared_1[threadIdx_x_1 * 27] = T.float32(0)
+                        data_1 = T.Buffer((25088,), data=data.data)
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 1] = T.if_then_else(1 <= threadIdx_x_1 % 3, data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 - 7], T.float32(0))
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 2] = T.if_then_else(1 <= threadIdx_x_1 % 3, data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 - 6], T.float32(0))
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 3] = T.if_then_else(1 <= threadIdx_x_1 % 3, data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 - 5], T.float32(0))
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 4] = T.if_then_else(1 <= threadIdx_x_1 % 3, data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 - 4], T.float32(0))
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 5] = T.if_then_else(1 <= threadIdx_x_1 % 3, data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 - 3], T.float32(0))
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 6] = T.if_then_else(1 <= threadIdx_x_1 % 3, data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 - 2], T.float32(0))
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 7] = T.if_then_else(1 <= threadIdx_x_1 % 3, data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 - 1], T.float32(0))
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 8] = T.float32(0)
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 9] = T.float32(0)
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 10] = data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21]
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 11] = data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 + 1]
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 12] = data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 + 2]
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 13] = data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 + 3]
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 14] = data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 + 4]
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 15] = data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 + 5]
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 16] = data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 + 6]
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 17] = T.float32(0)
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 18] = T.float32(0)
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 19] = T.if_then_else(threadIdx_x_1 % 3 < 2, data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 + 7], T.float32(0))
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 20] = T.if_then_else(threadIdx_x_1 % 3 < 2, data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 + 8], T.float32(0))
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 21] = T.if_then_else(threadIdx_x_1 % 3 < 2, data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 + 9], T.float32(0))
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 22] = T.if_then_else(threadIdx_x_1 % 3 < 2, data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 + 10], T.float32(0))
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 23] = T.if_then_else(threadIdx_x_1 % 3 < 2, data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 + 11], T.float32(0))
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 24] = T.if_then_else(threadIdx_x_1 % 3 < 2, data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 + 12], T.float32(0))
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 25] = T.if_then_else(threadIdx_x_1 % 3 < 2, data_1[rc_outer_outer * 392 + threadIdx_x_1 // 3 * 49 + threadIdx_x_1 % 3 * 21 + 13], T.float32(0))
+                        pad_temp_shared_1[threadIdx_x_1 * 27 + 26] = T.float32(0)
                 threadIdx_x_2 = T.env_thread("threadIdx.x")
-                kernel_shared_1 = T.Buffer((9216,), data=kernel_shared, scope="shared")
+                kernel_shared_1 = T.Buffer((1152,), data=kernel_shared, scope="shared")
                 kernel_1 = T.Buffer((2359296,), data=kernel.data)
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 72 * 4608 + cse_var_1 + threadIdx_x_2 % 72]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 112] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 112) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 40) % 72 // 9 * 9 + (threadIdx_x_2 + 4) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 224] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 224) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 8) % 72 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 336] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 336) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 48) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 1) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 448] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 448) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 16) % 72 // 9 * 9 + (threadIdx_x_2 + 7) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 560] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 560) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 56) % 72 // 9 * 9 + (threadIdx_x_2 + 2) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 672] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 672) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 24) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 2) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 784] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 784) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 64) % 72 // 9 * 9 + (threadIdx_x_2 + 1) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 896] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 896) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 32) % 72 // 9 * 9 + (threadIdx_x_2 + 5) % 9 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 1008] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 72 * 4608 + cse_var_1 + threadIdx_x_2 % 72 + 64512]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 1120] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1120) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 40) % 72 // 9 * 9 + (threadIdx_x_2 + 4) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 1232] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1232) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 8) % 72 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 1344] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1344) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 48) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 1) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 1456] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1456) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 16) % 72 // 9 * 9 + (threadIdx_x_2 + 7) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 1568] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1568) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 56) % 72 // 9 * 9 + (threadIdx_x_2 + 2) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 1680] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1680) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 24) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 2) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 1792] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1792) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 64) % 72 // 9 * 9 + (threadIdx_x_2 + 1) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 1904] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1904) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 32) % 72 // 9 * 9 + (threadIdx_x_2 + 5) % 9 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 2016] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 72 * 4608 + cse_var_1 + threadIdx_x_2 % 72 + 129024]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 2128] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2128) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 40) % 72 // 9 * 9 + (threadIdx_x_2 + 4) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 2240] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2240) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 8) % 72 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 2352] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2352) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 48) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 1) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 2464] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2464) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 16) % 72 // 9 * 9 + (threadIdx_x_2 + 7) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 2576] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2576) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 56) % 72 // 9 * 9 + (threadIdx_x_2 + 2) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 2688] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2688) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 24) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 2) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 2800] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2800) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 64) % 72 // 9 * 9 + (threadIdx_x_2 + 1) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 2912] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2912) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 32) % 72 // 9 * 9 + (threadIdx_x_2 + 5) % 9 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 3024] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 72 * 4608 + cse_var_1 + threadIdx_x_2 % 72 + 193536]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 3136] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 3136) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 40) % 72 // 9 * 9 + (threadIdx_x_2 + 4) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 3248] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 3248) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 8) % 72 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 3360] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 3360) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 48) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 1) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 3472] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 3472) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 16) % 72 // 9 * 9 + (threadIdx_x_2 + 7) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 3584] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 3584) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 56) % 72 // 9 * 9 + (threadIdx_x_2 + 2) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 3696] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 3696) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 24) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 2) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 3808] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 3808) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 64) % 72 // 9 * 9 + (threadIdx_x_2 + 1) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 3920] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 3920) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 32) % 72 // 9 * 9 + (threadIdx_x_2 + 5) % 9 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 4032] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 72 * 4608 + cse_var_1 + threadIdx_x_2 % 72 + 258048]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 4144] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 4144) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 40) % 72 // 9 * 9 + (threadIdx_x_2 + 4) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 4256] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 4256) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 8) % 72 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 4368] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 4368) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 48) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 1) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 4480] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 4480) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 16) % 72 // 9 * 9 + (threadIdx_x_2 + 7) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 4592] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 4592) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 56) % 72 // 9 * 9 + (threadIdx_x_2 + 2) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 4704] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 4704) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 24) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 2) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 4816] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 4816) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 64) % 72 // 9 * 9 + (threadIdx_x_2 + 1) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 4928] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 4928) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 32) % 72 // 9 * 9 + (threadIdx_x_2 + 5) % 9 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 5040] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 72 * 4608 + cse_var_1 + threadIdx_x_2 % 72 + 322560]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 5152] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 5152) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 40) % 72 // 9 * 9 + (threadIdx_x_2 + 4) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 5264] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 5264) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 8) % 72 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 5376] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 5376) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 48) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 1) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 5488] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 5488) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 16) % 72 // 9 * 9 + (threadIdx_x_2 + 7) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 5600] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 5600) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 56) % 72 // 9 * 9 + (threadIdx_x_2 + 2) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 5712] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 5712) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 24) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 2) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 5824] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 5824) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 64) % 72 // 9 * 9 + (threadIdx_x_2 + 1) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 5936] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 5936) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 32) % 72 // 9 * 9 + (threadIdx_x_2 + 5) % 9 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 6048] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 72 * 4608 + cse_var_1 + threadIdx_x_2 % 72 + 387072]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 6160] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 6160) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 40) % 72 // 9 * 9 + (threadIdx_x_2 + 4) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 6272] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 6272) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 8) % 72 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 6384] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 6384) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 48) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 1) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 6496] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 6496) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 16) % 72 // 9 * 9 + (threadIdx_x_2 + 7) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 6608] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 6608) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 56) % 72 // 9 * 9 + (threadIdx_x_2 + 2) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 6720] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 6720) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 24) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 2) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 6832] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 6832) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 64) % 72 // 9 * 9 + (threadIdx_x_2 + 1) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 6944] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 6944) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 32) % 72 // 9 * 9 + (threadIdx_x_2 + 5) % 9 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 7056] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 72 * 4608 + cse_var_1 + threadIdx_x_2 % 72 + 451584]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 7168] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 7168) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 40) % 72 // 9 * 9 + (threadIdx_x_2 + 4) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 7280] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 7280) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 8) % 72 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 7392] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 7392) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 48) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 1) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 7504] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 7504) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 16) % 72 // 9 * 9 + (threadIdx_x_2 + 7) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 7616] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 7616) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 56) % 72 // 9 * 9 + (threadIdx_x_2 + 2) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 7728] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 7728) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 24) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 2) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 7840] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 7840) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 64) % 72 // 9 * 9 + (threadIdx_x_2 + 1) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 7952] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 7952) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 32) % 72 // 9 * 9 + (threadIdx_x_2 + 5) % 9 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 8064] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 72 * 4608 + cse_var_1 + threadIdx_x_2 % 72 + 516096]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 8176] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 8176) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 40) % 72 // 9 * 9 + (threadIdx_x_2 + 4) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 8288] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 8288) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 8) % 72 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 8400] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 8400) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 48) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 1) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 8512] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 8512) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 16) % 72 // 9 * 9 + (threadIdx_x_2 + 7) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 8624] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 8624) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 56) % 72 // 9 * 9 + (threadIdx_x_2 + 2) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 8736] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 8736) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 24) % 72 // 9 * 9 + (threadIdx_x_2 // 3 + 2) % 3 * 3 + threadIdx_x_2 % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 8848] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 8848) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 64) % 72 // 9 * 9 + (threadIdx_x_2 + 1) % 9]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 8960] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 8960) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 32) % 72 // 9 * 9 + (threadIdx_x_2 + 5) % 9 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
-                with T.launch_thread(threadIdx_x_2, 112):
-                    kernel_shared_1[threadIdx_x_2 + 9072] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 72 * 4608 + cse_var_1 + threadIdx_x_2 % 72 + 580608]
-                with T.launch_thread(threadIdx_x_2, 112):
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2] = kernel_1[blockIdx_x * 73728 + cse_var_1 + threadIdx_x_2]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 56] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 56) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 56) % 72 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 112] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 112) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 40) % 72 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 168] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 168) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 // 3 + 8) % 24 * 3 + threadIdx_x_2 % 3]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 224] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 224) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 8) % 72 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 280] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 280) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 64) % 72 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 336] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 336) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 // 3 + 16) % 24 * 3 + threadIdx_x_2 % 3]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 392] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 392) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 32) % 72 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 448] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 448) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 16) % 72 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 504] = kernel_1[blockIdx_x * 73728 + cse_var_1 + threadIdx_x_2 + 32256]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 560] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 560) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 56) % 72 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 616] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 616) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 40) % 72 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 672] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 672) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 // 3 + 8) % 24 * 3 + threadIdx_x_2 % 3]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 728] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 728) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 8) % 72 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 784] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 784) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 64) % 72 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 840] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 840) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 // 3 + 16) % 24 * 3 + threadIdx_x_2 % 3]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 896] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 896) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 32) % 72 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 952] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 952) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 16) % 72 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 1008] = kernel_1[blockIdx_x * 73728 + cse_var_1 + threadIdx_x_2 + 64512]
+                with T.launch_thread(threadIdx_x_2, 56):
+                    kernel_shared_1[threadIdx_x_2 + 1064] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 1064) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 56) % 72 // 3 * 3 + (threadIdx_x_2 + 2) % 3]
+                with T.launch_thread(threadIdx_x_2, 56):
                     if T.likely(threadIdx_x_2 < 32):
-                        kernel_shared_1[threadIdx_x_2 + 9184] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 9184) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 40) % 72 // 9 * 9 + (threadIdx_x_2 + 4) % 9 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3] * kernel_shared_1[threadIdx_x // 7 * 72]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3] * kernel_shared_1[threadIdx_x // 7 * 72 + 1152]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3] * kernel_shared_1[threadIdx_x // 7 * 72 + 2304]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3] * kernel_shared_1[threadIdx_x // 7 * 72 + 3456]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3] * kernel_shared_1[threadIdx_x // 7 * 72 + 4608]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3] * kernel_shared_1[threadIdx_x // 7 * 72 + 5760]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3] * kernel_shared_1[threadIdx_x // 7 * 72 + 6912]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3] * kernel_shared_1[threadIdx_x // 7 * 72 + 8064]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 3] * kernel_shared_1[threadIdx_x // 7 * 72 + 3]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 3] * kernel_shared_1[threadIdx_x // 7 * 72 + 1155]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 3] * kernel_shared_1[threadIdx_x // 7 * 72 + 2307]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 3] * kernel_shared_1[threadIdx_x // 7 * 72 + 3459]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 3] * kernel_shared_1[threadIdx_x // 7 * 72 + 4611]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 3] * kernel_shared_1[threadIdx_x // 7 * 72 + 5763]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 3] * kernel_shared_1[threadIdx_x // 7 * 72 + 6915]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 3] * kernel_shared_1[threadIdx_x // 7 * 72 + 8067]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 6] * kernel_shared_1[threadIdx_x // 7 * 72 + 6]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 6] * kernel_shared_1[threadIdx_x // 7 * 72 + 1158]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 6] * kernel_shared_1[threadIdx_x // 7 * 72 + 2310]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 6] * kernel_shared_1[threadIdx_x // 7 * 72 + 3462]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 6] * kernel_shared_1[threadIdx_x // 7 * 72 + 4614]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 6] * kernel_shared_1[threadIdx_x // 7 * 72 + 5766]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 6] * kernel_shared_1[threadIdx_x // 7 * 72 + 6918]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 6] * kernel_shared_1[threadIdx_x // 7 * 72 + 8070]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 27] * kernel_shared_1[threadIdx_x // 7 * 72 + 9]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 27] * kernel_shared_1[threadIdx_x // 7 * 72 + 1161]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 27] * kernel_shared_1[threadIdx_x // 7 * 72 + 2313]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 27] * kernel_shared_1[threadIdx_x // 7 * 72 + 3465]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 27] * kernel_shared_1[threadIdx_x // 7 * 72 + 4617]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 27] * kernel_shared_1[threadIdx_x // 7 * 72 + 5769]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 27] * kernel_shared_1[threadIdx_x // 7 * 72 + 6921]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 27] * kernel_shared_1[threadIdx_x // 7 * 72 + 8073]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 30] * kernel_shared_1[threadIdx_x // 7 * 72 + 12]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 30] * kernel_shared_1[threadIdx_x // 7 * 72 + 1164]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 30] * kernel_shared_1[threadIdx_x // 7 * 72 + 2316]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 30] * kernel_shared_1[threadIdx_x // 7 * 72 + 3468]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 30] * kernel_shared_1[threadIdx_x // 7 * 72 + 4620]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 30] * kernel_shared_1[threadIdx_x // 7 * 72 + 5772]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 30] * kernel_shared_1[threadIdx_x // 7 * 72 + 6924]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 30] * kernel_shared_1[threadIdx_x // 7 * 72 + 8076]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 33] * kernel_shared_1[threadIdx_x // 7 * 72 + 15]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 33] * kernel_shared_1[threadIdx_x // 7 * 72 + 1167]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 33] * kernel_shared_1[threadIdx_x // 7 * 72 + 2319]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 33] * kernel_shared_1[threadIdx_x // 7 * 72 + 3471]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 33] * kernel_shared_1[threadIdx_x // 7 * 72 + 4623]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 33] * kernel_shared_1[threadIdx_x // 7 * 72 + 5775]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 33] * kernel_shared_1[threadIdx_x // 7 * 72 + 6927]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 33] * kernel_shared_1[threadIdx_x // 7 * 72 + 8079]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 54] * kernel_shared_1[threadIdx_x // 7 * 72 + 18]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 54] * kernel_shared_1[threadIdx_x // 7 * 72 + 1170]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 54] * kernel_shared_1[threadIdx_x // 7 * 72 + 2322]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 54] * kernel_shared_1[threadIdx_x // 7 * 72 + 3474]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 54] * kernel_shared_1[threadIdx_x // 7 * 72 + 4626]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 54] * kernel_shared_1[threadIdx_x // 7 * 72 + 5778]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 54] * kernel_shared_1[threadIdx_x // 7 * 72 + 6930]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 54] * kernel_shared_1[threadIdx_x // 7 * 72 + 8082]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 57] * kernel_shared_1[threadIdx_x // 7 * 72 + 21]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 57] * kernel_shared_1[threadIdx_x // 7 * 72 + 1173]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 57] * kernel_shared_1[threadIdx_x // 7 * 72 + 2325]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 57] * kernel_shared_1[threadIdx_x // 7 * 72 + 3477]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 57] * kernel_shared_1[threadIdx_x // 7 * 72 + 4629]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 57] * kernel_shared_1[threadIdx_x // 7 * 72 + 5781]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 57] * kernel_shared_1[threadIdx_x // 7 * 72 + 6933]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 57] * kernel_shared_1[threadIdx_x // 7 * 72 + 8085]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 60] * kernel_shared_1[threadIdx_x // 7 * 72 + 24]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 60] * kernel_shared_1[threadIdx_x // 7 * 72 + 1176]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 60] * kernel_shared_1[threadIdx_x // 7 * 72 + 2328]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 60] * kernel_shared_1[threadIdx_x // 7 * 72 + 3480]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 60] * kernel_shared_1[threadIdx_x // 7 * 72 + 4632]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 60] * kernel_shared_1[threadIdx_x // 7 * 72 + 5784]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 60] * kernel_shared_1[threadIdx_x // 7 * 72 + 6936]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 60] * kernel_shared_1[threadIdx_x // 7 * 72 + 8088]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 81] * kernel_shared_1[threadIdx_x // 7 * 72 + 27]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 81] * kernel_shared_1[threadIdx_x // 7 * 72 + 1179]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 81] * kernel_shared_1[threadIdx_x // 7 * 72 + 2331]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 81] * kernel_shared_1[threadIdx_x // 7 * 72 + 3483]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 81] * kernel_shared_1[threadIdx_x // 7 * 72 + 4635]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 81] * kernel_shared_1[threadIdx_x // 7 * 72 + 5787]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 81] * kernel_shared_1[threadIdx_x // 7 * 72 + 6939]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 81] * kernel_shared_1[threadIdx_x // 7 * 72 + 8091]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 84] * kernel_shared_1[threadIdx_x // 7 * 72 + 30]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 84] * kernel_shared_1[threadIdx_x // 7 * 72 + 1182]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 84] * kernel_shared_1[threadIdx_x // 7 * 72 + 2334]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 84] * kernel_shared_1[threadIdx_x // 7 * 72 + 3486]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 84] * kernel_shared_1[threadIdx_x // 7 * 72 + 4638]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 84] * kernel_shared_1[threadIdx_x // 7 * 72 + 5790]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 84] * kernel_shared_1[threadIdx_x // 7 * 72 + 6942]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 84] * kernel_shared_1[threadIdx_x // 7 * 72 + 8094]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 87] * kernel_shared_1[threadIdx_x // 7 * 72 + 33]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 87] * kernel_shared_1[threadIdx_x // 7 * 72 + 1185]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 87] * kernel_shared_1[threadIdx_x // 7 * 72 + 2337]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 87] * kernel_shared_1[threadIdx_x // 7 * 72 + 3489]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 87] * kernel_shared_1[threadIdx_x // 7 * 72 + 4641]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 87] * kernel_shared_1[threadIdx_x // 7 * 72 + 5793]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 87] * kernel_shared_1[threadIdx_x // 7 * 72 + 6945]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 87] * kernel_shared_1[threadIdx_x // 7 * 72 + 8097]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 1] * kernel_shared_1[threadIdx_x // 7 * 72 + 1]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 1] * kernel_shared_1[threadIdx_x // 7 * 72 + 1153]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 1] * kernel_shared_1[threadIdx_x // 7 * 72 + 2305]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 1] * kernel_shared_1[threadIdx_x // 7 * 72 + 3457]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 1] * kernel_shared_1[threadIdx_x // 7 * 72 + 4609]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 1] * kernel_shared_1[threadIdx_x // 7 * 72 + 5761]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 1] * kernel_shared_1[threadIdx_x // 7 * 72 + 6913]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 1] * kernel_shared_1[threadIdx_x // 7 * 72 + 8065]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 4] * kernel_shared_1[threadIdx_x // 7 * 72 + 4]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 4] * kernel_shared_1[threadIdx_x // 7 * 72 + 1156]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 4] * kernel_shared_1[threadIdx_x // 7 * 72 + 2308]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 4] * kernel_shared_1[threadIdx_x // 7 * 72 + 3460]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 4] * kernel_shared_1[threadIdx_x // 7 * 72 + 4612]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 4] * kernel_shared_1[threadIdx_x // 7 * 72 + 5764]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 4] * kernel_shared_1[threadIdx_x // 7 * 72 + 6916]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 4] * kernel_shared_1[threadIdx_x // 7 * 72 + 8068]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 7] * kernel_shared_1[threadIdx_x // 7 * 72 + 7]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 7] * kernel_shared_1[threadIdx_x // 7 * 72 + 1159]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 7] * kernel_shared_1[threadIdx_x // 7 * 72 + 2311]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 7] * kernel_shared_1[threadIdx_x // 7 * 72 + 3463]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 7] * kernel_shared_1[threadIdx_x // 7 * 72 + 4615]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 7] * kernel_shared_1[threadIdx_x // 7 * 72 + 5767]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 7] * kernel_shared_1[threadIdx_x // 7 * 72 + 6919]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 7] * kernel_shared_1[threadIdx_x // 7 * 72 + 8071]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 28] * kernel_shared_1[threadIdx_x // 7 * 72 + 10]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 28] * kernel_shared_1[threadIdx_x // 7 * 72 + 1162]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 28] * kernel_shared_1[threadIdx_x // 7 * 72 + 2314]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 28] * kernel_shared_1[threadIdx_x // 7 * 72 + 3466]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 28] * kernel_shared_1[threadIdx_x // 7 * 72 + 4618]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 28] * kernel_shared_1[threadIdx_x // 7 * 72 + 5770]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 28] * kernel_shared_1[threadIdx_x // 7 * 72 + 6922]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 28] * kernel_shared_1[threadIdx_x // 7 * 72 + 8074]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 31] * kernel_shared_1[threadIdx_x // 7 * 72 + 13]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 31] * kernel_shared_1[threadIdx_x // 7 * 72 + 1165]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 31] * kernel_shared_1[threadIdx_x // 7 * 72 + 2317]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 31] * kernel_shared_1[threadIdx_x // 7 * 72 + 3469]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 31] * kernel_shared_1[threadIdx_x // 7 * 72 + 4621]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 31] * kernel_shared_1[threadIdx_x // 7 * 72 + 5773]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 31] * kernel_shared_1[threadIdx_x // 7 * 72 + 6925]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 31] * kernel_shared_1[threadIdx_x // 7 * 72 + 8077]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 34] * kernel_shared_1[threadIdx_x // 7 * 72 + 16]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 34] * kernel_shared_1[threadIdx_x // 7 * 72 + 1168]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 34] * kernel_shared_1[threadIdx_x // 7 * 72 + 2320]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 34] * kernel_shared_1[threadIdx_x // 7 * 72 + 3472]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 34] * kernel_shared_1[threadIdx_x // 7 * 72 + 4624]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 34] * kernel_shared_1[threadIdx_x // 7 * 72 + 5776]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 34] * kernel_shared_1[threadIdx_x // 7 * 72 + 6928]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 34] * kernel_shared_1[threadIdx_x // 7 * 72 + 8080]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 55] * kernel_shared_1[threadIdx_x // 7 * 72 + 19]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 55] * kernel_shared_1[threadIdx_x // 7 * 72 + 1171]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 55] * kernel_shared_1[threadIdx_x // 7 * 72 + 2323]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 55] * kernel_shared_1[threadIdx_x // 7 * 72 + 3475]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 55] * kernel_shared_1[threadIdx_x // 7 * 72 + 4627]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 55] * kernel_shared_1[threadIdx_x // 7 * 72 + 5779]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 55] * kernel_shared_1[threadIdx_x // 7 * 72 + 6931]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 55] * kernel_shared_1[threadIdx_x // 7 * 72 + 8083]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 58] * kernel_shared_1[threadIdx_x // 7 * 72 + 22]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 58] * kernel_shared_1[threadIdx_x // 7 * 72 + 1174]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 58] * kernel_shared_1[threadIdx_x // 7 * 72 + 2326]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 58] * kernel_shared_1[threadIdx_x // 7 * 72 + 3478]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 58] * kernel_shared_1[threadIdx_x // 7 * 72 + 4630]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 58] * kernel_shared_1[threadIdx_x // 7 * 72 + 5782]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 58] * kernel_shared_1[threadIdx_x // 7 * 72 + 6934]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 58] * kernel_shared_1[threadIdx_x // 7 * 72 + 8086]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 61] * kernel_shared_1[threadIdx_x // 7 * 72 + 25]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 61] * kernel_shared_1[threadIdx_x // 7 * 72 + 1177]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 61] * kernel_shared_1[threadIdx_x // 7 * 72 + 2329]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 61] * kernel_shared_1[threadIdx_x // 7 * 72 + 3481]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 61] * kernel_shared_1[threadIdx_x // 7 * 72 + 4633]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 61] * kernel_shared_1[threadIdx_x // 7 * 72 + 5785]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 61] * kernel_shared_1[threadIdx_x // 7 * 72 + 6937]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 61] * kernel_shared_1[threadIdx_x // 7 * 72 + 8089]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 82] * kernel_shared_1[threadIdx_x // 7 * 72 + 28]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 82] * kernel_shared_1[threadIdx_x // 7 * 72 + 1180]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 82] * kernel_shared_1[threadIdx_x // 7 * 72 + 2332]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 82] * kernel_shared_1[threadIdx_x // 7 * 72 + 3484]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 82] * kernel_shared_1[threadIdx_x // 7 * 72 + 4636]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 82] * kernel_shared_1[threadIdx_x // 7 * 72 + 5788]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 82] * kernel_shared_1[threadIdx_x // 7 * 72 + 6940]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 82] * kernel_shared_1[threadIdx_x // 7 * 72 + 8092]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 85] * kernel_shared_1[threadIdx_x // 7 * 72 + 31]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 85] * kernel_shared_1[threadIdx_x // 7 * 72 + 1183]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 85] * kernel_shared_1[threadIdx_x // 7 * 72 + 2335]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 85] * kernel_shared_1[threadIdx_x // 7 * 72 + 3487]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 85] * kernel_shared_1[threadIdx_x // 7 * 72 + 4639]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 85] * kernel_shared_1[threadIdx_x // 7 * 72 + 5791]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 85] * kernel_shared_1[threadIdx_x // 7 * 72 + 6943]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 85] * kernel_shared_1[threadIdx_x // 7 * 72 + 8095]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 88] * kernel_shared_1[threadIdx_x // 7 * 72 + 34]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 88] * kernel_shared_1[threadIdx_x // 7 * 72 + 1186]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 88] * kernel_shared_1[threadIdx_x // 7 * 72 + 2338]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 88] * kernel_shared_1[threadIdx_x // 7 * 72 + 3490]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 88] * kernel_shared_1[threadIdx_x // 7 * 72 + 4642]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 88] * kernel_shared_1[threadIdx_x // 7 * 72 + 5794]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 88] * kernel_shared_1[threadIdx_x // 7 * 72 + 6946]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 88] * kernel_shared_1[threadIdx_x // 7 * 72 + 8098]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 2] * kernel_shared_1[threadIdx_x // 7 * 72 + 2]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 2] * kernel_shared_1[threadIdx_x // 7 * 72 + 1154]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 2] * kernel_shared_1[threadIdx_x // 7 * 72 + 2306]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 2] * kernel_shared_1[threadIdx_x // 7 * 72 + 3458]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 2] * kernel_shared_1[threadIdx_x // 7 * 72 + 4610]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 2] * kernel_shared_1[threadIdx_x // 7 * 72 + 5762]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 2] * kernel_shared_1[threadIdx_x // 7 * 72 + 6914]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 2] * kernel_shared_1[threadIdx_x // 7 * 72 + 8066]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 5] * kernel_shared_1[threadIdx_x // 7 * 72 + 5]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 5] * kernel_shared_1[threadIdx_x // 7 * 72 + 1157]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 5] * kernel_shared_1[threadIdx_x // 7 * 72 + 2309]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 5] * kernel_shared_1[threadIdx_x // 7 * 72 + 3461]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 5] * kernel_shared_1[threadIdx_x // 7 * 72 + 4613]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 5] * kernel_shared_1[threadIdx_x // 7 * 72 + 5765]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 5] * kernel_shared_1[threadIdx_x // 7 * 72 + 6917]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 5] * kernel_shared_1[threadIdx_x // 7 * 72 + 8069]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 8] * kernel_shared_1[threadIdx_x // 7 * 72 + 8]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 8] * kernel_shared_1[threadIdx_x // 7 * 72 + 1160]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 8] * kernel_shared_1[threadIdx_x // 7 * 72 + 2312]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 8] * kernel_shared_1[threadIdx_x // 7 * 72 + 3464]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 8] * kernel_shared_1[threadIdx_x // 7 * 72 + 4616]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 8] * kernel_shared_1[threadIdx_x // 7 * 72 + 5768]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 8] * kernel_shared_1[threadIdx_x // 7 * 72 + 6920]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 8] * kernel_shared_1[threadIdx_x // 7 * 72 + 8072]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 29] * kernel_shared_1[threadIdx_x // 7 * 72 + 11]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 29] * kernel_shared_1[threadIdx_x // 7 * 72 + 1163]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 29] * kernel_shared_1[threadIdx_x // 7 * 72 + 2315]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 29] * kernel_shared_1[threadIdx_x // 7 * 72 + 3467]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 29] * kernel_shared_1[threadIdx_x // 7 * 72 + 4619]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 29] * kernel_shared_1[threadIdx_x // 7 * 72 + 5771]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 29] * kernel_shared_1[threadIdx_x // 7 * 72 + 6923]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 29] * kernel_shared_1[threadIdx_x // 7 * 72 + 8075]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 32] * kernel_shared_1[threadIdx_x // 7 * 72 + 14]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 32] * kernel_shared_1[threadIdx_x // 7 * 72 + 1166]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 32] * kernel_shared_1[threadIdx_x // 7 * 72 + 2318]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 32] * kernel_shared_1[threadIdx_x // 7 * 72 + 3470]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 32] * kernel_shared_1[threadIdx_x // 7 * 72 + 4622]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 32] * kernel_shared_1[threadIdx_x // 7 * 72 + 5774]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 32] * kernel_shared_1[threadIdx_x // 7 * 72 + 6926]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 32] * kernel_shared_1[threadIdx_x // 7 * 72 + 8078]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 35] * kernel_shared_1[threadIdx_x // 7 * 72 + 17]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 35] * kernel_shared_1[threadIdx_x // 7 * 72 + 1169]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 35] * kernel_shared_1[threadIdx_x // 7 * 72 + 2321]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 35] * kernel_shared_1[threadIdx_x // 7 * 72 + 3473]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 35] * kernel_shared_1[threadIdx_x // 7 * 72 + 4625]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 35] * kernel_shared_1[threadIdx_x // 7 * 72 + 5777]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 35] * kernel_shared_1[threadIdx_x // 7 * 72 + 6929]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 35] * kernel_shared_1[threadIdx_x // 7 * 72 + 8081]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 56] * kernel_shared_1[threadIdx_x // 7 * 72 + 20]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 56] * kernel_shared_1[threadIdx_x // 7 * 72 + 1172]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 56] * kernel_shared_1[threadIdx_x // 7 * 72 + 2324]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 56] * kernel_shared_1[threadIdx_x // 7 * 72 + 3476]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 56] * kernel_shared_1[threadIdx_x // 7 * 72 + 4628]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 56] * kernel_shared_1[threadIdx_x // 7 * 72 + 5780]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 56] * kernel_shared_1[threadIdx_x // 7 * 72 + 6932]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 56] * kernel_shared_1[threadIdx_x // 7 * 72 + 8084]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 59] * kernel_shared_1[threadIdx_x // 7 * 72 + 23]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 59] * kernel_shared_1[threadIdx_x // 7 * 72 + 1175]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 59] * kernel_shared_1[threadIdx_x // 7 * 72 + 2327]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 59] * kernel_shared_1[threadIdx_x // 7 * 72 + 3479]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 59] * kernel_shared_1[threadIdx_x // 7 * 72 + 4631]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 59] * kernel_shared_1[threadIdx_x // 7 * 72 + 5783]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 59] * kernel_shared_1[threadIdx_x // 7 * 72 + 6935]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 59] * kernel_shared_1[threadIdx_x // 7 * 72 + 8087]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 62] * kernel_shared_1[threadIdx_x // 7 * 72 + 26]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 62] * kernel_shared_1[threadIdx_x // 7 * 72 + 1178]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 62] * kernel_shared_1[threadIdx_x // 7 * 72 + 2330]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 62] * kernel_shared_1[threadIdx_x // 7 * 72 + 3482]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 62] * kernel_shared_1[threadIdx_x // 7 * 72 + 4634]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 62] * kernel_shared_1[threadIdx_x // 7 * 72 + 5786]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 62] * kernel_shared_1[threadIdx_x // 7 * 72 + 6938]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 62] * kernel_shared_1[threadIdx_x // 7 * 72 + 8090]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 83] * kernel_shared_1[threadIdx_x // 7 * 72 + 29]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 83] * kernel_shared_1[threadIdx_x // 7 * 72 + 1181]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 83] * kernel_shared_1[threadIdx_x // 7 * 72 + 2333]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 83] * kernel_shared_1[threadIdx_x // 7 * 72 + 3485]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 83] * kernel_shared_1[threadIdx_x // 7 * 72 + 4637]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 83] * kernel_shared_1[threadIdx_x // 7 * 72 + 5789]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 83] * kernel_shared_1[threadIdx_x // 7 * 72 + 6941]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 83] * kernel_shared_1[threadIdx_x // 7 * 72 + 8093]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 86] * kernel_shared_1[threadIdx_x // 7 * 72 + 32]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 86] * kernel_shared_1[threadIdx_x // 7 * 72 + 1184]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 86] * kernel_shared_1[threadIdx_x // 7 * 72 + 2336]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 86] * kernel_shared_1[threadIdx_x // 7 * 72 + 3488]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 86] * kernel_shared_1[threadIdx_x // 7 * 72 + 4640]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 86] * kernel_shared_1[threadIdx_x // 7 * 72 + 5792]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 86] * kernel_shared_1[threadIdx_x // 7 * 72 + 6944]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 86] * kernel_shared_1[threadIdx_x // 7 * 72 + 8096]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 89] * kernel_shared_1[threadIdx_x // 7 * 72 + 35]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 89] * kernel_shared_1[threadIdx_x // 7 * 72 + 1187]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 89] * kernel_shared_1[threadIdx_x // 7 * 72 + 2339]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 89] * kernel_shared_1[threadIdx_x // 7 * 72 + 3491]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 89] * kernel_shared_1[threadIdx_x // 7 * 72 + 4643]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 89] * kernel_shared_1[threadIdx_x // 7 * 72 + 5795]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 89] * kernel_shared_1[threadIdx_x // 7 * 72 + 6947]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 89] * kernel_shared_1[threadIdx_x // 7 * 72 + 8099]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 108] * kernel_shared_1[threadIdx_x // 7 * 72 + 36]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 108] * kernel_shared_1[threadIdx_x // 7 * 72 + 1188]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 108] * kernel_shared_1[threadIdx_x // 7 * 72 + 2340]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 108] * kernel_shared_1[threadIdx_x // 7 * 72 + 3492]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 108] * kernel_shared_1[threadIdx_x // 7 * 72 + 4644]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 108] * kernel_shared_1[threadIdx_x // 7 * 72 + 5796]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 108] * kernel_shared_1[threadIdx_x // 7 * 72 + 6948]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 108] * kernel_shared_1[threadIdx_x // 7 * 72 + 8100]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 111] * kernel_shared_1[threadIdx_x // 7 * 72 + 39]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 111] * kernel_shared_1[threadIdx_x // 7 * 72 + 1191]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 111] * kernel_shared_1[threadIdx_x // 7 * 72 + 2343]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 111] * kernel_shared_1[threadIdx_x // 7 * 72 + 3495]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 111] * kernel_shared_1[threadIdx_x // 7 * 72 + 4647]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 111] * kernel_shared_1[threadIdx_x // 7 * 72 + 5799]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 111] * kernel_shared_1[threadIdx_x // 7 * 72 + 6951]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 111] * kernel_shared_1[threadIdx_x // 7 * 72 + 8103]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 114] * kernel_shared_1[threadIdx_x // 7 * 72 + 42]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 114] * kernel_shared_1[threadIdx_x // 7 * 72 + 1194]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 114] * kernel_shared_1[threadIdx_x // 7 * 72 + 2346]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 114] * kernel_shared_1[threadIdx_x // 7 * 72 + 3498]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 114] * kernel_shared_1[threadIdx_x // 7 * 72 + 4650]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 114] * kernel_shared_1[threadIdx_x // 7 * 72 + 5802]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 114] * kernel_shared_1[threadIdx_x // 7 * 72 + 6954]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 114] * kernel_shared_1[threadIdx_x // 7 * 72 + 8106]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 135] * kernel_shared_1[threadIdx_x // 7 * 72 + 45]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 135] * kernel_shared_1[threadIdx_x // 7 * 72 + 1197]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 135] * kernel_shared_1[threadIdx_x // 7 * 72 + 2349]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 135] * kernel_shared_1[threadIdx_x // 7 * 72 + 3501]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 135] * kernel_shared_1[threadIdx_x // 7 * 72 + 4653]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 135] * kernel_shared_1[threadIdx_x // 7 * 72 + 5805]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 135] * kernel_shared_1[threadIdx_x // 7 * 72 + 6957]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 135] * kernel_shared_1[threadIdx_x // 7 * 72 + 8109]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 138] * kernel_shared_1[threadIdx_x // 7 * 72 + 48]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 138] * kernel_shared_1[threadIdx_x // 7 * 72 + 1200]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 138] * kernel_shared_1[threadIdx_x // 7 * 72 + 2352]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 138] * kernel_shared_1[threadIdx_x // 7 * 72 + 3504]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 138] * kernel_shared_1[threadIdx_x // 7 * 72 + 4656]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 138] * kernel_shared_1[threadIdx_x // 7 * 72 + 5808]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 138] * kernel_shared_1[threadIdx_x // 7 * 72 + 6960]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 138] * kernel_shared_1[threadIdx_x // 7 * 72 + 8112]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 141] * kernel_shared_1[threadIdx_x // 7 * 72 + 51]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 141] * kernel_shared_1[threadIdx_x // 7 * 72 + 1203]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 141] * kernel_shared_1[threadIdx_x // 7 * 72 + 2355]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 141] * kernel_shared_1[threadIdx_x // 7 * 72 + 3507]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 141] * kernel_shared_1[threadIdx_x // 7 * 72 + 4659]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 141] * kernel_shared_1[threadIdx_x // 7 * 72 + 5811]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 141] * kernel_shared_1[threadIdx_x // 7 * 72 + 6963]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 141] * kernel_shared_1[threadIdx_x // 7 * 72 + 8115]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 162] * kernel_shared_1[threadIdx_x // 7 * 72 + 54]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 162] * kernel_shared_1[threadIdx_x // 7 * 72 + 1206]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 162] * kernel_shared_1[threadIdx_x // 7 * 72 + 2358]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 162] * kernel_shared_1[threadIdx_x // 7 * 72 + 3510]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 162] * kernel_shared_1[threadIdx_x // 7 * 72 + 4662]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 162] * kernel_shared_1[threadIdx_x // 7 * 72 + 5814]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 162] * kernel_shared_1[threadIdx_x // 7 * 72 + 6966]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 162] * kernel_shared_1[threadIdx_x // 7 * 72 + 8118]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 165] * kernel_shared_1[threadIdx_x // 7 * 72 + 57]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 165] * kernel_shared_1[threadIdx_x // 7 * 72 + 1209]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 165] * kernel_shared_1[threadIdx_x // 7 * 72 + 2361]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 165] * kernel_shared_1[threadIdx_x // 7 * 72 + 3513]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 165] * kernel_shared_1[threadIdx_x // 7 * 72 + 4665]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 165] * kernel_shared_1[threadIdx_x // 7 * 72 + 5817]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 165] * kernel_shared_1[threadIdx_x // 7 * 72 + 6969]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 165] * kernel_shared_1[threadIdx_x // 7 * 72 + 8121]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 168] * kernel_shared_1[threadIdx_x // 7 * 72 + 60]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 168] * kernel_shared_1[threadIdx_x // 7 * 72 + 1212]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 168] * kernel_shared_1[threadIdx_x // 7 * 72 + 2364]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 168] * kernel_shared_1[threadIdx_x // 7 * 72 + 3516]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 168] * kernel_shared_1[threadIdx_x // 7 * 72 + 4668]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 168] * kernel_shared_1[threadIdx_x // 7 * 72 + 5820]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 168] * kernel_shared_1[threadIdx_x // 7 * 72 + 6972]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 168] * kernel_shared_1[threadIdx_x // 7 * 72 + 8124]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 189] * kernel_shared_1[threadIdx_x // 7 * 72 + 63]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 189] * kernel_shared_1[threadIdx_x // 7 * 72 + 1215]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 189] * kernel_shared_1[threadIdx_x // 7 * 72 + 2367]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 189] * kernel_shared_1[threadIdx_x // 7 * 72 + 3519]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 189] * kernel_shared_1[threadIdx_x // 7 * 72 + 4671]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 189] * kernel_shared_1[threadIdx_x // 7 * 72 + 5823]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 189] * kernel_shared_1[threadIdx_x // 7 * 72 + 6975]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 189] * kernel_shared_1[threadIdx_x // 7 * 72 + 8127]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 192] * kernel_shared_1[threadIdx_x // 7 * 72 + 66]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 192] * kernel_shared_1[threadIdx_x // 7 * 72 + 1218]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 192] * kernel_shared_1[threadIdx_x // 7 * 72 + 2370]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 192] * kernel_shared_1[threadIdx_x // 7 * 72 + 3522]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 192] * kernel_shared_1[threadIdx_x // 7 * 72 + 4674]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 192] * kernel_shared_1[threadIdx_x // 7 * 72 + 5826]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 192] * kernel_shared_1[threadIdx_x // 7 * 72 + 6978]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 192] * kernel_shared_1[threadIdx_x // 7 * 72 + 8130]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 195] * kernel_shared_1[threadIdx_x // 7 * 72 + 69]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 195] * kernel_shared_1[threadIdx_x // 7 * 72 + 1221]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 195] * kernel_shared_1[threadIdx_x // 7 * 72 + 2373]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 195] * kernel_shared_1[threadIdx_x // 7 * 72 + 3525]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 195] * kernel_shared_1[threadIdx_x // 7 * 72 + 4677]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 195] * kernel_shared_1[threadIdx_x // 7 * 72 + 5829]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 195] * kernel_shared_1[threadIdx_x // 7 * 72 + 6981]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 195] * kernel_shared_1[threadIdx_x // 7 * 72 + 8133]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 109] * kernel_shared_1[threadIdx_x // 7 * 72 + 37]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 109] * kernel_shared_1[threadIdx_x // 7 * 72 + 1189]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 109] * kernel_shared_1[threadIdx_x // 7 * 72 + 2341]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 109] * kernel_shared_1[threadIdx_x // 7 * 72 + 3493]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 109] * kernel_shared_1[threadIdx_x // 7 * 72 + 4645]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 109] * kernel_shared_1[threadIdx_x // 7 * 72 + 5797]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 109] * kernel_shared_1[threadIdx_x // 7 * 72 + 6949]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 109] * kernel_shared_1[threadIdx_x // 7 * 72 + 8101]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 112] * kernel_shared_1[threadIdx_x // 7 * 72 + 40]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 112] * kernel_shared_1[threadIdx_x // 7 * 72 + 1192]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 112] * kernel_shared_1[threadIdx_x // 7 * 72 + 2344]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 112] * kernel_shared_1[threadIdx_x // 7 * 72 + 3496]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 112] * kernel_shared_1[threadIdx_x // 7 * 72 + 4648]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 112] * kernel_shared_1[threadIdx_x // 7 * 72 + 5800]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 112] * kernel_shared_1[threadIdx_x // 7 * 72 + 6952]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 112] * kernel_shared_1[threadIdx_x // 7 * 72 + 8104]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 115] * kernel_shared_1[threadIdx_x // 7 * 72 + 43]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 115] * kernel_shared_1[threadIdx_x // 7 * 72 + 1195]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 115] * kernel_shared_1[threadIdx_x // 7 * 72 + 2347]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 115] * kernel_shared_1[threadIdx_x // 7 * 72 + 3499]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 115] * kernel_shared_1[threadIdx_x // 7 * 72 + 4651]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 115] * kernel_shared_1[threadIdx_x // 7 * 72 + 5803]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 115] * kernel_shared_1[threadIdx_x // 7 * 72 + 6955]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 115] * kernel_shared_1[threadIdx_x // 7 * 72 + 8107]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 136] * kernel_shared_1[threadIdx_x // 7 * 72 + 46]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 136] * kernel_shared_1[threadIdx_x // 7 * 72 + 1198]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 136] * kernel_shared_1[threadIdx_x // 7 * 72 + 2350]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 136] * kernel_shared_1[threadIdx_x // 7 * 72 + 3502]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 136] * kernel_shared_1[threadIdx_x // 7 * 72 + 4654]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 136] * kernel_shared_1[threadIdx_x // 7 * 72 + 5806]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 136] * kernel_shared_1[threadIdx_x // 7 * 72 + 6958]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 136] * kernel_shared_1[threadIdx_x // 7 * 72 + 8110]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 139] * kernel_shared_1[threadIdx_x // 7 * 72 + 49]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 139] * kernel_shared_1[threadIdx_x // 7 * 72 + 1201]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 139] * kernel_shared_1[threadIdx_x // 7 * 72 + 2353]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 139] * kernel_shared_1[threadIdx_x // 7 * 72 + 3505]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 139] * kernel_shared_1[threadIdx_x // 7 * 72 + 4657]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 139] * kernel_shared_1[threadIdx_x // 7 * 72 + 5809]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 139] * kernel_shared_1[threadIdx_x // 7 * 72 + 6961]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 139] * kernel_shared_1[threadIdx_x // 7 * 72 + 8113]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 142] * kernel_shared_1[threadIdx_x // 7 * 72 + 52]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 142] * kernel_shared_1[threadIdx_x // 7 * 72 + 1204]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 142] * kernel_shared_1[threadIdx_x // 7 * 72 + 2356]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 142] * kernel_shared_1[threadIdx_x // 7 * 72 + 3508]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 142] * kernel_shared_1[threadIdx_x // 7 * 72 + 4660]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 142] * kernel_shared_1[threadIdx_x // 7 * 72 + 5812]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 142] * kernel_shared_1[threadIdx_x // 7 * 72 + 6964]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 142] * kernel_shared_1[threadIdx_x // 7 * 72 + 8116]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 163] * kernel_shared_1[threadIdx_x // 7 * 72 + 55]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 163] * kernel_shared_1[threadIdx_x // 7 * 72 + 1207]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 163] * kernel_shared_1[threadIdx_x // 7 * 72 + 2359]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 163] * kernel_shared_1[threadIdx_x // 7 * 72 + 3511]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 163] * kernel_shared_1[threadIdx_x // 7 * 72 + 4663]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 163] * kernel_shared_1[threadIdx_x // 7 * 72 + 5815]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 163] * kernel_shared_1[threadIdx_x // 7 * 72 + 6967]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 163] * kernel_shared_1[threadIdx_x // 7 * 72 + 8119]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 166] * kernel_shared_1[threadIdx_x // 7 * 72 + 58]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 166] * kernel_shared_1[threadIdx_x // 7 * 72 + 1210]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 166] * kernel_shared_1[threadIdx_x // 7 * 72 + 2362]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 166] * kernel_shared_1[threadIdx_x // 7 * 72 + 3514]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 166] * kernel_shared_1[threadIdx_x // 7 * 72 + 4666]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 166] * kernel_shared_1[threadIdx_x // 7 * 72 + 5818]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 166] * kernel_shared_1[threadIdx_x // 7 * 72 + 6970]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 166] * kernel_shared_1[threadIdx_x // 7 * 72 + 8122]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 169] * kernel_shared_1[threadIdx_x // 7 * 72 + 61]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 169] * kernel_shared_1[threadIdx_x // 7 * 72 + 1213]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 169] * kernel_shared_1[threadIdx_x // 7 * 72 + 2365]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 169] * kernel_shared_1[threadIdx_x // 7 * 72 + 3517]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 169] * kernel_shared_1[threadIdx_x // 7 * 72 + 4669]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 169] * kernel_shared_1[threadIdx_x // 7 * 72 + 5821]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 169] * kernel_shared_1[threadIdx_x // 7 * 72 + 6973]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 169] * kernel_shared_1[threadIdx_x // 7 * 72 + 8125]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 190] * kernel_shared_1[threadIdx_x // 7 * 72 + 64]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 190] * kernel_shared_1[threadIdx_x // 7 * 72 + 1216]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 190] * kernel_shared_1[threadIdx_x // 7 * 72 + 2368]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 190] * kernel_shared_1[threadIdx_x // 7 * 72 + 3520]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 190] * kernel_shared_1[threadIdx_x // 7 * 72 + 4672]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 190] * kernel_shared_1[threadIdx_x // 7 * 72 + 5824]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 190] * kernel_shared_1[threadIdx_x // 7 * 72 + 6976]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 190] * kernel_shared_1[threadIdx_x // 7 * 72 + 8128]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 193] * kernel_shared_1[threadIdx_x // 7 * 72 + 67]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 193] * kernel_shared_1[threadIdx_x // 7 * 72 + 1219]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 193] * kernel_shared_1[threadIdx_x // 7 * 72 + 2371]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 193] * kernel_shared_1[threadIdx_x // 7 * 72 + 3523]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 193] * kernel_shared_1[threadIdx_x // 7 * 72 + 4675]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 193] * kernel_shared_1[threadIdx_x // 7 * 72 + 5827]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 193] * kernel_shared_1[threadIdx_x // 7 * 72 + 6979]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 193] * kernel_shared_1[threadIdx_x // 7 * 72 + 8131]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 196] * kernel_shared_1[threadIdx_x // 7 * 72 + 70]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 196] * kernel_shared_1[threadIdx_x // 7 * 72 + 1222]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 196] * kernel_shared_1[threadIdx_x // 7 * 72 + 2374]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 196] * kernel_shared_1[threadIdx_x // 7 * 72 + 3526]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 196] * kernel_shared_1[threadIdx_x // 7 * 72 + 4678]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 196] * kernel_shared_1[threadIdx_x // 7 * 72 + 5830]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 196] * kernel_shared_1[threadIdx_x // 7 * 72 + 6982]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 196] * kernel_shared_1[threadIdx_x // 7 * 72 + 8134]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 110] * kernel_shared_1[threadIdx_x // 7 * 72 + 38]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 110] * kernel_shared_1[threadIdx_x // 7 * 72 + 1190]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 110] * kernel_shared_1[threadIdx_x // 7 * 72 + 2342]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 110] * kernel_shared_1[threadIdx_x // 7 * 72 + 3494]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 110] * kernel_shared_1[threadIdx_x // 7 * 72 + 4646]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 110] * kernel_shared_1[threadIdx_x // 7 * 72 + 5798]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 110] * kernel_shared_1[threadIdx_x // 7 * 72 + 6950]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 110] * kernel_shared_1[threadIdx_x // 7 * 72 + 8102]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 113] * kernel_shared_1[threadIdx_x // 7 * 72 + 41]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 113] * kernel_shared_1[threadIdx_x // 7 * 72 + 1193]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 113] * kernel_shared_1[threadIdx_x // 7 * 72 + 2345]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 113] * kernel_shared_1[threadIdx_x // 7 * 72 + 3497]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 113] * kernel_shared_1[threadIdx_x // 7 * 72 + 4649]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 113] * kernel_shared_1[threadIdx_x // 7 * 72 + 5801]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 113] * kernel_shared_1[threadIdx_x // 7 * 72 + 6953]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 113] * kernel_shared_1[threadIdx_x // 7 * 72 + 8105]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 116] * kernel_shared_1[threadIdx_x // 7 * 72 + 44]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 116] * kernel_shared_1[threadIdx_x // 7 * 72 + 1196]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 116] * kernel_shared_1[threadIdx_x // 7 * 72 + 2348]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 116] * kernel_shared_1[threadIdx_x // 7 * 72 + 3500]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 116] * kernel_shared_1[threadIdx_x // 7 * 72 + 4652]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 116] * kernel_shared_1[threadIdx_x // 7 * 72 + 5804]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 116] * kernel_shared_1[threadIdx_x // 7 * 72 + 6956]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 116] * kernel_shared_1[threadIdx_x // 7 * 72 + 8108]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 137] * kernel_shared_1[threadIdx_x // 7 * 72 + 47]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 137] * kernel_shared_1[threadIdx_x // 7 * 72 + 1199]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 137] * kernel_shared_1[threadIdx_x // 7 * 72 + 2351]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 137] * kernel_shared_1[threadIdx_x // 7 * 72 + 3503]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 137] * kernel_shared_1[threadIdx_x // 7 * 72 + 4655]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 137] * kernel_shared_1[threadIdx_x // 7 * 72 + 5807]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 137] * kernel_shared_1[threadIdx_x // 7 * 72 + 6959]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 137] * kernel_shared_1[threadIdx_x // 7 * 72 + 8111]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 140] * kernel_shared_1[threadIdx_x // 7 * 72 + 50]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 140] * kernel_shared_1[threadIdx_x // 7 * 72 + 1202]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 140] * kernel_shared_1[threadIdx_x // 7 * 72 + 2354]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 140] * kernel_shared_1[threadIdx_x // 7 * 72 + 3506]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 140] * kernel_shared_1[threadIdx_x // 7 * 72 + 4658]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 140] * kernel_shared_1[threadIdx_x // 7 * 72 + 5810]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 140] * kernel_shared_1[threadIdx_x // 7 * 72 + 6962]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 140] * kernel_shared_1[threadIdx_x // 7 * 72 + 8114]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 143] * kernel_shared_1[threadIdx_x // 7 * 72 + 53]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 143] * kernel_shared_1[threadIdx_x // 7 * 72 + 1205]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 143] * kernel_shared_1[threadIdx_x // 7 * 72 + 2357]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 143] * kernel_shared_1[threadIdx_x // 7 * 72 + 3509]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 143] * kernel_shared_1[threadIdx_x // 7 * 72 + 4661]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 143] * kernel_shared_1[threadIdx_x // 7 * 72 + 5813]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 143] * kernel_shared_1[threadIdx_x // 7 * 72 + 6965]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 143] * kernel_shared_1[threadIdx_x // 7 * 72 + 8117]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 164] * kernel_shared_1[threadIdx_x // 7 * 72 + 56]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 164] * kernel_shared_1[threadIdx_x // 7 * 72 + 1208]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 164] * kernel_shared_1[threadIdx_x // 7 * 72 + 2360]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 164] * kernel_shared_1[threadIdx_x // 7 * 72 + 3512]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 164] * kernel_shared_1[threadIdx_x // 7 * 72 + 4664]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 164] * kernel_shared_1[threadIdx_x // 7 * 72 + 5816]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 164] * kernel_shared_1[threadIdx_x // 7 * 72 + 6968]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 164] * kernel_shared_1[threadIdx_x // 7 * 72 + 8120]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 167] * kernel_shared_1[threadIdx_x // 7 * 72 + 59]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 167] * kernel_shared_1[threadIdx_x // 7 * 72 + 1211]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 167] * kernel_shared_1[threadIdx_x // 7 * 72 + 2363]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 167] * kernel_shared_1[threadIdx_x // 7 * 72 + 3515]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 167] * kernel_shared_1[threadIdx_x // 7 * 72 + 4667]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 167] * kernel_shared_1[threadIdx_x // 7 * 72 + 5819]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 167] * kernel_shared_1[threadIdx_x // 7 * 72 + 6971]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 167] * kernel_shared_1[threadIdx_x // 7 * 72 + 8123]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 170] * kernel_shared_1[threadIdx_x // 7 * 72 + 62]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 170] * kernel_shared_1[threadIdx_x // 7 * 72 + 1214]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 170] * kernel_shared_1[threadIdx_x // 7 * 72 + 2366]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 170] * kernel_shared_1[threadIdx_x // 7 * 72 + 3518]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 170] * kernel_shared_1[threadIdx_x // 7 * 72 + 4670]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 170] * kernel_shared_1[threadIdx_x // 7 * 72 + 5822]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 170] * kernel_shared_1[threadIdx_x // 7 * 72 + 6974]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 170] * kernel_shared_1[threadIdx_x // 7 * 72 + 8126]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 191] * kernel_shared_1[threadIdx_x // 7 * 72 + 65]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 191] * kernel_shared_1[threadIdx_x // 7 * 72 + 1217]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 191] * kernel_shared_1[threadIdx_x // 7 * 72 + 2369]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 191] * kernel_shared_1[threadIdx_x // 7 * 72 + 3521]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 191] * kernel_shared_1[threadIdx_x // 7 * 72 + 4673]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 191] * kernel_shared_1[threadIdx_x // 7 * 72 + 5825]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 191] * kernel_shared_1[threadIdx_x // 7 * 72 + 6977]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 191] * kernel_shared_1[threadIdx_x // 7 * 72 + 8129]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 194] * kernel_shared_1[threadIdx_x // 7 * 72 + 68]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 194] * kernel_shared_1[threadIdx_x // 7 * 72 + 1220]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 194] * kernel_shared_1[threadIdx_x // 7 * 72 + 2372]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 194] * kernel_shared_1[threadIdx_x // 7 * 72 + 3524]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 194] * kernel_shared_1[threadIdx_x // 7 * 72 + 4676]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 194] * kernel_shared_1[threadIdx_x // 7 * 72 + 5828]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 194] * kernel_shared_1[threadIdx_x // 7 * 72 + 6980]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 194] * kernel_shared_1[threadIdx_x // 7 * 72 + 8132]
-                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 197] * kernel_shared_1[threadIdx_x // 7 * 72 + 71]
-                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 197] * kernel_shared_1[threadIdx_x // 7 * 72 + 1223]
-                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 197] * kernel_shared_1[threadIdx_x // 7 * 72 + 2375]
-                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 197] * kernel_shared_1[threadIdx_x // 7 * 72 + 3527]
-                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 197] * kernel_shared_1[threadIdx_x // 7 * 72 + 4679]
-                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 197] * kernel_shared_1[threadIdx_x // 7 * 72 + 5831]
-                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 197] * kernel_shared_1[threadIdx_x // 7 * 72 + 6983]
-                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 * 3 + 197] * kernel_shared_1[threadIdx_x // 7 * 72 + 8135]
-            compute_1 = T.Buffer((25088,), data=compute.data)
-            bias_1 = T.Buffer((512,), data=bias.data)
-            compute_1[blockIdx_x // 7 * 6272 + threadIdx_x * 7 + blockIdx_x % 7] = T.max(conv2d_nchw_1[0] + bias_1[blockIdx_x // 7 * 128 + threadIdx_x // 7], T.float32(0))
-            compute_1[blockIdx_x // 7 * 6272 + threadIdx_x * 7 + blockIdx_x % 7 + 784] = T.max(conv2d_nchw_1[1] + bias_1[blockIdx_x // 7 * 128 + threadIdx_x // 7 + 16], T.float32(0))
-            compute_1[blockIdx_x // 7 * 6272 + threadIdx_x * 7 + blockIdx_x % 7 + 1568] = T.max(conv2d_nchw_1[2] + bias_1[blockIdx_x // 7 * 128 + threadIdx_x // 7 + 32], T.float32(0))
-            compute_1[blockIdx_x // 7 * 6272 + threadIdx_x * 7 + blockIdx_x % 7 + 2352] = T.max(conv2d_nchw_1[3] + bias_1[blockIdx_x // 7 * 128 + threadIdx_x // 7 + 48], T.float32(0))
-            compute_1[blockIdx_x // 7 * 6272 + threadIdx_x * 7 + blockIdx_x % 7 + 3136] = T.max(conv2d_nchw_1[4] + bias_1[blockIdx_x // 7 * 128 + threadIdx_x // 7 + 64], T.float32(0))
-            compute_1[blockIdx_x // 7 * 6272 + threadIdx_x * 7 + blockIdx_x % 7 + 3920] = T.max(conv2d_nchw_1[5] + bias_1[blockIdx_x // 7 * 128 + threadIdx_x // 7 + 80], T.float32(0))
-            compute_1[blockIdx_x // 7 * 6272 + threadIdx_x * 7 + blockIdx_x % 7 + 4704] = T.max(conv2d_nchw_1[6] + bias_1[blockIdx_x // 7 * 128 + threadIdx_x // 7 + 96], T.float32(0))
-            compute_1[blockIdx_x // 7 * 6272 + threadIdx_x * 7 + blockIdx_x % 7 + 5488] = T.max(conv2d_nchw_1[7] + bias_1[blockIdx_x // 7 * 128 + threadIdx_x // 7 + 112], T.float32(0))
+                        kernel_shared_1[threadIdx_x_2 + 1120] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 1120) // 72 * 4608 + cse_var_1 + (threadIdx_x_2 + 40) % 72 // 3 * 3 + (threadIdx_x_2 + 1) % 3]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7] * kernel_shared_1[threadIdx_x // 7 * 144]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7] * kernel_shared_1[threadIdx_x // 7 * 144 + 72]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 1] * kernel_shared_1[threadIdx_x // 7 * 144 + 1]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 1] * kernel_shared_1[threadIdx_x // 7 * 144 + 73]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 2] * kernel_shared_1[threadIdx_x // 7 * 144 + 2]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 2] * kernel_shared_1[threadIdx_x // 7 * 144 + 74]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 9] * kernel_shared_1[threadIdx_x // 7 * 144]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 9] * kernel_shared_1[threadIdx_x // 7 * 144 + 72]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 10] * kernel_shared_1[threadIdx_x // 7 * 144 + 1]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 10] * kernel_shared_1[threadIdx_x // 7 * 144 + 73]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 11] * kernel_shared_1[threadIdx_x // 7 * 144 + 2]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 11] * kernel_shared_1[threadIdx_x // 7 * 144 + 74]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 18] * kernel_shared_1[threadIdx_x // 7 * 144]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 18] * kernel_shared_1[threadIdx_x // 7 * 144 + 72]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 19] * kernel_shared_1[threadIdx_x // 7 * 144 + 1]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 19] * kernel_shared_1[threadIdx_x // 7 * 144 + 73]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 20] * kernel_shared_1[threadIdx_x // 7 * 144 + 2]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 20] * kernel_shared_1[threadIdx_x // 7 * 144 + 74]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 27] * kernel_shared_1[threadIdx_x // 7 * 144]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 27] * kernel_shared_1[threadIdx_x // 7 * 144 + 72]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 28] * kernel_shared_1[threadIdx_x // 7 * 144 + 1]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 28] * kernel_shared_1[threadIdx_x // 7 * 144 + 73]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 29] * kernel_shared_1[threadIdx_x // 7 * 144 + 2]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 29] * kernel_shared_1[threadIdx_x // 7 * 144 + 74]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 36] * kernel_shared_1[threadIdx_x // 7 * 144]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 36] * kernel_shared_1[threadIdx_x // 7 * 144 + 72]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 37] * kernel_shared_1[threadIdx_x // 7 * 144 + 1]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 37] * kernel_shared_1[threadIdx_x // 7 * 144 + 73]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 38] * kernel_shared_1[threadIdx_x // 7 * 144 + 2]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 38] * kernel_shared_1[threadIdx_x // 7 * 144 + 74]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 45] * kernel_shared_1[threadIdx_x // 7 * 144]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 45] * kernel_shared_1[threadIdx_x // 7 * 144 + 72]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 46] * kernel_shared_1[threadIdx_x // 7 * 144 + 1]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 46] * kernel_shared_1[threadIdx_x // 7 * 144 + 73]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 47] * kernel_shared_1[threadIdx_x // 7 * 144 + 2]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 47] * kernel_shared_1[threadIdx_x // 7 * 144 + 74]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 54] * kernel_shared_1[threadIdx_x // 7 * 144]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 54] * kernel_shared_1[threadIdx_x // 7 * 144 + 72]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 55] * kernel_shared_1[threadIdx_x // 7 * 144 + 1]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 55] * kernel_shared_1[threadIdx_x // 7 * 144 + 73]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 56] * kernel_shared_1[threadIdx_x // 7 * 144 + 2]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 56] * kernel_shared_1[threadIdx_x // 7 * 144 + 74]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 9] * kernel_shared_1[threadIdx_x // 7 * 144 + 3]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 9] * kernel_shared_1[threadIdx_x // 7 * 144 + 75]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 10] * kernel_shared_1[threadIdx_x // 7 * 144 + 4]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 10] * kernel_shared_1[threadIdx_x // 7 * 144 + 76]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 11] * kernel_shared_1[threadIdx_x // 7 * 144 + 5]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 11] * kernel_shared_1[threadIdx_x // 7 * 144 + 77]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 18] * kernel_shared_1[threadIdx_x // 7 * 144 + 3]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 18] * kernel_shared_1[threadIdx_x // 7 * 144 + 75]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 19] * kernel_shared_1[threadIdx_x // 7 * 144 + 4]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 19] * kernel_shared_1[threadIdx_x // 7 * 144 + 76]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 20] * kernel_shared_1[threadIdx_x // 7 * 144 + 5]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 20] * kernel_shared_1[threadIdx_x // 7 * 144 + 77]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 27] * kernel_shared_1[threadIdx_x // 7 * 144 + 3]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 27] * kernel_shared_1[threadIdx_x // 7 * 144 + 75]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 28] * kernel_shared_1[threadIdx_x // 7 * 144 + 4]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 28] * kernel_shared_1[threadIdx_x // 7 * 144 + 76]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 29] * kernel_shared_1[threadIdx_x // 7 * 144 + 5]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 29] * kernel_shared_1[threadIdx_x // 7 * 144 + 77]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 36] * kernel_shared_1[threadIdx_x // 7 * 144 + 3]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 36] * kernel_shared_1[threadIdx_x // 7 * 144 + 75]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 37] * kernel_shared_1[threadIdx_x // 7 * 144 + 4]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 37] * kernel_shared_1[threadIdx_x // 7 * 144 + 76]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 38] * kernel_shared_1[threadIdx_x // 7 * 144 + 5]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 38] * kernel_shared_1[threadIdx_x // 7 * 144 + 77]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 45] * kernel_shared_1[threadIdx_x // 7 * 144 + 3]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 45] * kernel_shared_1[threadIdx_x // 7 * 144 + 75]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 46] * kernel_shared_1[threadIdx_x // 7 * 144 + 4]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 46] * kernel_shared_1[threadIdx_x // 7 * 144 + 76]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 47] * kernel_shared_1[threadIdx_x // 7 * 144 + 5]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 47] * kernel_shared_1[threadIdx_x // 7 * 144 + 77]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 54] * kernel_shared_1[threadIdx_x // 7 * 144 + 3]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 54] * kernel_shared_1[threadIdx_x // 7 * 144 + 75]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 55] * kernel_shared_1[threadIdx_x // 7 * 144 + 4]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 55] * kernel_shared_1[threadIdx_x // 7 * 144 + 76]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 56] * kernel_shared_1[threadIdx_x // 7 * 144 + 5]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 56] * kernel_shared_1[threadIdx_x // 7 * 144 + 77]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 63] * kernel_shared_1[threadIdx_x // 7 * 144 + 3]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 63] * kernel_shared_1[threadIdx_x // 7 * 144 + 75]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 64] * kernel_shared_1[threadIdx_x // 7 * 144 + 4]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 64] * kernel_shared_1[threadIdx_x // 7 * 144 + 76]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 65] * kernel_shared_1[threadIdx_x // 7 * 144 + 5]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 65] * kernel_shared_1[threadIdx_x // 7 * 144 + 77]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 18] * kernel_shared_1[threadIdx_x // 7 * 144 + 6]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 18] * kernel_shared_1[threadIdx_x // 7 * 144 + 78]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 19] * kernel_shared_1[threadIdx_x // 7 * 144 + 7]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 19] * kernel_shared_1[threadIdx_x // 7 * 144 + 79]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 20] * kernel_shared_1[threadIdx_x // 7 * 144 + 8]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 20] * kernel_shared_1[threadIdx_x // 7 * 144 + 80]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 27] * kernel_shared_1[threadIdx_x // 7 * 144 + 6]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 27] * kernel_shared_1[threadIdx_x // 7 * 144 + 78]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 28] * kernel_shared_1[threadIdx_x // 7 * 144 + 7]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 28] * kernel_shared_1[threadIdx_x // 7 * 144 + 79]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 29] * kernel_shared_1[threadIdx_x // 7 * 144 + 8]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 29] * kernel_shared_1[threadIdx_x // 7 * 144 + 80]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 36] * kernel_shared_1[threadIdx_x // 7 * 144 + 6]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 36] * kernel_shared_1[threadIdx_x // 7 * 144 + 78]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 37] * kernel_shared_1[threadIdx_x // 7 * 144 + 7]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 37] * kernel_shared_1[threadIdx_x // 7 * 144 + 79]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 38] * kernel_shared_1[threadIdx_x // 7 * 144 + 8]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 38] * kernel_shared_1[threadIdx_x // 7 * 144 + 80]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 45] * kernel_shared_1[threadIdx_x // 7 * 144 + 6]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 45] * kernel_shared_1[threadIdx_x // 7 * 144 + 78]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 46] * kernel_shared_1[threadIdx_x // 7 * 144 + 7]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 46] * kernel_shared_1[threadIdx_x // 7 * 144 + 79]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 47] * kernel_shared_1[threadIdx_x // 7 * 144 + 8]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 47] * kernel_shared_1[threadIdx_x // 7 * 144 + 80]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 54] * kernel_shared_1[threadIdx_x // 7 * 144 + 6]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 54] * kernel_shared_1[threadIdx_x // 7 * 144 + 78]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 55] * kernel_shared_1[threadIdx_x // 7 * 144 + 7]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 55] * kernel_shared_1[threadIdx_x // 7 * 144 + 79]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 56] * kernel_shared_1[threadIdx_x // 7 * 144 + 8]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 56] * kernel_shared_1[threadIdx_x // 7 * 144 + 80]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 63] * kernel_shared_1[threadIdx_x // 7 * 144 + 6]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 63] * kernel_shared_1[threadIdx_x // 7 * 144 + 78]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 64] * kernel_shared_1[threadIdx_x // 7 * 144 + 7]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 64] * kernel_shared_1[threadIdx_x // 7 * 144 + 79]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 65] * kernel_shared_1[threadIdx_x // 7 * 144 + 8]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 65] * kernel_shared_1[threadIdx_x // 7 * 144 + 80]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 72] * kernel_shared_1[threadIdx_x // 7 * 144 + 6]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 72] * kernel_shared_1[threadIdx_x // 7 * 144 + 78]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 73] * kernel_shared_1[threadIdx_x // 7 * 144 + 7]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 73] * kernel_shared_1[threadIdx_x // 7 * 144 + 79]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 74] * kernel_shared_1[threadIdx_x // 7 * 144 + 8]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 74] * kernel_shared_1[threadIdx_x // 7 * 144 + 80]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 81] * kernel_shared_1[threadIdx_x // 7 * 144 + 9]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 81] * kernel_shared_1[threadIdx_x // 7 * 144 + 81]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 82] * kernel_shared_1[threadIdx_x // 7 * 144 + 10]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 82] * kernel_shared_1[threadIdx_x // 7 * 144 + 82]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 83] * kernel_shared_1[threadIdx_x // 7 * 144 + 11]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 83] * kernel_shared_1[threadIdx_x // 7 * 144 + 83]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 90] * kernel_shared_1[threadIdx_x // 7 * 144 + 9]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 90] * kernel_shared_1[threadIdx_x // 7 * 144 + 81]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 91] * kernel_shared_1[threadIdx_x // 7 * 144 + 10]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 91] * kernel_shared_1[threadIdx_x // 7 * 144 + 82]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 92] * kernel_shared_1[threadIdx_x // 7 * 144 + 11]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 92] * kernel_shared_1[threadIdx_x // 7 * 144 + 83]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 99] * kernel_shared_1[threadIdx_x // 7 * 144 + 9]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 99] * kernel_shared_1[threadIdx_x // 7 * 144 + 81]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 100] * kernel_shared_1[threadIdx_x // 7 * 144 + 10]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 100] * kernel_shared_1[threadIdx_x // 7 * 144 + 82]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 101] * kernel_shared_1[threadIdx_x // 7 * 144 + 11]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 101] * kernel_shared_1[threadIdx_x // 7 * 144 + 83]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 108] * kernel_shared_1[threadIdx_x // 7 * 144 + 9]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 108] * kernel_shared_1[threadIdx_x // 7 * 144 + 81]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 109] * kernel_shared_1[threadIdx_x // 7 * 144 + 10]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 109] * kernel_shared_1[threadIdx_x // 7 * 144 + 82]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 110] * kernel_shared_1[threadIdx_x // 7 * 144 + 11]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 110] * kernel_shared_1[threadIdx_x // 7 * 144 + 83]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 117] * kernel_shared_1[threadIdx_x // 7 * 144 + 9]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 117] * kernel_shared_1[threadIdx_x // 7 * 144 + 81]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 118] * kernel_shared_1[threadIdx_x // 7 * 144 + 10]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 118] * kernel_shared_1[threadIdx_x // 7 * 144 + 82]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 119] * kernel_shared_1[threadIdx_x // 7 * 144 + 11]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 119] * kernel_shared_1[threadIdx_x // 7 * 144 + 83]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 126] * kernel_shared_1[threadIdx_x // 7 * 144 + 9]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 126] * kernel_shared_1[threadIdx_x // 7 * 144 + 81]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 127] * kernel_shared_1[threadIdx_x // 7 * 144 + 10]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 127] * kernel_shared_1[threadIdx_x // 7 * 144 + 82]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 128] * kernel_shared_1[threadIdx_x // 7 * 144 + 11]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 128] * kernel_shared_1[threadIdx_x // 7 * 144 + 83]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 135] * kernel_shared_1[threadIdx_x // 7 * 144 + 9]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 135] * kernel_shared_1[threadIdx_x // 7 * 144 + 81]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 136] * kernel_shared_1[threadIdx_x // 7 * 144 + 10]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 136] * kernel_shared_1[threadIdx_x // 7 * 144 + 82]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 137] * kernel_shared_1[threadIdx_x // 7 * 144 + 11]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 137] * kernel_shared_1[threadIdx_x // 7 * 144 + 83]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 90] * kernel_shared_1[threadIdx_x // 7 * 144 + 12]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 90] * kernel_shared_1[threadIdx_x // 7 * 144 + 84]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 91] * kernel_shared_1[threadIdx_x // 7 * 144 + 13]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 91] * kernel_shared_1[threadIdx_x // 7 * 144 + 85]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 92] * kernel_shared_1[threadIdx_x // 7 * 144 + 14]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 92] * kernel_shared_1[threadIdx_x // 7 * 144 + 86]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 99] * kernel_shared_1[threadIdx_x // 7 * 144 + 12]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 99] * kernel_shared_1[threadIdx_x // 7 * 144 + 84]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 100] * kernel_shared_1[threadIdx_x // 7 * 144 + 13]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 100] * kernel_shared_1[threadIdx_x // 7 * 144 + 85]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 101] * kernel_shared_1[threadIdx_x // 7 * 144 + 14]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 101] * kernel_shared_1[threadIdx_x // 7 * 144 + 86]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 108] * kernel_shared_1[threadIdx_x // 7 * 144 + 12]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 108] * kernel_shared_1[threadIdx_x // 7 * 144 + 84]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 109] * kernel_shared_1[threadIdx_x // 7 * 144 + 13]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 109] * kernel_shared_1[threadIdx_x // 7 * 144 + 85]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 110] * kernel_shared_1[threadIdx_x // 7 * 144 + 14]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 110] * kernel_shared_1[threadIdx_x // 7 * 144 + 86]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 117] * kernel_shared_1[threadIdx_x // 7 * 144 + 12]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 117] * kernel_shared_1[threadIdx_x // 7 * 144 + 84]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 118] * kernel_shared_1[threadIdx_x // 7 * 144 + 13]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 118] * kernel_shared_1[threadIdx_x // 7 * 144 + 85]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 119] * kernel_shared_1[threadIdx_x // 7 * 144 + 14]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 119] * kernel_shared_1[threadIdx_x // 7 * 144 + 86]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 126] * kernel_shared_1[threadIdx_x // 7 * 144 + 12]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 126] * kernel_shared_1[threadIdx_x // 7 * 144 + 84]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 127] * kernel_shared_1[threadIdx_x // 7 * 144 + 13]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 127] * kernel_shared_1[threadIdx_x // 7 * 144 + 85]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 128] * kernel_shared_1[threadIdx_x // 7 * 144 + 14]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 128] * kernel_shared_1[threadIdx_x // 7 * 144 + 86]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 135] * kernel_shared_1[threadIdx_x // 7 * 144 + 12]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 135] * kernel_shared_1[threadIdx_x // 7 * 144 + 84]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 136] * kernel_shared_1[threadIdx_x // 7 * 144 + 13]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 136] * kernel_shared_1[threadIdx_x // 7 * 144 + 85]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 137] * kernel_shared_1[threadIdx_x // 7 * 144 + 14]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 137] * kernel_shared_1[threadIdx_x // 7 * 144 + 86]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 144] * kernel_shared_1[threadIdx_x // 7 * 144 + 12]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 144] * kernel_shared_1[threadIdx_x // 7 * 144 + 84]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 145] * kernel_shared_1[threadIdx_x // 7 * 144 + 13]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 145] * kernel_shared_1[threadIdx_x // 7 * 144 + 85]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 146] * kernel_shared_1[threadIdx_x // 7 * 144 + 14]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 146] * kernel_shared_1[threadIdx_x // 7 * 144 + 86]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 99] * kernel_shared_1[threadIdx_x // 7 * 144 + 15]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 99] * kernel_shared_1[threadIdx_x // 7 * 144 + 87]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 100] * kernel_shared_1[threadIdx_x // 7 * 144 + 16]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 100] * kernel_shared_1[threadIdx_x // 7 * 144 + 88]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 101] * kernel_shared_1[threadIdx_x // 7 * 144 + 17]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 101] * kernel_shared_1[threadIdx_x // 7 * 144 + 89]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 108] * kernel_shared_1[threadIdx_x // 7 * 144 + 15]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 108] * kernel_shared_1[threadIdx_x // 7 * 144 + 87]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 109] * kernel_shared_1[threadIdx_x // 7 * 144 + 16]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 109] * kernel_shared_1[threadIdx_x // 7 * 144 + 88]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 110] * kernel_shared_1[threadIdx_x // 7 * 144 + 17]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 110] * kernel_shared_1[threadIdx_x // 7 * 144 + 89]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 117] * kernel_shared_1[threadIdx_x // 7 * 144 + 15]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 117] * kernel_shared_1[threadIdx_x // 7 * 144 + 87]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 118] * kernel_shared_1[threadIdx_x // 7 * 144 + 16]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 118] * kernel_shared_1[threadIdx_x // 7 * 144 + 88]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 119] * kernel_shared_1[threadIdx_x // 7 * 144 + 17]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 119] * kernel_shared_1[threadIdx_x // 7 * 144 + 89]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 126] * kernel_shared_1[threadIdx_x // 7 * 144 + 15]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 126] * kernel_shared_1[threadIdx_x // 7 * 144 + 87]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 127] * kernel_shared_1[threadIdx_x // 7 * 144 + 16]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 127] * kernel_shared_1[threadIdx_x // 7 * 144 + 88]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 128] * kernel_shared_1[threadIdx_x // 7 * 144 + 17]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 128] * kernel_shared_1[threadIdx_x // 7 * 144 + 89]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 135] * kernel_shared_1[threadIdx_x // 7 * 144 + 15]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 135] * kernel_shared_1[threadIdx_x // 7 * 144 + 87]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 136] * kernel_shared_1[threadIdx_x // 7 * 144 + 16]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 136] * kernel_shared_1[threadIdx_x // 7 * 144 + 88]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 137] * kernel_shared_1[threadIdx_x // 7 * 144 + 17]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 137] * kernel_shared_1[threadIdx_x // 7 * 144 + 89]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 144] * kernel_shared_1[threadIdx_x // 7 * 144 + 15]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 144] * kernel_shared_1[threadIdx_x // 7 * 144 + 87]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 145] * kernel_shared_1[threadIdx_x // 7 * 144 + 16]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 145] * kernel_shared_1[threadIdx_x // 7 * 144 + 88]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 146] * kernel_shared_1[threadIdx_x // 7 * 144 + 17]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 146] * kernel_shared_1[threadIdx_x // 7 * 144 + 89]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 153] * kernel_shared_1[threadIdx_x // 7 * 144 + 15]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 153] * kernel_shared_1[threadIdx_x // 7 * 144 + 87]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 154] * kernel_shared_1[threadIdx_x // 7 * 144 + 16]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 154] * kernel_shared_1[threadIdx_x // 7 * 144 + 88]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 155] * kernel_shared_1[threadIdx_x // 7 * 144 + 17]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 155] * kernel_shared_1[threadIdx_x // 7 * 144 + 89]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 162] * kernel_shared_1[threadIdx_x // 7 * 144 + 18]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 162] * kernel_shared_1[threadIdx_x // 7 * 144 + 90]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 163] * kernel_shared_1[threadIdx_x // 7 * 144 + 19]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 163] * kernel_shared_1[threadIdx_x // 7 * 144 + 91]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 164] * kernel_shared_1[threadIdx_x // 7 * 144 + 20]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 164] * kernel_shared_1[threadIdx_x // 7 * 144 + 92]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 171] * kernel_shared_1[threadIdx_x // 7 * 144 + 18]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 171] * kernel_shared_1[threadIdx_x // 7 * 144 + 90]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 172] * kernel_shared_1[threadIdx_x // 7 * 144 + 19]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 172] * kernel_shared_1[threadIdx_x // 7 * 144 + 91]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 173] * kernel_shared_1[threadIdx_x // 7 * 144 + 20]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 173] * kernel_shared_1[threadIdx_x // 7 * 144 + 92]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 180] * kernel_shared_1[threadIdx_x // 7 * 144 + 18]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 180] * kernel_shared_1[threadIdx_x // 7 * 144 + 90]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 181] * kernel_shared_1[threadIdx_x // 7 * 144 + 19]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 181] * kernel_shared_1[threadIdx_x // 7 * 144 + 91]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 182] * kernel_shared_1[threadIdx_x // 7 * 144 + 20]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 182] * kernel_shared_1[threadIdx_x // 7 * 144 + 92]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 189] * kernel_shared_1[threadIdx_x // 7 * 144 + 18]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 189] * kernel_shared_1[threadIdx_x // 7 * 144 + 90]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 190] * kernel_shared_1[threadIdx_x // 7 * 144 + 19]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 190] * kernel_shared_1[threadIdx_x // 7 * 144 + 91]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 191] * kernel_shared_1[threadIdx_x // 7 * 144 + 20]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 191] * kernel_shared_1[threadIdx_x // 7 * 144 + 92]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 198] * kernel_shared_1[threadIdx_x // 7 * 144 + 18]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 198] * kernel_shared_1[threadIdx_x // 7 * 144 + 90]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 199] * kernel_shared_1[threadIdx_x // 7 * 144 + 19]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 199] * kernel_shared_1[threadIdx_x // 7 * 144 + 91]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 200] * kernel_shared_1[threadIdx_x // 7 * 144 + 20]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 200] * kernel_shared_1[threadIdx_x // 7 * 144 + 92]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 207] * kernel_shared_1[threadIdx_x // 7 * 144 + 18]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 207] * kernel_shared_1[threadIdx_x // 7 * 144 + 90]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 208] * kernel_shared_1[threadIdx_x // 7 * 144 + 19]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 208] * kernel_shared_1[threadIdx_x // 7 * 144 + 91]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 209] * kernel_shared_1[threadIdx_x // 7 * 144 + 20]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 209] * kernel_shared_1[threadIdx_x // 7 * 144 + 92]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 216] * kernel_shared_1[threadIdx_x // 7 * 144 + 18]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 216] * kernel_shared_1[threadIdx_x // 7 * 144 + 90]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 217] * kernel_shared_1[threadIdx_x // 7 * 144 + 19]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 217] * kernel_shared_1[threadIdx_x // 7 * 144 + 91]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 218] * kernel_shared_1[threadIdx_x // 7 * 144 + 20]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 218] * kernel_shared_1[threadIdx_x // 7 * 144 + 92]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 171] * kernel_shared_1[threadIdx_x // 7 * 144 + 21]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 171] * kernel_shared_1[threadIdx_x // 7 * 144 + 93]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 172] * kernel_shared_1[threadIdx_x // 7 * 144 + 22]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 172] * kernel_shared_1[threadIdx_x // 7 * 144 + 94]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 173] * kernel_shared_1[threadIdx_x // 7 * 144 + 23]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 173] * kernel_shared_1[threadIdx_x // 7 * 144 + 95]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 180] * kernel_shared_1[threadIdx_x // 7 * 144 + 21]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 180] * kernel_shared_1[threadIdx_x // 7 * 144 + 93]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 181] * kernel_shared_1[threadIdx_x // 7 * 144 + 22]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 181] * kernel_shared_1[threadIdx_x // 7 * 144 + 94]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 182] * kernel_shared_1[threadIdx_x // 7 * 144 + 23]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 182] * kernel_shared_1[threadIdx_x // 7 * 144 + 95]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 189] * kernel_shared_1[threadIdx_x // 7 * 144 + 21]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 189] * kernel_shared_1[threadIdx_x // 7 * 144 + 93]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 190] * kernel_shared_1[threadIdx_x // 7 * 144 + 22]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 190] * kernel_shared_1[threadIdx_x // 7 * 144 + 94]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 191] * kernel_shared_1[threadIdx_x // 7 * 144 + 23]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 191] * kernel_shared_1[threadIdx_x // 7 * 144 + 95]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 198] * kernel_shared_1[threadIdx_x // 7 * 144 + 21]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 198] * kernel_shared_1[threadIdx_x // 7 * 144 + 93]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 199] * kernel_shared_1[threadIdx_x // 7 * 144 + 22]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 199] * kernel_shared_1[threadIdx_x // 7 * 144 + 94]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 200] * kernel_shared_1[threadIdx_x // 7 * 144 + 23]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 200] * kernel_shared_1[threadIdx_x // 7 * 144 + 95]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 207] * kernel_shared_1[threadIdx_x // 7 * 144 + 21]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 207] * kernel_shared_1[threadIdx_x // 7 * 144 + 93]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 208] * kernel_shared_1[threadIdx_x // 7 * 144 + 22]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 208] * kernel_shared_1[threadIdx_x // 7 * 144 + 94]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 209] * kernel_shared_1[threadIdx_x // 7 * 144 + 23]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 209] * kernel_shared_1[threadIdx_x // 7 * 144 + 95]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 216] * kernel_shared_1[threadIdx_x // 7 * 144 + 21]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 216] * kernel_shared_1[threadIdx_x // 7 * 144 + 93]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 217] * kernel_shared_1[threadIdx_x // 7 * 144 + 22]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 217] * kernel_shared_1[threadIdx_x // 7 * 144 + 94]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 218] * kernel_shared_1[threadIdx_x // 7 * 144 + 23]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 218] * kernel_shared_1[threadIdx_x // 7 * 144 + 95]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 225] * kernel_shared_1[threadIdx_x // 7 * 144 + 21]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 225] * kernel_shared_1[threadIdx_x // 7 * 144 + 93]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 226] * kernel_shared_1[threadIdx_x // 7 * 144 + 22]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 226] * kernel_shared_1[threadIdx_x // 7 * 144 + 94]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 227] * kernel_shared_1[threadIdx_x // 7 * 144 + 23]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 227] * kernel_shared_1[threadIdx_x // 7 * 144 + 95]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 180] * kernel_shared_1[threadIdx_x // 7 * 144 + 24]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 180] * kernel_shared_1[threadIdx_x // 7 * 144 + 96]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 181] * kernel_shared_1[threadIdx_x // 7 * 144 + 25]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 181] * kernel_shared_1[threadIdx_x // 7 * 144 + 97]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 182] * kernel_shared_1[threadIdx_x // 7 * 144 + 26]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 182] * kernel_shared_1[threadIdx_x // 7 * 144 + 98]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 189] * kernel_shared_1[threadIdx_x // 7 * 144 + 24]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 189] * kernel_shared_1[threadIdx_x // 7 * 144 + 96]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 190] * kernel_shared_1[threadIdx_x // 7 * 144 + 25]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 190] * kernel_shared_1[threadIdx_x // 7 * 144 + 97]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 191] * kernel_shared_1[threadIdx_x // 7 * 144 + 26]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 191] * kernel_shared_1[threadIdx_x // 7 * 144 + 98]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 198] * kernel_shared_1[threadIdx_x // 7 * 144 + 24]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 198] * kernel_shared_1[threadIdx_x // 7 * 144 + 96]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 199] * kernel_shared_1[threadIdx_x // 7 * 144 + 25]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 199] * kernel_shared_1[threadIdx_x // 7 * 144 + 97]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 200] * kernel_shared_1[threadIdx_x // 7 * 144 + 26]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 200] * kernel_shared_1[threadIdx_x // 7 * 144 + 98]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 207] * kernel_shared_1[threadIdx_x // 7 * 144 + 24]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 207] * kernel_shared_1[threadIdx_x // 7 * 144 + 96]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 208] * kernel_shared_1[threadIdx_x // 7 * 144 + 25]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 208] * kernel_shared_1[threadIdx_x // 7 * 144 + 97]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 209] * kernel_shared_1[threadIdx_x // 7 * 144 + 26]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 209] * kernel_shared_1[threadIdx_x // 7 * 144 + 98]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 216] * kernel_shared_1[threadIdx_x // 7 * 144 + 24]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 216] * kernel_shared_1[threadIdx_x // 7 * 144 + 96]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 217] * kernel_shared_1[threadIdx_x // 7 * 144 + 25]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 217] * kernel_shared_1[threadIdx_x // 7 * 144 + 97]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 218] * kernel_shared_1[threadIdx_x // 7 * 144 + 26]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 218] * kernel_shared_1[threadIdx_x // 7 * 144 + 98]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 225] * kernel_shared_1[threadIdx_x // 7 * 144 + 24]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 225] * kernel_shared_1[threadIdx_x // 7 * 144 + 96]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 226] * kernel_shared_1[threadIdx_x // 7 * 144 + 25]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 226] * kernel_shared_1[threadIdx_x // 7 * 144 + 97]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 227] * kernel_shared_1[threadIdx_x // 7 * 144 + 26]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 227] * kernel_shared_1[threadIdx_x // 7 * 144 + 98]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 234] * kernel_shared_1[threadIdx_x // 7 * 144 + 24]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 234] * kernel_shared_1[threadIdx_x // 7 * 144 + 96]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 235] * kernel_shared_1[threadIdx_x // 7 * 144 + 25]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 235] * kernel_shared_1[threadIdx_x // 7 * 144 + 97]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 236] * kernel_shared_1[threadIdx_x // 7 * 144 + 26]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 236] * kernel_shared_1[threadIdx_x // 7 * 144 + 98]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 243] * kernel_shared_1[threadIdx_x // 7 * 144 + 27]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 243] * kernel_shared_1[threadIdx_x // 7 * 144 + 99]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 244] * kernel_shared_1[threadIdx_x // 7 * 144 + 28]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 244] * kernel_shared_1[threadIdx_x // 7 * 144 + 100]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 245] * kernel_shared_1[threadIdx_x // 7 * 144 + 29]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 245] * kernel_shared_1[threadIdx_x // 7 * 144 + 101]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 252] * kernel_shared_1[threadIdx_x // 7 * 144 + 27]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 252] * kernel_shared_1[threadIdx_x // 7 * 144 + 99]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 253] * kernel_shared_1[threadIdx_x // 7 * 144 + 28]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 253] * kernel_shared_1[threadIdx_x // 7 * 144 + 100]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 254] * kernel_shared_1[threadIdx_x // 7 * 144 + 29]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 254] * kernel_shared_1[threadIdx_x // 7 * 144 + 101]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 261] * kernel_shared_1[threadIdx_x // 7 * 144 + 27]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 261] * kernel_shared_1[threadIdx_x // 7 * 144 + 99]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 262] * kernel_shared_1[threadIdx_x // 7 * 144 + 28]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 262] * kernel_shared_1[threadIdx_x // 7 * 144 + 100]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 263] * kernel_shared_1[threadIdx_x // 7 * 144 + 29]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 263] * kernel_shared_1[threadIdx_x // 7 * 144 + 101]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 270] * kernel_shared_1[threadIdx_x // 7 * 144 + 27]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 270] * kernel_shared_1[threadIdx_x // 7 * 144 + 99]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 271] * kernel_shared_1[threadIdx_x // 7 * 144 + 28]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 271] * kernel_shared_1[threadIdx_x // 7 * 144 + 100]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 272] * kernel_shared_1[threadIdx_x // 7 * 144 + 29]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 272] * kernel_shared_1[threadIdx_x // 7 * 144 + 101]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 279] * kernel_shared_1[threadIdx_x // 7 * 144 + 27]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 279] * kernel_shared_1[threadIdx_x // 7 * 144 + 99]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 280] * kernel_shared_1[threadIdx_x // 7 * 144 + 28]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 280] * kernel_shared_1[threadIdx_x // 7 * 144 + 100]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 281] * kernel_shared_1[threadIdx_x // 7 * 144 + 29]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 281] * kernel_shared_1[threadIdx_x // 7 * 144 + 101]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 288] * kernel_shared_1[threadIdx_x // 7 * 144 + 27]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 288] * kernel_shared_1[threadIdx_x // 7 * 144 + 99]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 289] * kernel_shared_1[threadIdx_x // 7 * 144 + 28]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 289] * kernel_shared_1[threadIdx_x // 7 * 144 + 100]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 290] * kernel_shared_1[threadIdx_x // 7 * 144 + 29]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 290] * kernel_shared_1[threadIdx_x // 7 * 144 + 101]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 297] * kernel_shared_1[threadIdx_x // 7 * 144 + 27]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 297] * kernel_shared_1[threadIdx_x // 7 * 144 + 99]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 298] * kernel_shared_1[threadIdx_x // 7 * 144 + 28]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 298] * kernel_shared_1[threadIdx_x // 7 * 144 + 100]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 299] * kernel_shared_1[threadIdx_x // 7 * 144 + 29]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 299] * kernel_shared_1[threadIdx_x // 7 * 144 + 101]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 252] * kernel_shared_1[threadIdx_x // 7 * 144 + 30]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 252] * kernel_shared_1[threadIdx_x // 7 * 144 + 102]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 253] * kernel_shared_1[threadIdx_x // 7 * 144 + 31]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 253] * kernel_shared_1[threadIdx_x // 7 * 144 + 103]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 254] * kernel_shared_1[threadIdx_x // 7 * 144 + 32]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 254] * kernel_shared_1[threadIdx_x // 7 * 144 + 104]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 261] * kernel_shared_1[threadIdx_x // 7 * 144 + 30]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 261] * kernel_shared_1[threadIdx_x // 7 * 144 + 102]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 262] * kernel_shared_1[threadIdx_x // 7 * 144 + 31]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 262] * kernel_shared_1[threadIdx_x // 7 * 144 + 103]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 263] * kernel_shared_1[threadIdx_x // 7 * 144 + 32]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 263] * kernel_shared_1[threadIdx_x // 7 * 144 + 104]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 270] * kernel_shared_1[threadIdx_x // 7 * 144 + 30]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 270] * kernel_shared_1[threadIdx_x // 7 * 144 + 102]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 271] * kernel_shared_1[threadIdx_x // 7 * 144 + 31]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 271] * kernel_shared_1[threadIdx_x // 7 * 144 + 103]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 272] * kernel_shared_1[threadIdx_x // 7 * 144 + 32]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 272] * kernel_shared_1[threadIdx_x // 7 * 144 + 104]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 279] * kernel_shared_1[threadIdx_x // 7 * 144 + 30]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 279] * kernel_shared_1[threadIdx_x // 7 * 144 + 102]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 280] * kernel_shared_1[threadIdx_x // 7 * 144 + 31]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 280] * kernel_shared_1[threadIdx_x // 7 * 144 + 103]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 281] * kernel_shared_1[threadIdx_x // 7 * 144 + 32]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 281] * kernel_shared_1[threadIdx_x // 7 * 144 + 104]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 288] * kernel_shared_1[threadIdx_x // 7 * 144 + 30]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 288] * kernel_shared_1[threadIdx_x // 7 * 144 + 102]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 289] * kernel_shared_1[threadIdx_x // 7 * 144 + 31]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 289] * kernel_shared_1[threadIdx_x // 7 * 144 + 103]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 290] * kernel_shared_1[threadIdx_x // 7 * 144 + 32]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 290] * kernel_shared_1[threadIdx_x // 7 * 144 + 104]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 297] * kernel_shared_1[threadIdx_x // 7 * 144 + 30]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 297] * kernel_shared_1[threadIdx_x // 7 * 144 + 102]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 298] * kernel_shared_1[threadIdx_x // 7 * 144 + 31]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 298] * kernel_shared_1[threadIdx_x // 7 * 144 + 103]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 299] * kernel_shared_1[threadIdx_x // 7 * 144 + 32]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 299] * kernel_shared_1[threadIdx_x // 7 * 144 + 104]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 306] * kernel_shared_1[threadIdx_x // 7 * 144 + 30]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 306] * kernel_shared_1[threadIdx_x // 7 * 144 + 102]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 307] * kernel_shared_1[threadIdx_x // 7 * 144 + 31]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 307] * kernel_shared_1[threadIdx_x // 7 * 144 + 103]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 308] * kernel_shared_1[threadIdx_x // 7 * 144 + 32]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 308] * kernel_shared_1[threadIdx_x // 7 * 144 + 104]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 261] * kernel_shared_1[threadIdx_x // 7 * 144 + 33]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 261] * kernel_shared_1[threadIdx_x // 7 * 144 + 105]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 262] * kernel_shared_1[threadIdx_x // 7 * 144 + 34]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 262] * kernel_shared_1[threadIdx_x // 7 * 144 + 106]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 263] * kernel_shared_1[threadIdx_x // 7 * 144 + 35]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 263] * kernel_shared_1[threadIdx_x // 7 * 144 + 107]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 270] * kernel_shared_1[threadIdx_x // 7 * 144 + 33]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 270] * kernel_shared_1[threadIdx_x // 7 * 144 + 105]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 271] * kernel_shared_1[threadIdx_x // 7 * 144 + 34]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 271] * kernel_shared_1[threadIdx_x // 7 * 144 + 106]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 272] * kernel_shared_1[threadIdx_x // 7 * 144 + 35]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 272] * kernel_shared_1[threadIdx_x // 7 * 144 + 107]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 279] * kernel_shared_1[threadIdx_x // 7 * 144 + 33]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 279] * kernel_shared_1[threadIdx_x // 7 * 144 + 105]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 280] * kernel_shared_1[threadIdx_x // 7 * 144 + 34]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 280] * kernel_shared_1[threadIdx_x // 7 * 144 + 106]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 281] * kernel_shared_1[threadIdx_x // 7 * 144 + 35]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 281] * kernel_shared_1[threadIdx_x // 7 * 144 + 107]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 288] * kernel_shared_1[threadIdx_x // 7 * 144 + 33]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 288] * kernel_shared_1[threadIdx_x // 7 * 144 + 105]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 289] * kernel_shared_1[threadIdx_x // 7 * 144 + 34]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 289] * kernel_shared_1[threadIdx_x // 7 * 144 + 106]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 290] * kernel_shared_1[threadIdx_x // 7 * 144 + 35]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 290] * kernel_shared_1[threadIdx_x // 7 * 144 + 107]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 297] * kernel_shared_1[threadIdx_x // 7 * 144 + 33]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 297] * kernel_shared_1[threadIdx_x // 7 * 144 + 105]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 298] * kernel_shared_1[threadIdx_x // 7 * 144 + 34]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 298] * kernel_shared_1[threadIdx_x // 7 * 144 + 106]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 299] * kernel_shared_1[threadIdx_x // 7 * 144 + 35]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 299] * kernel_shared_1[threadIdx_x // 7 * 144 + 107]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 306] * kernel_shared_1[threadIdx_x // 7 * 144 + 33]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 306] * kernel_shared_1[threadIdx_x // 7 * 144 + 105]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 307] * kernel_shared_1[threadIdx_x // 7 * 144 + 34]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 307] * kernel_shared_1[threadIdx_x // 7 * 144 + 106]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 308] * kernel_shared_1[threadIdx_x // 7 * 144 + 35]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 308] * kernel_shared_1[threadIdx_x // 7 * 144 + 107]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 315] * kernel_shared_1[threadIdx_x // 7 * 144 + 33]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 315] * kernel_shared_1[threadIdx_x // 7 * 144 + 105]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 316] * kernel_shared_1[threadIdx_x // 7 * 144 + 34]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 316] * kernel_shared_1[threadIdx_x // 7 * 144 + 106]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 317] * kernel_shared_1[threadIdx_x // 7 * 144 + 35]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 317] * kernel_shared_1[threadIdx_x // 7 * 144 + 107]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 324] * kernel_shared_1[threadIdx_x // 7 * 144 + 36]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 324] * kernel_shared_1[threadIdx_x // 7 * 144 + 108]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 325] * kernel_shared_1[threadIdx_x // 7 * 144 + 37]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 325] * kernel_shared_1[threadIdx_x // 7 * 144 + 109]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 326] * kernel_shared_1[threadIdx_x // 7 * 144 + 38]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 326] * kernel_shared_1[threadIdx_x // 7 * 144 + 110]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 333] * kernel_shared_1[threadIdx_x // 7 * 144 + 36]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 333] * kernel_shared_1[threadIdx_x // 7 * 144 + 108]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 334] * kernel_shared_1[threadIdx_x // 7 * 144 + 37]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 334] * kernel_shared_1[threadIdx_x // 7 * 144 + 109]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 335] * kernel_shared_1[threadIdx_x // 7 * 144 + 38]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 335] * kernel_shared_1[threadIdx_x // 7 * 144 + 110]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 342] * kernel_shared_1[threadIdx_x // 7 * 144 + 36]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 342] * kernel_shared_1[threadIdx_x // 7 * 144 + 108]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 343] * kernel_shared_1[threadIdx_x // 7 * 144 + 37]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 343] * kernel_shared_1[threadIdx_x // 7 * 144 + 109]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 344] * kernel_shared_1[threadIdx_x // 7 * 144 + 38]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 344] * kernel_shared_1[threadIdx_x // 7 * 144 + 110]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 351] * kernel_shared_1[threadIdx_x // 7 * 144 + 36]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 351] * kernel_shared_1[threadIdx_x // 7 * 144 + 108]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 352] * kernel_shared_1[threadIdx_x // 7 * 144 + 37]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 352] * kernel_shared_1[threadIdx_x // 7 * 144 + 109]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 353] * kernel_shared_1[threadIdx_x // 7 * 144 + 38]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 353] * kernel_shared_1[threadIdx_x // 7 * 144 + 110]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 360] * kernel_shared_1[threadIdx_x // 7 * 144 + 36]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 360] * kernel_shared_1[threadIdx_x // 7 * 144 + 108]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 361] * kernel_shared_1[threadIdx_x // 7 * 144 + 37]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 361] * kernel_shared_1[threadIdx_x // 7 * 144 + 109]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 362] * kernel_shared_1[threadIdx_x // 7 * 144 + 38]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 362] * kernel_shared_1[threadIdx_x // 7 * 144 + 110]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 369] * kernel_shared_1[threadIdx_x // 7 * 144 + 36]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 369] * kernel_shared_1[threadIdx_x // 7 * 144 + 108]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 370] * kernel_shared_1[threadIdx_x // 7 * 144 + 37]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 370] * kernel_shared_1[threadIdx_x // 7 * 144 + 109]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 371] * kernel_shared_1[threadIdx_x // 7 * 144 + 38]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 371] * kernel_shared_1[threadIdx_x // 7 * 144 + 110]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 378] * kernel_shared_1[threadIdx_x // 7 * 144 + 36]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 378] * kernel_shared_1[threadIdx_x // 7 * 144 + 108]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 379] * kernel_shared_1[threadIdx_x // 7 * 144 + 37]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 379] * kernel_shared_1[threadIdx_x // 7 * 144 + 109]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 380] * kernel_shared_1[threadIdx_x // 7 * 144 + 38]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 380] * kernel_shared_1[threadIdx_x // 7 * 144 + 110]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 333] * kernel_shared_1[threadIdx_x // 7 * 144 + 39]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 333] * kernel_shared_1[threadIdx_x // 7 * 144 + 111]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 334] * kernel_shared_1[threadIdx_x // 7 * 144 + 40]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 334] * kernel_shared_1[threadIdx_x // 7 * 144 + 112]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 335] * kernel_shared_1[threadIdx_x // 7 * 144 + 41]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 335] * kernel_shared_1[threadIdx_x // 7 * 144 + 113]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 342] * kernel_shared_1[threadIdx_x // 7 * 144 + 39]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 342] * kernel_shared_1[threadIdx_x // 7 * 144 + 111]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 343] * kernel_shared_1[threadIdx_x // 7 * 144 + 40]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 343] * kernel_shared_1[threadIdx_x // 7 * 144 + 112]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 344] * kernel_shared_1[threadIdx_x // 7 * 144 + 41]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 344] * kernel_shared_1[threadIdx_x // 7 * 144 + 113]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 351] * kernel_shared_1[threadIdx_x // 7 * 144 + 39]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 351] * kernel_shared_1[threadIdx_x // 7 * 144 + 111]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 352] * kernel_shared_1[threadIdx_x // 7 * 144 + 40]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 352] * kernel_shared_1[threadIdx_x // 7 * 144 + 112]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 353] * kernel_shared_1[threadIdx_x // 7 * 144 + 41]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 353] * kernel_shared_1[threadIdx_x // 7 * 144 + 113]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 360] * kernel_shared_1[threadIdx_x // 7 * 144 + 39]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 360] * kernel_shared_1[threadIdx_x // 7 * 144 + 111]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 361] * kernel_shared_1[threadIdx_x // 7 * 144 + 40]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 361] * kernel_shared_1[threadIdx_x // 7 * 144 + 112]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 362] * kernel_shared_1[threadIdx_x // 7 * 144 + 41]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 362] * kernel_shared_1[threadIdx_x // 7 * 144 + 113]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 369] * kernel_shared_1[threadIdx_x // 7 * 144 + 39]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 369] * kernel_shared_1[threadIdx_x // 7 * 144 + 111]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 370] * kernel_shared_1[threadIdx_x // 7 * 144 + 40]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 370] * kernel_shared_1[threadIdx_x // 7 * 144 + 112]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 371] * kernel_shared_1[threadIdx_x // 7 * 144 + 41]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 371] * kernel_shared_1[threadIdx_x // 7 * 144 + 113]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 378] * kernel_shared_1[threadIdx_x // 7 * 144 + 39]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 378] * kernel_shared_1[threadIdx_x // 7 * 144 + 111]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 379] * kernel_shared_1[threadIdx_x // 7 * 144 + 40]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 379] * kernel_shared_1[threadIdx_x // 7 * 144 + 112]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 380] * kernel_shared_1[threadIdx_x // 7 * 144 + 41]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 380] * kernel_shared_1[threadIdx_x // 7 * 144 + 113]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 387] * kernel_shared_1[threadIdx_x // 7 * 144 + 39]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 387] * kernel_shared_1[threadIdx_x // 7 * 144 + 111]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 388] * kernel_shared_1[threadIdx_x // 7 * 144 + 40]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 388] * kernel_shared_1[threadIdx_x // 7 * 144 + 112]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 389] * kernel_shared_1[threadIdx_x // 7 * 144 + 41]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 389] * kernel_shared_1[threadIdx_x // 7 * 144 + 113]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 342] * kernel_shared_1[threadIdx_x // 7 * 144 + 42]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 342] * kernel_shared_1[threadIdx_x // 7 * 144 + 114]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 343] * kernel_shared_1[threadIdx_x // 7 * 144 + 43]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 343] * kernel_shared_1[threadIdx_x // 7 * 144 + 115]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 344] * kernel_shared_1[threadIdx_x // 7 * 144 + 44]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 344] * kernel_shared_1[threadIdx_x // 7 * 144 + 116]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 351] * kernel_shared_1[threadIdx_x // 7 * 144 + 42]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 351] * kernel_shared_1[threadIdx_x // 7 * 144 + 114]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 352] * kernel_shared_1[threadIdx_x // 7 * 144 + 43]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 352] * kernel_shared_1[threadIdx_x // 7 * 144 + 115]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 353] * kernel_shared_1[threadIdx_x // 7 * 144 + 44]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 353] * kernel_shared_1[threadIdx_x // 7 * 144 + 116]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 360] * kernel_shared_1[threadIdx_x // 7 * 144 + 42]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 360] * kernel_shared_1[threadIdx_x // 7 * 144 + 114]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 361] * kernel_shared_1[threadIdx_x // 7 * 144 + 43]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 361] * kernel_shared_1[threadIdx_x // 7 * 144 + 115]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 362] * kernel_shared_1[threadIdx_x // 7 * 144 + 44]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 362] * kernel_shared_1[threadIdx_x // 7 * 144 + 116]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 369] * kernel_shared_1[threadIdx_x // 7 * 144 + 42]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 369] * kernel_shared_1[threadIdx_x // 7 * 144 + 114]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 370] * kernel_shared_1[threadIdx_x // 7 * 144 + 43]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 370] * kernel_shared_1[threadIdx_x // 7 * 144 + 115]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 371] * kernel_shared_1[threadIdx_x // 7 * 144 + 44]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 371] * kernel_shared_1[threadIdx_x // 7 * 144 + 116]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 378] * kernel_shared_1[threadIdx_x // 7 * 144 + 42]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 378] * kernel_shared_1[threadIdx_x // 7 * 144 + 114]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 379] * kernel_shared_1[threadIdx_x // 7 * 144 + 43]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 379] * kernel_shared_1[threadIdx_x // 7 * 144 + 115]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 380] * kernel_shared_1[threadIdx_x // 7 * 144 + 44]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 380] * kernel_shared_1[threadIdx_x // 7 * 144 + 116]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 387] * kernel_shared_1[threadIdx_x // 7 * 144 + 42]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 387] * kernel_shared_1[threadIdx_x // 7 * 144 + 114]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 388] * kernel_shared_1[threadIdx_x // 7 * 144 + 43]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 388] * kernel_shared_1[threadIdx_x // 7 * 144 + 115]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 389] * kernel_shared_1[threadIdx_x // 7 * 144 + 44]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 389] * kernel_shared_1[threadIdx_x // 7 * 144 + 116]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 396] * kernel_shared_1[threadIdx_x // 7 * 144 + 42]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 396] * kernel_shared_1[threadIdx_x // 7 * 144 + 114]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 397] * kernel_shared_1[threadIdx_x // 7 * 144 + 43]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 397] * kernel_shared_1[threadIdx_x // 7 * 144 + 115]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 398] * kernel_shared_1[threadIdx_x // 7 * 144 + 44]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 398] * kernel_shared_1[threadIdx_x // 7 * 144 + 116]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 405] * kernel_shared_1[threadIdx_x // 7 * 144 + 45]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 405] * kernel_shared_1[threadIdx_x // 7 * 144 + 117]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 406] * kernel_shared_1[threadIdx_x // 7 * 144 + 46]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 406] * kernel_shared_1[threadIdx_x // 7 * 144 + 118]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 407] * kernel_shared_1[threadIdx_x // 7 * 144 + 47]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 407] * kernel_shared_1[threadIdx_x // 7 * 144 + 119]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 414] * kernel_shared_1[threadIdx_x // 7 * 144 + 45]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 414] * kernel_shared_1[threadIdx_x // 7 * 144 + 117]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 415] * kernel_shared_1[threadIdx_x // 7 * 144 + 46]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 415] * kernel_shared_1[threadIdx_x // 7 * 144 + 118]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 416] * kernel_shared_1[threadIdx_x // 7 * 144 + 47]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 416] * kernel_shared_1[threadIdx_x // 7 * 144 + 119]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 423] * kernel_shared_1[threadIdx_x // 7 * 144 + 45]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 423] * kernel_shared_1[threadIdx_x // 7 * 144 + 117]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 424] * kernel_shared_1[threadIdx_x // 7 * 144 + 46]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 424] * kernel_shared_1[threadIdx_x // 7 * 144 + 118]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 425] * kernel_shared_1[threadIdx_x // 7 * 144 + 47]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 425] * kernel_shared_1[threadIdx_x // 7 * 144 + 119]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 432] * kernel_shared_1[threadIdx_x // 7 * 144 + 45]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 432] * kernel_shared_1[threadIdx_x // 7 * 144 + 117]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 433] * kernel_shared_1[threadIdx_x // 7 * 144 + 46]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 433] * kernel_shared_1[threadIdx_x // 7 * 144 + 118]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 434] * kernel_shared_1[threadIdx_x // 7 * 144 + 47]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 434] * kernel_shared_1[threadIdx_x // 7 * 144 + 119]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 441] * kernel_shared_1[threadIdx_x // 7 * 144 + 45]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 441] * kernel_shared_1[threadIdx_x // 7 * 144 + 117]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 442] * kernel_shared_1[threadIdx_x // 7 * 144 + 46]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 442] * kernel_shared_1[threadIdx_x // 7 * 144 + 118]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 443] * kernel_shared_1[threadIdx_x // 7 * 144 + 47]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 443] * kernel_shared_1[threadIdx_x // 7 * 144 + 119]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 450] * kernel_shared_1[threadIdx_x // 7 * 144 + 45]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 450] * kernel_shared_1[threadIdx_x // 7 * 144 + 117]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 451] * kernel_shared_1[threadIdx_x // 7 * 144 + 46]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 451] * kernel_shared_1[threadIdx_x // 7 * 144 + 118]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 452] * kernel_shared_1[threadIdx_x // 7 * 144 + 47]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 452] * kernel_shared_1[threadIdx_x // 7 * 144 + 119]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 459] * kernel_shared_1[threadIdx_x // 7 * 144 + 45]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 459] * kernel_shared_1[threadIdx_x // 7 * 144 + 117]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 460] * kernel_shared_1[threadIdx_x // 7 * 144 + 46]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 460] * kernel_shared_1[threadIdx_x // 7 * 144 + 118]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 461] * kernel_shared_1[threadIdx_x // 7 * 144 + 47]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 461] * kernel_shared_1[threadIdx_x // 7 * 144 + 119]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 414] * kernel_shared_1[threadIdx_x // 7 * 144 + 48]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 414] * kernel_shared_1[threadIdx_x // 7 * 144 + 120]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 415] * kernel_shared_1[threadIdx_x // 7 * 144 + 49]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 415] * kernel_shared_1[threadIdx_x // 7 * 144 + 121]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 416] * kernel_shared_1[threadIdx_x // 7 * 144 + 50]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 416] * kernel_shared_1[threadIdx_x // 7 * 144 + 122]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 423] * kernel_shared_1[threadIdx_x // 7 * 144 + 48]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 423] * kernel_shared_1[threadIdx_x // 7 * 144 + 120]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 424] * kernel_shared_1[threadIdx_x // 7 * 144 + 49]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 424] * kernel_shared_1[threadIdx_x // 7 * 144 + 121]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 425] * kernel_shared_1[threadIdx_x // 7 * 144 + 50]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 425] * kernel_shared_1[threadIdx_x // 7 * 144 + 122]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 432] * kernel_shared_1[threadIdx_x // 7 * 144 + 48]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 432] * kernel_shared_1[threadIdx_x // 7 * 144 + 120]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 433] * kernel_shared_1[threadIdx_x // 7 * 144 + 49]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 433] * kernel_shared_1[threadIdx_x // 7 * 144 + 121]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 434] * kernel_shared_1[threadIdx_x // 7 * 144 + 50]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 434] * kernel_shared_1[threadIdx_x // 7 * 144 + 122]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 441] * kernel_shared_1[threadIdx_x // 7 * 144 + 48]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 441] * kernel_shared_1[threadIdx_x // 7 * 144 + 120]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 442] * kernel_shared_1[threadIdx_x // 7 * 144 + 49]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 442] * kernel_shared_1[threadIdx_x // 7 * 144 + 121]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 443] * kernel_shared_1[threadIdx_x // 7 * 144 + 50]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 443] * kernel_shared_1[threadIdx_x // 7 * 144 + 122]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 450] * kernel_shared_1[threadIdx_x // 7 * 144 + 48]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 450] * kernel_shared_1[threadIdx_x // 7 * 144 + 120]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 451] * kernel_shared_1[threadIdx_x // 7 * 144 + 49]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 451] * kernel_shared_1[threadIdx_x // 7 * 144 + 121]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 452] * kernel_shared_1[threadIdx_x // 7 * 144 + 50]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 452] * kernel_shared_1[threadIdx_x // 7 * 144 + 122]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 459] * kernel_shared_1[threadIdx_x // 7 * 144 + 48]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 459] * kernel_shared_1[threadIdx_x // 7 * 144 + 120]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 460] * kernel_shared_1[threadIdx_x // 7 * 144 + 49]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 460] * kernel_shared_1[threadIdx_x // 7 * 144 + 121]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 461] * kernel_shared_1[threadIdx_x // 7 * 144 + 50]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 461] * kernel_shared_1[threadIdx_x // 7 * 144 + 122]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 468] * kernel_shared_1[threadIdx_x // 7 * 144 + 48]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 468] * kernel_shared_1[threadIdx_x // 7 * 144 + 120]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 469] * kernel_shared_1[threadIdx_x // 7 * 144 + 49]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 469] * kernel_shared_1[threadIdx_x // 7 * 144 + 121]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 470] * kernel_shared_1[threadIdx_x // 7 * 144 + 50]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 470] * kernel_shared_1[threadIdx_x // 7 * 144 + 122]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 423] * kernel_shared_1[threadIdx_x // 7 * 144 + 51]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 423] * kernel_shared_1[threadIdx_x // 7 * 144 + 123]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 424] * kernel_shared_1[threadIdx_x // 7 * 144 + 52]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 424] * kernel_shared_1[threadIdx_x // 7 * 144 + 124]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 425] * kernel_shared_1[threadIdx_x // 7 * 144 + 53]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 425] * kernel_shared_1[threadIdx_x // 7 * 144 + 125]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 432] * kernel_shared_1[threadIdx_x // 7 * 144 + 51]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 432] * kernel_shared_1[threadIdx_x // 7 * 144 + 123]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 433] * kernel_shared_1[threadIdx_x // 7 * 144 + 52]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 433] * kernel_shared_1[threadIdx_x // 7 * 144 + 124]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 434] * kernel_shared_1[threadIdx_x // 7 * 144 + 53]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 434] * kernel_shared_1[threadIdx_x // 7 * 144 + 125]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 441] * kernel_shared_1[threadIdx_x // 7 * 144 + 51]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 441] * kernel_shared_1[threadIdx_x // 7 * 144 + 123]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 442] * kernel_shared_1[threadIdx_x // 7 * 144 + 52]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 442] * kernel_shared_1[threadIdx_x // 7 * 144 + 124]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 443] * kernel_shared_1[threadIdx_x // 7 * 144 + 53]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 443] * kernel_shared_1[threadIdx_x // 7 * 144 + 125]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 450] * kernel_shared_1[threadIdx_x // 7 * 144 + 51]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 450] * kernel_shared_1[threadIdx_x // 7 * 144 + 123]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 451] * kernel_shared_1[threadIdx_x // 7 * 144 + 52]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 451] * kernel_shared_1[threadIdx_x // 7 * 144 + 124]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 452] * kernel_shared_1[threadIdx_x // 7 * 144 + 53]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 452] * kernel_shared_1[threadIdx_x // 7 * 144 + 125]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 459] * kernel_shared_1[threadIdx_x // 7 * 144 + 51]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 459] * kernel_shared_1[threadIdx_x // 7 * 144 + 123]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 460] * kernel_shared_1[threadIdx_x // 7 * 144 + 52]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 460] * kernel_shared_1[threadIdx_x // 7 * 144 + 124]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 461] * kernel_shared_1[threadIdx_x // 7 * 144 + 53]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 461] * kernel_shared_1[threadIdx_x // 7 * 144 + 125]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 468] * kernel_shared_1[threadIdx_x // 7 * 144 + 51]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 468] * kernel_shared_1[threadIdx_x // 7 * 144 + 123]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 469] * kernel_shared_1[threadIdx_x // 7 * 144 + 52]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 469] * kernel_shared_1[threadIdx_x // 7 * 144 + 124]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 470] * kernel_shared_1[threadIdx_x // 7 * 144 + 53]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 470] * kernel_shared_1[threadIdx_x // 7 * 144 + 125]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 477] * kernel_shared_1[threadIdx_x // 7 * 144 + 51]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 477] * kernel_shared_1[threadIdx_x // 7 * 144 + 123]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 478] * kernel_shared_1[threadIdx_x // 7 * 144 + 52]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 478] * kernel_shared_1[threadIdx_x // 7 * 144 + 124]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 479] * kernel_shared_1[threadIdx_x // 7 * 144 + 53]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 479] * kernel_shared_1[threadIdx_x // 7 * 144 + 125]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 486] * kernel_shared_1[threadIdx_x // 7 * 144 + 54]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 486] * kernel_shared_1[threadIdx_x // 7 * 144 + 126]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 487] * kernel_shared_1[threadIdx_x // 7 * 144 + 55]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 487] * kernel_shared_1[threadIdx_x // 7 * 144 + 127]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 488] * kernel_shared_1[threadIdx_x // 7 * 144 + 56]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 488] * kernel_shared_1[threadIdx_x // 7 * 144 + 128]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 495] * kernel_shared_1[threadIdx_x // 7 * 144 + 54]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 495] * kernel_shared_1[threadIdx_x // 7 * 144 + 126]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 496] * kernel_shared_1[threadIdx_x // 7 * 144 + 55]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 496] * kernel_shared_1[threadIdx_x // 7 * 144 + 127]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 497] * kernel_shared_1[threadIdx_x // 7 * 144 + 56]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 497] * kernel_shared_1[threadIdx_x // 7 * 144 + 128]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 504] * kernel_shared_1[threadIdx_x // 7 * 144 + 54]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 504] * kernel_shared_1[threadIdx_x // 7 * 144 + 126]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 505] * kernel_shared_1[threadIdx_x // 7 * 144 + 55]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 505] * kernel_shared_1[threadIdx_x // 7 * 144 + 127]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 506] * kernel_shared_1[threadIdx_x // 7 * 144 + 56]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 506] * kernel_shared_1[threadIdx_x // 7 * 144 + 128]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 513] * kernel_shared_1[threadIdx_x // 7 * 144 + 54]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 513] * kernel_shared_1[threadIdx_x // 7 * 144 + 126]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 514] * kernel_shared_1[threadIdx_x // 7 * 144 + 55]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 514] * kernel_shared_1[threadIdx_x // 7 * 144 + 127]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 515] * kernel_shared_1[threadIdx_x // 7 * 144 + 56]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 515] * kernel_shared_1[threadIdx_x // 7 * 144 + 128]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 522] * kernel_shared_1[threadIdx_x // 7 * 144 + 54]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 522] * kernel_shared_1[threadIdx_x // 7 * 144 + 126]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 523] * kernel_shared_1[threadIdx_x // 7 * 144 + 55]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 523] * kernel_shared_1[threadIdx_x // 7 * 144 + 127]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 524] * kernel_shared_1[threadIdx_x // 7 * 144 + 56]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 524] * kernel_shared_1[threadIdx_x // 7 * 144 + 128]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 531] * kernel_shared_1[threadIdx_x // 7 * 144 + 54]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 531] * kernel_shared_1[threadIdx_x // 7 * 144 + 126]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 532] * kernel_shared_1[threadIdx_x // 7 * 144 + 55]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 532] * kernel_shared_1[threadIdx_x // 7 * 144 + 127]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 533] * kernel_shared_1[threadIdx_x // 7 * 144 + 56]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 533] * kernel_shared_1[threadIdx_x // 7 * 144 + 128]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 540] * kernel_shared_1[threadIdx_x // 7 * 144 + 54]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 540] * kernel_shared_1[threadIdx_x // 7 * 144 + 126]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 541] * kernel_shared_1[threadIdx_x // 7 * 144 + 55]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 541] * kernel_shared_1[threadIdx_x // 7 * 144 + 127]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 542] * kernel_shared_1[threadIdx_x // 7 * 144 + 56]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 542] * kernel_shared_1[threadIdx_x // 7 * 144 + 128]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 495] * kernel_shared_1[threadIdx_x // 7 * 144 + 57]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 495] * kernel_shared_1[threadIdx_x // 7 * 144 + 129]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 496] * kernel_shared_1[threadIdx_x // 7 * 144 + 58]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 496] * kernel_shared_1[threadIdx_x // 7 * 144 + 130]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 497] * kernel_shared_1[threadIdx_x // 7 * 144 + 59]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 497] * kernel_shared_1[threadIdx_x // 7 * 144 + 131]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 504] * kernel_shared_1[threadIdx_x // 7 * 144 + 57]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 504] * kernel_shared_1[threadIdx_x // 7 * 144 + 129]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 505] * kernel_shared_1[threadIdx_x // 7 * 144 + 58]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 505] * kernel_shared_1[threadIdx_x // 7 * 144 + 130]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 506] * kernel_shared_1[threadIdx_x // 7 * 144 + 59]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 506] * kernel_shared_1[threadIdx_x // 7 * 144 + 131]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 513] * kernel_shared_1[threadIdx_x // 7 * 144 + 57]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 513] * kernel_shared_1[threadIdx_x // 7 * 144 + 129]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 514] * kernel_shared_1[threadIdx_x // 7 * 144 + 58]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 514] * kernel_shared_1[threadIdx_x // 7 * 144 + 130]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 515] * kernel_shared_1[threadIdx_x // 7 * 144 + 59]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 515] * kernel_shared_1[threadIdx_x // 7 * 144 + 131]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 522] * kernel_shared_1[threadIdx_x // 7 * 144 + 57]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 522] * kernel_shared_1[threadIdx_x // 7 * 144 + 129]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 523] * kernel_shared_1[threadIdx_x // 7 * 144 + 58]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 523] * kernel_shared_1[threadIdx_x // 7 * 144 + 130]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 524] * kernel_shared_1[threadIdx_x // 7 * 144 + 59]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 524] * kernel_shared_1[threadIdx_x // 7 * 144 + 131]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 531] * kernel_shared_1[threadIdx_x // 7 * 144 + 57]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 531] * kernel_shared_1[threadIdx_x // 7 * 144 + 129]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 532] * kernel_shared_1[threadIdx_x // 7 * 144 + 58]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 532] * kernel_shared_1[threadIdx_x // 7 * 144 + 130]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 533] * kernel_shared_1[threadIdx_x // 7 * 144 + 59]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 533] * kernel_shared_1[threadIdx_x // 7 * 144 + 131]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 540] * kernel_shared_1[threadIdx_x // 7 * 144 + 57]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 540] * kernel_shared_1[threadIdx_x // 7 * 144 + 129]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 541] * kernel_shared_1[threadIdx_x // 7 * 144 + 58]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 541] * kernel_shared_1[threadIdx_x // 7 * 144 + 130]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 542] * kernel_shared_1[threadIdx_x // 7 * 144 + 59]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 542] * kernel_shared_1[threadIdx_x // 7 * 144 + 131]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 549] * kernel_shared_1[threadIdx_x // 7 * 144 + 57]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 549] * kernel_shared_1[threadIdx_x // 7 * 144 + 129]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 550] * kernel_shared_1[threadIdx_x // 7 * 144 + 58]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 550] * kernel_shared_1[threadIdx_x // 7 * 144 + 130]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 551] * kernel_shared_1[threadIdx_x // 7 * 144 + 59]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 551] * kernel_shared_1[threadIdx_x // 7 * 144 + 131]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 504] * kernel_shared_1[threadIdx_x // 7 * 144 + 60]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 504] * kernel_shared_1[threadIdx_x // 7 * 144 + 132]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 505] * kernel_shared_1[threadIdx_x // 7 * 144 + 61]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 505] * kernel_shared_1[threadIdx_x // 7 * 144 + 133]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 506] * kernel_shared_1[threadIdx_x // 7 * 144 + 62]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 506] * kernel_shared_1[threadIdx_x // 7 * 144 + 134]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 513] * kernel_shared_1[threadIdx_x // 7 * 144 + 60]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 513] * kernel_shared_1[threadIdx_x // 7 * 144 + 132]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 514] * kernel_shared_1[threadIdx_x // 7 * 144 + 61]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 514] * kernel_shared_1[threadIdx_x // 7 * 144 + 133]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 515] * kernel_shared_1[threadIdx_x // 7 * 144 + 62]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 515] * kernel_shared_1[threadIdx_x // 7 * 144 + 134]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 522] * kernel_shared_1[threadIdx_x // 7 * 144 + 60]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 522] * kernel_shared_1[threadIdx_x // 7 * 144 + 132]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 523] * kernel_shared_1[threadIdx_x // 7 * 144 + 61]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 523] * kernel_shared_1[threadIdx_x // 7 * 144 + 133]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 524] * kernel_shared_1[threadIdx_x // 7 * 144 + 62]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 524] * kernel_shared_1[threadIdx_x // 7 * 144 + 134]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 531] * kernel_shared_1[threadIdx_x // 7 * 144 + 60]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 531] * kernel_shared_1[threadIdx_x // 7 * 144 + 132]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 532] * kernel_shared_1[threadIdx_x // 7 * 144 + 61]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 532] * kernel_shared_1[threadIdx_x // 7 * 144 + 133]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 533] * kernel_shared_1[threadIdx_x // 7 * 144 + 62]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 533] * kernel_shared_1[threadIdx_x // 7 * 144 + 134]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 540] * kernel_shared_1[threadIdx_x // 7 * 144 + 60]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 540] * kernel_shared_1[threadIdx_x // 7 * 144 + 132]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 541] * kernel_shared_1[threadIdx_x // 7 * 144 + 61]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 541] * kernel_shared_1[threadIdx_x // 7 * 144 + 133]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 542] * kernel_shared_1[threadIdx_x // 7 * 144 + 62]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 542] * kernel_shared_1[threadIdx_x // 7 * 144 + 134]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 549] * kernel_shared_1[threadIdx_x // 7 * 144 + 60]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 549] * kernel_shared_1[threadIdx_x // 7 * 144 + 132]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 550] * kernel_shared_1[threadIdx_x // 7 * 144 + 61]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 550] * kernel_shared_1[threadIdx_x // 7 * 144 + 133]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 551] * kernel_shared_1[threadIdx_x // 7 * 144 + 62]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 551] * kernel_shared_1[threadIdx_x // 7 * 144 + 134]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 558] * kernel_shared_1[threadIdx_x // 7 * 144 + 60]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 558] * kernel_shared_1[threadIdx_x // 7 * 144 + 132]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 559] * kernel_shared_1[threadIdx_x // 7 * 144 + 61]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 559] * kernel_shared_1[threadIdx_x // 7 * 144 + 133]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 560] * kernel_shared_1[threadIdx_x // 7 * 144 + 62]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 560] * kernel_shared_1[threadIdx_x // 7 * 144 + 134]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 567] * kernel_shared_1[threadIdx_x // 7 * 144 + 63]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 567] * kernel_shared_1[threadIdx_x // 7 * 144 + 135]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 568] * kernel_shared_1[threadIdx_x // 7 * 144 + 64]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 568] * kernel_shared_1[threadIdx_x // 7 * 144 + 136]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 569] * kernel_shared_1[threadIdx_x // 7 * 144 + 65]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 569] * kernel_shared_1[threadIdx_x // 7 * 144 + 137]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 576] * kernel_shared_1[threadIdx_x // 7 * 144 + 63]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 576] * kernel_shared_1[threadIdx_x // 7 * 144 + 135]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 577] * kernel_shared_1[threadIdx_x // 7 * 144 + 64]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 577] * kernel_shared_1[threadIdx_x // 7 * 144 + 136]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 578] * kernel_shared_1[threadIdx_x // 7 * 144 + 65]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 578] * kernel_shared_1[threadIdx_x // 7 * 144 + 137]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 585] * kernel_shared_1[threadIdx_x // 7 * 144 + 63]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 585] * kernel_shared_1[threadIdx_x // 7 * 144 + 135]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 586] * kernel_shared_1[threadIdx_x // 7 * 144 + 64]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 586] * kernel_shared_1[threadIdx_x // 7 * 144 + 136]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 587] * kernel_shared_1[threadIdx_x // 7 * 144 + 65]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 587] * kernel_shared_1[threadIdx_x // 7 * 144 + 137]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 594] * kernel_shared_1[threadIdx_x // 7 * 144 + 63]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 594] * kernel_shared_1[threadIdx_x // 7 * 144 + 135]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 595] * kernel_shared_1[threadIdx_x // 7 * 144 + 64]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 595] * kernel_shared_1[threadIdx_x // 7 * 144 + 136]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 596] * kernel_shared_1[threadIdx_x // 7 * 144 + 65]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 596] * kernel_shared_1[threadIdx_x // 7 * 144 + 137]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 603] * kernel_shared_1[threadIdx_x // 7 * 144 + 63]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 603] * kernel_shared_1[threadIdx_x // 7 * 144 + 135]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 604] * kernel_shared_1[threadIdx_x // 7 * 144 + 64]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 604] * kernel_shared_1[threadIdx_x // 7 * 144 + 136]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 605] * kernel_shared_1[threadIdx_x // 7 * 144 + 65]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 605] * kernel_shared_1[threadIdx_x // 7 * 144 + 137]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 612] * kernel_shared_1[threadIdx_x // 7 * 144 + 63]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 612] * kernel_shared_1[threadIdx_x // 7 * 144 + 135]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 613] * kernel_shared_1[threadIdx_x // 7 * 144 + 64]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 613] * kernel_shared_1[threadIdx_x // 7 * 144 + 136]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 614] * kernel_shared_1[threadIdx_x // 7 * 144 + 65]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 614] * kernel_shared_1[threadIdx_x // 7 * 144 + 137]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 621] * kernel_shared_1[threadIdx_x // 7 * 144 + 63]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 621] * kernel_shared_1[threadIdx_x // 7 * 144 + 135]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 622] * kernel_shared_1[threadIdx_x // 7 * 144 + 64]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 622] * kernel_shared_1[threadIdx_x // 7 * 144 + 136]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 623] * kernel_shared_1[threadIdx_x // 7 * 144 + 65]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 623] * kernel_shared_1[threadIdx_x // 7 * 144 + 137]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 576] * kernel_shared_1[threadIdx_x // 7 * 144 + 66]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 576] * kernel_shared_1[threadIdx_x // 7 * 144 + 138]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 577] * kernel_shared_1[threadIdx_x // 7 * 144 + 67]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 577] * kernel_shared_1[threadIdx_x // 7 * 144 + 139]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 578] * kernel_shared_1[threadIdx_x // 7 * 144 + 68]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 578] * kernel_shared_1[threadIdx_x // 7 * 144 + 140]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 585] * kernel_shared_1[threadIdx_x // 7 * 144 + 66]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 585] * kernel_shared_1[threadIdx_x // 7 * 144 + 138]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 586] * kernel_shared_1[threadIdx_x // 7 * 144 + 67]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 586] * kernel_shared_1[threadIdx_x // 7 * 144 + 139]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 587] * kernel_shared_1[threadIdx_x // 7 * 144 + 68]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 587] * kernel_shared_1[threadIdx_x // 7 * 144 + 140]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 594] * kernel_shared_1[threadIdx_x // 7 * 144 + 66]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 594] * kernel_shared_1[threadIdx_x // 7 * 144 + 138]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 595] * kernel_shared_1[threadIdx_x // 7 * 144 + 67]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 595] * kernel_shared_1[threadIdx_x // 7 * 144 + 139]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 596] * kernel_shared_1[threadIdx_x // 7 * 144 + 68]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 596] * kernel_shared_1[threadIdx_x // 7 * 144 + 140]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 603] * kernel_shared_1[threadIdx_x // 7 * 144 + 66]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 603] * kernel_shared_1[threadIdx_x // 7 * 144 + 138]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 604] * kernel_shared_1[threadIdx_x // 7 * 144 + 67]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 604] * kernel_shared_1[threadIdx_x // 7 * 144 + 139]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 605] * kernel_shared_1[threadIdx_x // 7 * 144 + 68]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 605] * kernel_shared_1[threadIdx_x // 7 * 144 + 140]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 612] * kernel_shared_1[threadIdx_x // 7 * 144 + 66]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 612] * kernel_shared_1[threadIdx_x // 7 * 144 + 138]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 613] * kernel_shared_1[threadIdx_x // 7 * 144 + 67]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 613] * kernel_shared_1[threadIdx_x // 7 * 144 + 139]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 614] * kernel_shared_1[threadIdx_x // 7 * 144 + 68]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 614] * kernel_shared_1[threadIdx_x // 7 * 144 + 140]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 621] * kernel_shared_1[threadIdx_x // 7 * 144 + 66]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 621] * kernel_shared_1[threadIdx_x // 7 * 144 + 138]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 622] * kernel_shared_1[threadIdx_x // 7 * 144 + 67]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 622] * kernel_shared_1[threadIdx_x // 7 * 144 + 139]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 623] * kernel_shared_1[threadIdx_x // 7 * 144 + 68]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 623] * kernel_shared_1[threadIdx_x // 7 * 144 + 140]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 630] * kernel_shared_1[threadIdx_x // 7 * 144 + 66]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 630] * kernel_shared_1[threadIdx_x // 7 * 144 + 138]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 631] * kernel_shared_1[threadIdx_x // 7 * 144 + 67]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 631] * kernel_shared_1[threadIdx_x // 7 * 144 + 139]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 632] * kernel_shared_1[threadIdx_x // 7 * 144 + 68]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 632] * kernel_shared_1[threadIdx_x // 7 * 144 + 140]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 585] * kernel_shared_1[threadIdx_x // 7 * 144 + 69]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 585] * kernel_shared_1[threadIdx_x // 7 * 144 + 141]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 586] * kernel_shared_1[threadIdx_x // 7 * 144 + 70]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 586] * kernel_shared_1[threadIdx_x // 7 * 144 + 142]
+                conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[threadIdx_x % 7 + 587] * kernel_shared_1[threadIdx_x // 7 * 144 + 71]
+                conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[threadIdx_x % 7 + 587] * kernel_shared_1[threadIdx_x // 7 * 144 + 143]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 594] * kernel_shared_1[threadIdx_x // 7 * 144 + 69]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 594] * kernel_shared_1[threadIdx_x // 7 * 144 + 141]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 595] * kernel_shared_1[threadIdx_x // 7 * 144 + 70]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 595] * kernel_shared_1[threadIdx_x // 7 * 144 + 142]
+                conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[threadIdx_x % 7 + 596] * kernel_shared_1[threadIdx_x // 7 * 144 + 71]
+                conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[threadIdx_x % 7 + 596] * kernel_shared_1[threadIdx_x // 7 * 144 + 143]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 603] * kernel_shared_1[threadIdx_x // 7 * 144 + 69]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 603] * kernel_shared_1[threadIdx_x // 7 * 144 + 141]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 604] * kernel_shared_1[threadIdx_x // 7 * 144 + 70]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 604] * kernel_shared_1[threadIdx_x // 7 * 144 + 142]
+                conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[threadIdx_x % 7 + 605] * kernel_shared_1[threadIdx_x // 7 * 144 + 71]
+                conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[threadIdx_x % 7 + 605] * kernel_shared_1[threadIdx_x // 7 * 144 + 143]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 612] * kernel_shared_1[threadIdx_x // 7 * 144 + 69]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 612] * kernel_shared_1[threadIdx_x // 7 * 144 + 141]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 613] * kernel_shared_1[threadIdx_x // 7 * 144 + 70]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 613] * kernel_shared_1[threadIdx_x // 7 * 144 + 142]
+                conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[threadIdx_x % 7 + 614] * kernel_shared_1[threadIdx_x // 7 * 144 + 71]
+                conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[threadIdx_x % 7 + 614] * kernel_shared_1[threadIdx_x // 7 * 144 + 143]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 621] * kernel_shared_1[threadIdx_x // 7 * 144 + 69]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 621] * kernel_shared_1[threadIdx_x // 7 * 144 + 141]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 622] * kernel_shared_1[threadIdx_x // 7 * 144 + 70]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 622] * kernel_shared_1[threadIdx_x // 7 * 144 + 142]
+                conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[threadIdx_x % 7 + 623] * kernel_shared_1[threadIdx_x // 7 * 144 + 71]
+                conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[threadIdx_x % 7 + 623] * kernel_shared_1[threadIdx_x // 7 * 144 + 143]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 630] * kernel_shared_1[threadIdx_x // 7 * 144 + 69]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 630] * kernel_shared_1[threadIdx_x // 7 * 144 + 141]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 631] * kernel_shared_1[threadIdx_x // 7 * 144 + 70]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 631] * kernel_shared_1[threadIdx_x // 7 * 144 + 142]
+                conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[threadIdx_x % 7 + 632] * kernel_shared_1[threadIdx_x // 7 * 144 + 71]
+                conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[threadIdx_x % 7 + 632] * kernel_shared_1[threadIdx_x // 7 * 144 + 143]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 639] * kernel_shared_1[threadIdx_x // 7 * 144 + 69]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 639] * kernel_shared_1[threadIdx_x // 7 * 144 + 141]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 640] * kernel_shared_1[threadIdx_x // 7 * 144 + 70]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 640] * kernel_shared_1[threadIdx_x // 7 * 144 + 142]
+                conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[threadIdx_x % 7 + 641] * kernel_shared_1[threadIdx_x // 7 * 144 + 71]
+                conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[threadIdx_x % 7 + 641] * kernel_shared_1[threadIdx_x // 7 * 144 + 143]
+            for i1_inner, i2_inner in T.grid(2, 7):
+                compute_1 = T.Buffer((25088,), data=compute.data)
+                bias_1 = T.Buffer((512,), data=bias.data)
+                compute_1[blockIdx_x * 784 + threadIdx_x // 7 * 98 + i1_inner * 49 + i2_inner * 7 + threadIdx_x % 7] = T.max(conv2d_nchw_1[i1_inner * 7 + i2_inner] + bias_1[blockIdx_x * 16 + threadIdx_x // 7 * 2 + i1_inner], T.float32(0))
 
 
 
@@ -1074,7 +1406,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 0.390 ms
+    Execution time of this operator: 0.163 ms
 
 
 
@@ -1122,36 +1454,36 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     conv2d_nchw_nn_o_o_i, conv2d_nchw_nn_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_i, factor=1)
     conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
     conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
-    conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
+    conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=2)
     conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
-    conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=16)
-    conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=8)
+    conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=8)
+    conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
     conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
-    conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
-    conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
+    conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=7)
+    conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
     conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
     conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
     conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
-    conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
+    conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
     conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
-    conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=4)
-    conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=2)
-    conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=3)
-    conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
-    conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
-    conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=3)
+    conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=1)
+    conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=8)
+    conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
+    conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=3)
+    conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=3)
+    conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=1)
     s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2 [...]
     compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
     compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
     compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
-    compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=1)
-    compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=16)
-    compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=8)
-    compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
-    compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
+    compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
+    compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=8)
+    compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
+    compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=7)
+    compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
     compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
     compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
-    compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
+    compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
     compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
     s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
     s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -1171,12 +1503,12 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
     kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
     kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
     s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=112)
+    kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
     s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
     pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
-    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
+    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=27)
     s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=112)
+    pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
     s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
     s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 1024)
     s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
@@ -1196,695 +1528,1093 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
       #define int64_t long long
       #define uint64_t unsigned long long
     #endif
-    extern "C" __global__ void __launch_bounds__(112) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
-      float conv2d_nchw[8];
-      __shared__ float pad_temp_shared[216];
-      __shared__ float kernel_shared[9216];
+    extern "C" __global__ void __launch_bounds__(56) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+      float conv2d_nchw[14];
+      __shared__ float pad_temp_shared[648];
+      __shared__ float kernel_shared[1152];
       conv2d_nchw[0] = 0.000000e+00f;
+      conv2d_nchw[7] = 0.000000e+00f;
       conv2d_nchw[1] = 0.000000e+00f;
+      conv2d_nchw[8] = 0.000000e+00f;
       conv2d_nchw[2] = 0.000000e+00f;
+      conv2d_nchw[9] = 0.000000e+00f;
       conv2d_nchw[3] = 0.000000e+00f;
+      conv2d_nchw[10] = 0.000000e+00f;
       conv2d_nchw[4] = 0.000000e+00f;
+      conv2d_nchw[11] = 0.000000e+00f;
       conv2d_nchw[5] = 0.000000e+00f;
+      conv2d_nchw[12] = 0.000000e+00f;
       conv2d_nchw[6] = 0.000000e+00f;
-      conv2d_nchw[7] = 0.000000e+00f;
+      conv2d_nchw[13] = 0.000000e+00f;
       for (int rc_outer_outer = 0; rc_outer_outer < 64; ++rc_outer_outer) {
         __syncthreads();
-        pad_temp_shared[((int)threadIdx.x)] = (((((3 <= (((int)threadIdx.x) % 27)) && ((((int)threadIdx.x) % 27) < 24)) && (1 <= ((((int)blockIdx.x) % 7) + (((int)threadIdx.x) % 3)))) && (((((int)blockIdx.x) % 7) + (((int)threadIdx.x) % 3)) < 8)) ? data[((((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 27) * 49)) + (((((int)threadIdx.x) % 27) / 3) * 7)) + (((int)blockIdx.x) % 7)) + (((int)threadIdx.x) % 3)) - 8)] : 0.000000e+00f);
-        if (((int)threadIdx.x) < 104) {
-          pad_temp_shared[(((int)threadIdx.x) + 112)] = (((((3 <= ((((int)threadIdx.x) + 4) % 27)) && (((((int)threadIdx.x) + 4) % 27) < 24)) && (1 <= ((((int)blockIdx.x) % 7) + ((((int)threadIdx.x) + 1) % 3)))) && (((((int)blockIdx.x) % 7) + ((((int)threadIdx.x) + 1) % 3)) < 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) + 112) / 27) * 49)) + ((((((int)threadIdx.x) + 4) % 27) / 3) * 7)) + (((int)blockIdx.x) % 7)) + ((((int)threadIdx.x) + 1) % 3)) - 8)] : 0.000000e+00f);
+        if (((int)threadIdx.x) < 24) {
+          pad_temp_shared[(((int)threadIdx.x) * 27)] = 0.000000e+00f;
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 1)] = ((1 <= (((int)threadIdx.x) % 3)) ? data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) - 7)] : 0.000000e+00f);
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 2)] = ((1 <= (((int)threadIdx.x) % 3)) ? data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) - 6)] : 0.000000e+00f);
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 3)] = ((1 <= (((int)threadIdx.x) % 3)) ? data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) - 5)] : 0.000000e+00f);
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 4)] = ((1 <= (((int)threadIdx.x) % 3)) ? data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) - 4)] : 0.000000e+00f);
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 5)] = ((1 <= (((int)threadIdx.x) % 3)) ? data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) - 3)] : 0.000000e+00f);
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 6)] = ((1 <= (((int)threadIdx.x) % 3)) ? data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) - 2)] : 0.000000e+00f);
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 7)] = ((1 <= (((int)threadIdx.x) % 3)) ? data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) - 1)] : 0.000000e+00f);
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 8)] = 0.000000e+00f;
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 9)] = 0.000000e+00f;
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 10)] = data[(((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21))];
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 11)] = data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) + 1)];
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 12)] = data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) + 2)];
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 13)] = data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) + 3)];
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 14)] = data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) + 4)];
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 15)] = data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) + 5)];
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 16)] = data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) + 6)];
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 17)] = 0.000000e+00f;
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 18)] = 0.000000e+00f;
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 19)] = (((((int)threadIdx.x) % 3) < 2) ? data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) + 7)] : 0.000000e+00f);
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 20)] = (((((int)threadIdx.x) % 3) < 2) ? data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) + 8)] : 0.000000e+00f);
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 21)] = (((((int)threadIdx.x) % 3) < 2) ? data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) + 9)] : 0.000000e+00f);
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 22)] = (((((int)threadIdx.x) % 3) < 2) ? data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) + 10)] : 0.000000e+00f);
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 23)] = (((((int)threadIdx.x) % 3) < 2) ? data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) + 11)] : 0.000000e+00f);
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 24)] = (((((int)threadIdx.x) % 3) < 2) ? data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) + 12)] : 0.000000e+00f);
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 25)] = (((((int)threadIdx.x) % 3) < 2) ? data[((((rc_outer_outer * 392) + ((((int)threadIdx.x) / 3) * 49)) + ((((int)threadIdx.x) % 3) * 21)) + 13)] : 0.000000e+00f);
+          pad_temp_shared[((((int)threadIdx.x) * 27) + 26)] = 0.000000e+00f;
         }
-        kernel_shared[((int)threadIdx.x)] = kernel[(((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 72) * 4608)) + (rc_outer_outer * 72)) + (((int)threadIdx.x) % 72))];
-        kernel_shared[(((int)threadIdx.x) + 112)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 112) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 40) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 4) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 224)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 224) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 336)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 336) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 48) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 1) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 7) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 560)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 560) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 56) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 2) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 672)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 672) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 24) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 2) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 784)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 784) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 64) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 1) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 32) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 5) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 1008)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 72) * 4608)) + (rc_outer_outer * 72)) + (((int)threadIdx.x) % 72)) + 64512)];
-        kernel_shared[(((int)threadIdx.x) + 1120)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1120) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 40) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 4) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 1232)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1232) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1344) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 48) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 1) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 1456)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1456) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 7) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 1568)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1568) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 56) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 2) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 1680)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1680) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 24) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 2) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 64) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 1) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 1904)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1904) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 32) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 5) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 2016)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 72) * 4608)) + (rc_outer_outer * 72)) + (((int)threadIdx.x) % 72)) + 129024)];
-        kernel_shared[(((int)threadIdx.x) + 2128)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2128) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 40) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 4) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 2352)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2352) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 48) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 1) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 2464)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2464) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 7) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 2576)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2576) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 56) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 2) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2688) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 24) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 2) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 2800)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2800) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 64) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 1) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 2912)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2912) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 32) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 5) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 3024)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 72) * 4608)) + (rc_outer_outer * 72)) + (((int)threadIdx.x) % 72)) + 193536)];
-        kernel_shared[(((int)threadIdx.x) + 3136)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3136) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 40) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 4) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 3248)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3248) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 3360)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3360) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 48) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 1) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 3472)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3472) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 7) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 3584)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3584) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 56) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 2) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 3696)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3696) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 24) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 2) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 3808)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3808) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 64) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 1) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 3920)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3920) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 32) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 5) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 4032)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 72) * 4608)) + (rc_outer_outer * 72)) + (((int)threadIdx.x) % 72)) + 258048)];
-        kernel_shared[(((int)threadIdx.x) + 4144)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 4144) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 40) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 4) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 4256)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 4256) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 4368)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 4368) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 48) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 1) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 4480)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 4480) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 7) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 4592)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 4592) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 56) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 2) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 4704)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 4704) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 24) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 2) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 4816)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 4816) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 64) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 1) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 4928)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 4928) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 32) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 5) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 5040)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 72) * 4608)) + (rc_outer_outer * 72)) + (((int)threadIdx.x) % 72)) + 322560)];
-        kernel_shared[(((int)threadIdx.x) + 5152)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 5152) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 40) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 4) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 5264)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 5264) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 5376)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 5376) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 48) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 1) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 5488)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 5488) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 7) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 5600)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 5600) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 56) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 2) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 5712)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 5712) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 24) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 2) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 5824)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 5824) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 64) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 1) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 5936)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 5936) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 32) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 5) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 6048)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 72) * 4608)) + (rc_outer_outer * 72)) + (((int)threadIdx.x) % 72)) + 387072)];
-        kernel_shared[(((int)threadIdx.x) + 6160)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 6160) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 40) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 4) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 6272)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 6272) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 6384)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 6384) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 48) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 1) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 6496)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 6496) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 7) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 6608)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 6608) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 56) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 2) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 6720)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 6720) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 24) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 2) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 6832)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 6832) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 64) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 1) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 6944)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 6944) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 32) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 5) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 7056)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 72) * 4608)) + (rc_outer_outer * 72)) + (((int)threadIdx.x) % 72)) + 451584)];
-        kernel_shared[(((int)threadIdx.x) + 7168)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 7168) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 40) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 4) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 7280)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 7280) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 7392)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 7392) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 48) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 1) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 7504)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 7504) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 7) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 7616)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 7616) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 56) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 2) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 7728)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 7728) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 24) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 2) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 7840)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 7840) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 64) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 1) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 7952)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 7952) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 32) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 5) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 8064)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 72) * 4608)) + (rc_outer_outer * 72)) + (((int)threadIdx.x) % 72)) + 516096)];
-        kernel_shared[(((int)threadIdx.x) + 8176)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 8176) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 40) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 4) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 8288)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 8288) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 8400)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 8400) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 48) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 1) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 8512)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 8512) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 7) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 8624)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 8624) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 56) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 2) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 8736)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 8736) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 24) % 72) / 9) * 9)) + ((((((int)threadIdx.x) / 3) + 2) % 3) * 3)) + (((int)threadIdx.x) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 8848)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 8848) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 64) % 72) / 9) * 9)) + ((((int)threadIdx.x) + 1) % 9))];
-        kernel_shared[(((int)threadIdx.x) + 8960)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 8960) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 32) % 72) / 9) * 9)) + ((((((int)threadIdx.x) + 5) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
-        kernel_shared[(((int)threadIdx.x) + 9072)] = kernel[((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 72) * 4608)) + (rc_outer_outer * 72)) + (((int)threadIdx.x) % 72)) + 580608)];
+        kernel_shared[((int)threadIdx.x)] = kernel[(((((int)blockIdx.x) * 73728) + (rc_outer_outer * 72)) + ((int)threadIdx.x))];
+        kernel_shared[(((int)threadIdx.x) + 56)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 56) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 56) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+        kernel_shared[(((int)threadIdx.x) + 112)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 112) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 40) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+        kernel_shared[(((int)threadIdx.x) + 168)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 168) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) / 3) + 8) % 24) * 3)) + (((int)threadIdx.x) % 3))];
+        kernel_shared[(((int)threadIdx.x) + 224)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 224) / 72) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) + 8) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+        kernel_shared[(((int)threadIdx.x) + 280)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 280) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 64) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+        kernel_shared[(((int)threadIdx.x) + 336)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 336) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) / 3) + 16) % 24) * 3)) + (((int)threadIdx.x) % 3))];
+        kernel_shared[(((int)threadIdx.x) + 392)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 392) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 32) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+        kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 448) / 72) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) + 16) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+        kernel_shared[(((int)threadIdx.x) + 504)] = kernel[((((((int)blockIdx.x) * 73728) + (rc_outer_outer * 72)) + ((int)threadIdx.x)) + 32256)];
+        kernel_shared[(((int)threadIdx.x) + 560)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 560) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 56) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+        kernel_shared[(((int)threadIdx.x) + 616)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 616) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 40) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+        kernel_shared[(((int)threadIdx.x) + 672)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 672) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) / 3) + 8) % 24) * 3)) + (((int)threadIdx.x) % 3))];
+        kernel_shared[(((int)threadIdx.x) + 728)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 728) / 72) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) + 8) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+        kernel_shared[(((int)threadIdx.x) + 784)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 784) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 64) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+        kernel_shared[(((int)threadIdx.x) + 840)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 840) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) / 3) + 16) % 24) * 3)) + (((int)threadIdx.x) % 3))];
+        kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 896) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 32) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
+        kernel_shared[(((int)threadIdx.x) + 952)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 952) / 72) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) + 16) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+        kernel_shared[(((int)threadIdx.x) + 1008)] = kernel[((((((int)blockIdx.x) * 73728) + (rc_outer_outer * 72)) + ((int)threadIdx.x)) + 64512)];
+        kernel_shared[(((int)threadIdx.x) + 1064)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 1064) / 72) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 56) % 72) / 3) * 3)) + ((((int)threadIdx.x) + 2) % 3))];
         if (((int)threadIdx.x) < 32) {
-          kernel_shared[(((int)threadIdx.x) + 9184)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 9184) / 72) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) + 40) / 9) * 9)) + ((((((int)threadIdx.x) + 4) % 9) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
+          kernel_shared[(((int)threadIdx.x) + 1120)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 1120) / 72) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) + 40) / 3) * 3)) + ((((int)threadIdx.x) + 1) % 3))];
         }
         __syncthreads();
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) * 3)] * kernel_shared[((((int)threadIdx.x) / 7) * 72)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) * 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1152)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) * 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2304)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) * 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3456)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) * 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4608)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) * 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5760)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) * 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6912)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) * 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8064)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1155)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2307)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3459)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4611)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5763)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6915)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8067)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1158)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2310)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3462)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4614)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5766)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6918)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8070)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 27)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 9)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 27)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1161)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 27)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2313)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 27)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3465)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 27)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4617)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 27)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5769)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 27)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6921)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 27)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8073)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 30)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 12)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 30)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1164)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 30)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2316)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 30)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3468)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 30)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4620)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 30)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5772)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 30)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6924)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 30)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8076)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 33)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 15)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 33)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1167)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 33)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2319)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 33)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3471)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 33)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4623)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 33)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5775)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 33)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6927)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 33)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8079)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 54)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 18)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 54)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1170)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 54)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2322)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 54)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3474)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 54)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4626)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 54)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5778)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 54)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6930)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 54)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8082)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 57)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 21)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 57)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1173)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 57)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2325)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 57)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3477)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 57)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4629)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 57)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5781)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 57)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6933)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 57)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8085)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 60)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 24)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 60)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1176)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 60)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2328)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 60)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3480)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 60)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4632)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 60)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5784)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 60)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6936)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 60)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8088)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 81)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 27)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 81)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1179)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 81)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2331)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 81)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3483)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 81)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4635)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 81)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5787)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 81)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6939)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 81)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8091)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 84)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 30)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 84)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1182)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 84)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2334)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 84)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3486)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 84)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4638)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 84)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5790)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 84)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6942)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 84)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8094)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 87)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 33)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 87)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1185)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 87)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2337)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 87)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3489)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 87)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4641)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 87)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5793)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 87)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6945)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 87)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8097)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1153)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2305)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3457)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4609)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5761)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6913)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8065)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1156)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2308)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3460)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4612)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5764)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6916)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8068)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 7)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 7)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 7)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1159)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 7)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2311)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 7)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3463)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 7)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4615)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 7)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5767)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 7)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6919)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 7)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8071)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 10)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1162)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2314)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3466)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4618)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5770)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6922)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8074)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 31)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 13)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 31)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1165)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 31)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2317)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 31)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3469)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 31)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4621)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 31)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5773)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 31)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6925)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 31)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8077)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 34)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 16)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 34)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1168)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 34)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2320)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 34)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3472)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 34)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4624)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 34)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5776)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 34)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6928)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 34)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8080)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 55)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 19)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 55)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1171)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 55)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2323)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 55)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3475)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 55)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4627)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 55)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5779)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 55)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6931)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 55)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8083)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 58)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 22)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 58)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1174)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 58)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2326)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 58)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3478)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 58)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4630)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 58)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5782)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 58)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6934)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 58)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8086)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 61)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 25)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 61)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1177)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 61)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2329)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 61)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3481)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 61)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4633)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 61)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5785)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 61)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6937)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 61)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8089)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 82)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 28)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 82)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1180)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 82)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2332)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 82)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3484)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 82)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4636)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 82)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5788)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 82)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6940)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 82)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8092)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 85)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 31)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 85)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1183)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 85)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2335)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 85)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3487)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 85)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4639)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 85)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5791)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 85)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6943)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 85)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8095)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 88)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 34)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 88)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1186)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 88)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2338)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 88)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3490)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 88)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4642)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 88)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5794)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 88)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6946)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 88)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8098)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1154)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2306)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3458)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4610)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5762)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6914)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8066)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1157)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2309)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3461)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4613)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5765)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6917)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8069)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 8)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 8)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1160)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 8)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2312)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 8)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3464)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 8)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4616)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 8)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5768)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 8)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6920)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 8)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8072)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 29)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 11)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 29)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1163)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 29)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2315)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 29)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3467)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 29)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4619)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 29)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5771)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 29)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6923)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 29)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8075)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 32)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 14)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 32)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1166)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 32)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2318)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 32)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3470)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 32)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4622)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 32)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5774)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 32)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6926)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 32)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8078)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 35)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 17)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 35)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1169)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 35)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2321)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 35)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3473)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 35)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4625)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 35)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5777)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 35)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6929)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 35)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8081)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 56)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 20)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 56)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1172)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 56)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2324)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 56)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3476)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 56)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4628)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 56)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5780)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 56)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6932)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 56)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8084)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 59)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 23)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 59)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1175)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 59)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2327)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 59)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3479)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 59)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4631)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 59)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5783)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 59)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6935)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 59)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8087)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 62)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 26)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 62)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1178)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 62)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2330)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 62)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3482)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 62)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4634)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 62)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5786)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 62)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6938)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 62)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8090)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 83)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 29)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 83)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1181)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 83)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2333)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 83)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3485)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 83)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4637)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 83)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5789)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 83)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6941)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 83)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8093)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 86)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 32)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 86)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1184)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 86)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2336)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 86)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3488)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 86)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4640)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 86)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5792)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 86)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6944)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 86)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8096)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 89)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 35)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 89)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1187)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 89)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2339)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 89)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3491)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 89)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4643)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 89)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5795)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 89)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6947)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 89)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8099)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 108)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 36)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 108)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1188)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 108)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2340)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 108)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3492)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 108)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4644)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 108)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5796)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 108)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6948)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 108)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8100)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 111)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 39)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 111)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1191)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 111)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2343)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 111)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3495)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 111)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4647)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 111)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5799)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 111)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6951)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 111)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8103)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 114)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 42)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 114)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1194)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 114)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2346)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 114)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3498)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 114)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4650)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 114)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5802)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 114)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6954)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 114)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8106)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 45)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1197)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2349)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3501)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4653)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5805)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6957)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8109)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 138)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 48)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 138)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1200)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 138)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2352)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 138)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3504)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 138)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4656)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 138)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5808)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 138)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6960)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 138)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8112)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 141)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 51)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 141)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1203)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 141)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2355)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 141)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3507)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 141)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4659)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 141)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5811)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 141)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6963)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 141)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8115)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 162)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 54)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 162)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1206)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 162)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2358)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 162)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3510)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 162)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4662)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 162)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5814)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 162)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6966)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 162)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8118)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 165)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 57)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 165)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1209)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 165)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2361)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 165)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3513)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 165)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4665)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 165)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5817)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 165)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6969)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 165)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8121)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 168)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 60)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 168)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1212)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 168)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2364)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 168)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3516)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 168)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4668)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 168)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5820)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 168)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6972)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 168)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8124)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 189)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 63)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 189)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1215)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 189)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2367)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 189)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3519)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 189)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4671)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 189)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5823)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 189)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6975)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 189)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8127)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 192)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 66)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 192)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1218)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 192)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2370)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 192)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3522)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 192)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4674)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 192)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5826)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 192)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6978)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 192)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8130)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 195)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 69)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 195)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1221)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 195)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2373)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 195)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3525)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 195)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4677)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 195)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5829)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 195)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6981)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 195)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8133)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 109)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 37)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 109)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1189)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 109)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2341)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 109)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3493)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 109)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4645)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 109)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5797)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 109)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6949)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 109)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8101)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 112)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 40)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 112)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1192)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 112)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2344)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 112)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3496)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 112)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4648)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 112)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5800)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 112)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6952)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 112)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8104)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 115)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 43)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 115)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1195)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 115)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2347)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 115)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3499)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 115)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4651)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 115)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5803)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 115)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6955)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 115)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8107)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 46)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1198)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2350)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3502)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4654)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5806)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6958)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8110)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 139)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 49)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 139)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1201)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 139)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2353)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 139)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3505)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 139)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4657)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 139)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5809)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 139)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6961)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 139)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8113)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 142)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 52)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 142)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1204)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 142)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2356)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 142)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3508)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 142)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4660)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 142)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5812)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 142)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6964)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 142)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8116)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 163)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 55)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 163)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1207)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 163)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2359)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 163)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3511)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 163)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4663)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 163)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5815)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 163)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6967)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 163)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8119)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 166)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 58)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 166)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1210)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 166)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2362)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 166)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3514)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 166)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4666)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 166)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5818)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 166)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6970)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 166)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8122)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 169)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 61)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 169)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1213)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 169)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2365)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 169)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3517)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 169)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4669)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 169)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5821)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 169)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6973)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 169)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8125)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 190)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 64)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 190)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1216)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 190)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2368)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 190)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3520)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 190)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4672)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 190)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5824)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 190)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6976)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 190)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8128)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 193)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 67)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 193)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1219)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 193)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2371)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 193)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3523)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 193)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4675)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 193)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5827)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 193)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6979)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 193)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8131)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 196)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 70)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 196)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1222)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 196)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2374)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 196)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3526)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 196)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4678)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 196)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5830)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 196)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6982)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 196)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8134)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 110)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 38)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 110)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1190)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 110)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2342)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 110)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3494)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 110)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4646)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 110)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5798)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 110)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6950)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 110)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8102)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 113)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 41)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 113)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1193)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 113)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2345)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 113)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3497)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 113)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4649)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 113)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5801)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 113)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6953)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 113)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8105)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 116)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 44)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 116)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1196)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 116)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2348)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 116)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3500)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 116)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4652)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 116)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5804)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 116)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6956)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 116)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8108)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 47)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1199)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2351)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3503)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4655)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5807)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6959)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8111)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 140)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 50)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 140)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1202)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 140)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2354)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 140)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3506)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 140)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4658)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 140)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5810)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 140)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6962)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 140)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8114)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 143)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 53)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 143)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1205)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 143)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2357)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 143)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3509)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 143)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4661)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 143)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5813)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 143)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6965)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 143)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8117)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 164)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 56)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 164)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1208)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 164)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2360)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 164)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3512)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 164)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4664)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 164)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5816)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 164)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6968)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 164)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8120)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 167)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 59)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 167)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1211)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 167)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2363)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 167)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3515)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 167)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4667)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 167)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5819)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 167)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6971)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 167)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8123)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 170)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 62)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 170)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1214)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 170)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2366)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 170)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3518)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 170)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4670)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 170)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5822)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 170)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6974)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 170)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8126)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 191)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 65)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 191)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1217)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 191)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2369)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 191)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3521)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 191)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4673)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 191)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5825)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 191)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6977)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 191)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8129)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 194)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 68)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 194)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1220)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 194)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2372)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 194)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3524)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 194)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4676)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 194)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5828)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 194)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6980)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 194)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8132)]));
-        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 197)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 71)]));
-        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 197)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 1223)]));
-        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 197)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 2375)]));
-        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 197)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 3527)]));
-        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 197)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 4679)]));
-        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 197)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 5831)]));
-        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 197)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 6983)]));
-        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((((int)threadIdx.x) % 7) * 3) + 197)] * kernel_shared[(((((int)threadIdx.x) / 7) * 72) + 8135)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) % 7)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((int)threadIdx.x) % 7)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 9)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 9)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 10)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 10)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 11)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 11)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 18)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 18)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 19)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 19)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 20)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 20)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 27)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 27)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 29)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 29)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 36)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 36)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 37)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 37)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 38)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 38)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 45)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 45)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 46)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 46)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 47)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 47)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 54)] * kernel_shared[((((int)threadIdx.x) / 7) * 144)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 54)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 72)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 55)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 1)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 55)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 73)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 56)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 2)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 56)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 74)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 9)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 9)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 10)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 10)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 11)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 11)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 18)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 18)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 19)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 19)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 20)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 20)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 27)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 27)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 29)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 29)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 36)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 36)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 37)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 37)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 38)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 38)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 45)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 45)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 46)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 46)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 47)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 47)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 54)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 54)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 55)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 55)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 56)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 56)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 63)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 3)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 63)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 75)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 64)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 4)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 64)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 76)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 65)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 5)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 65)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 77)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 18)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 18)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 19)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 19)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 20)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 20)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 27)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 27)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 29)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 29)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 36)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 36)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 37)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 37)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 38)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 38)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 45)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 45)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 46)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 46)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 47)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 47)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 54)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 54)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 55)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 55)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 56)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 56)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 63)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 63)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 64)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 64)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 65)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 65)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 72)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 6)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 72)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 78)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 73)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 7)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 73)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 79)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 74)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 8)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 74)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 80)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 81)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 81)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 82)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 82)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 83)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 83)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 90)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 90)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 91)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 91)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 92)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 92)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 99)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 99)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 100)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 100)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 101)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 101)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 108)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 108)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 109)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 109)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 110)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 110)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 117)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 117)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 118)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 118)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 119)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 119)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 126)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 126)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 127)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 127)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 128)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 128)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 9)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 81)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 10)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 82)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 11)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 83)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 90)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 90)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 91)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 91)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 92)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 92)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 99)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 99)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 100)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 100)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 101)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 101)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 108)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 108)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 109)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 109)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 110)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 110)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 117)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 117)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 118)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 118)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 119)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 119)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 126)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 126)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 127)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 127)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 128)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 128)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 144)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 12)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 144)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 84)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 145)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 13)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 145)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 85)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 146)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 14)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 146)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 86)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 99)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 99)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 100)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 100)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 101)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 101)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 108)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 108)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 109)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 109)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 110)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 110)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 117)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 117)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 118)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 118)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 119)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 119)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 126)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 126)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 127)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 127)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 128)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 128)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 135)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 136)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 137)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 144)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 144)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 145)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 145)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 146)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 146)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 153)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 15)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 153)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 87)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 154)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 16)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 154)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 88)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 155)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 17)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 155)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 89)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 162)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 162)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 163)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 163)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 164)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 164)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 171)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 171)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 172)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 172)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 173)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 173)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 180)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 180)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 181)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 181)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 182)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 182)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 189)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 189)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 190)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 190)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 191)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 191)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 198)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 198)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 199)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 199)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 200)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 200)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 207)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 207)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 208)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 208)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 209)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 209)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 216)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 18)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 216)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 90)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 217)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 19)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 217)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 91)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 218)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 20)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 218)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 92)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 171)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 171)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 172)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 172)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 173)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 173)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 180)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 180)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 181)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 181)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 182)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 182)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 189)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 189)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 190)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 190)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 191)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 191)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 198)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 198)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 199)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 199)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 200)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 200)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 207)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 207)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 208)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 208)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 209)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 209)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 216)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 216)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 217)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 217)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 218)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 218)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 225)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 21)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 225)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 93)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 226)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 22)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 226)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 94)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 227)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 23)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 227)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 95)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 180)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 180)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 181)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 181)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 182)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 182)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 189)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 189)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 190)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 190)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 191)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 191)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 198)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 198)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 199)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 199)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 200)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 200)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 207)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 207)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 208)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 208)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 209)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 209)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 216)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 216)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 217)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 217)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 218)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 218)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 225)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 225)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 226)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 226)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 227)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 227)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 234)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 24)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 234)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 96)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 235)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 25)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 235)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 97)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 236)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 26)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 236)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 98)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 243)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 243)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 244)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 244)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 245)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 245)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 252)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 252)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 253)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 253)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 254)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 254)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 261)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 261)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 262)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 262)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 263)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 263)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 270)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 270)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 271)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 271)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 272)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 272)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 279)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 279)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 280)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 280)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 281)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 281)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 288)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 288)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 289)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 289)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 290)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 290)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 297)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 27)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 297)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 99)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 298)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 28)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 298)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 100)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 299)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 29)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 299)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 101)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 252)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 252)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 253)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 253)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 254)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 254)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 261)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 261)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 262)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 262)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 263)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 263)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 270)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 270)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 271)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 271)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 272)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 272)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 279)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 279)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 280)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 280)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 281)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 281)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 288)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 288)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 289)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 289)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 290)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 290)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 297)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 297)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 298)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 298)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 299)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 299)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 306)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 30)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 306)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 102)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 307)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 31)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 307)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 103)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 308)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 32)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 308)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 104)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 261)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 261)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 262)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 262)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 263)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 263)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 270)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 270)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 271)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 271)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 272)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 272)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 279)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 279)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 280)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 280)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 281)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 281)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 288)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 288)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 289)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 289)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 290)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 290)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 297)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 297)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 298)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 298)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 299)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 299)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 306)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 306)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 307)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 307)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 308)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 308)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 315)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 33)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 315)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 105)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 316)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 34)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 316)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 106)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 317)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 35)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 317)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 107)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 324)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 324)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 325)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 325)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 326)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 326)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 333)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 333)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 334)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 334)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 335)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 335)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 342)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 342)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 343)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 343)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 344)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 344)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 351)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 351)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 352)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 352)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 353)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 353)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 360)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 360)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 361)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 361)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 362)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 362)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 369)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 369)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 370)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 370)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 371)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 371)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 378)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 36)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 378)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 108)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 379)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 37)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 379)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 109)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 380)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 38)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 380)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 110)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 333)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 333)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 334)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 334)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 335)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 335)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 342)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 342)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 343)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 343)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 344)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 344)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 351)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 351)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 352)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 352)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 353)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 353)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 360)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 360)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 361)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 361)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 362)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 362)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 369)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 369)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 370)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 370)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 371)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 371)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 378)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 378)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 379)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 379)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 380)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 380)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 387)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 39)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 387)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 111)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 388)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 40)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 388)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 112)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 389)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 41)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 389)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 113)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 342)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 342)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 343)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 343)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 344)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 344)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 351)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 351)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 352)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 352)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 353)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 353)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 360)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 360)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 361)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 361)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 362)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 362)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 369)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 369)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 370)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 370)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 371)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 371)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 378)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 378)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 379)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 379)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 380)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 380)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 387)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 387)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 388)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 388)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 389)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 389)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 396)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 42)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 396)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 114)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 397)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 43)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 397)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 115)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 398)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 44)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 398)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 116)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 405)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 405)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 406)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 406)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 407)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 407)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 414)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 414)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 415)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 415)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 416)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 416)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 423)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 423)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 424)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 424)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 425)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 425)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 432)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 432)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 433)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 433)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 434)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 434)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 441)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 441)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 442)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 442)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 443)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 443)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 450)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 450)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 451)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 451)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 452)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 452)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 459)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 45)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 459)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 117)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 460)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 46)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 460)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 118)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 461)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 47)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 461)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 119)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 414)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 414)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 415)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 415)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 416)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 416)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 423)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 423)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 424)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 424)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 425)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 425)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 432)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 432)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 433)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 433)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 434)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 434)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 441)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 441)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 442)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 442)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 443)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 443)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 450)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 450)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 451)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 451)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 452)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 452)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 459)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 459)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 460)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 460)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 461)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 461)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 468)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 48)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 468)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 120)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 469)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 49)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 469)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 121)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 470)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 50)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 470)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 122)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 423)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 423)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 424)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 424)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 425)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 425)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 432)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 432)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 433)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 433)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 434)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 434)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 441)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 441)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 442)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 442)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 443)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 443)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 450)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 450)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 451)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 451)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 452)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 452)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 459)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 459)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 460)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 460)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 461)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 461)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 468)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 468)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 469)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 469)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 470)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 470)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 477)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 51)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 477)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 123)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 478)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 52)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 478)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 124)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 479)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 53)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 479)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 125)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 486)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 486)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 487)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 487)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 488)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 488)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 495)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 495)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 496)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 496)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 497)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 497)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 504)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 504)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 505)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 505)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 506)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 506)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 513)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 513)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 514)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 514)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 515)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 515)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 522)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 522)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 523)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 523)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 524)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 524)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 531)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 531)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 532)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 532)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 533)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 533)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 540)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 54)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 540)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 126)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 541)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 55)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 541)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 127)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 542)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 56)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 542)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 128)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 495)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 495)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 496)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 496)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 497)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 497)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 504)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 504)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 505)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 505)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 506)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 506)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 513)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 513)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 514)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 514)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 515)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 515)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 522)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 522)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 523)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 523)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 524)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 524)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 531)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 531)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 532)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 532)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 533)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 533)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 540)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 540)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 541)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 541)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 542)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 542)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 549)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 57)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 549)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 129)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 550)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 58)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 550)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 130)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 551)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 59)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 551)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 131)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 504)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 504)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 505)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 505)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 506)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 506)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 513)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 513)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 514)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 514)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 515)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 515)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 522)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 522)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 523)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 523)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 524)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 524)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 531)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 531)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 532)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 532)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 533)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 533)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 540)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 540)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 541)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 541)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 542)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 542)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 549)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 549)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 550)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 550)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 551)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 551)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 558)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 60)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 558)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 132)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 559)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 61)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 559)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 133)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 560)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 62)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 560)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 134)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 567)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 567)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 568)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 568)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 569)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 569)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 576)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 576)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 577)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 577)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 578)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 578)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 585)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 585)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 586)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 586)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 587)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 587)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 594)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 594)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 595)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 595)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 596)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 596)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 603)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 603)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 604)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 604)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 605)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 605)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 612)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 612)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 613)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 613)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 614)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 614)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 621)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 63)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 621)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 135)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 622)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 64)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 622)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 136)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 623)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 65)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 623)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 137)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 576)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 576)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 577)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 577)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 578)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 578)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 585)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 585)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 586)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 586)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 587)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 587)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 594)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 594)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 595)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 595)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 596)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 596)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 603)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 603)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 604)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 604)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 605)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 605)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 612)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 612)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 613)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 613)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 614)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 614)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 621)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 621)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 622)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 622)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 623)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 623)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 630)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 66)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 630)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 138)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 631)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 67)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 631)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 139)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 632)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 68)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 632)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 140)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 585)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 585)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 586)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 586)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
+        conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 587)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
+        conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 587)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 594)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 594)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 595)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 595)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
+        conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 596)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
+        conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 596)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 603)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 603)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 604)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 604)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
+        conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 605)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
+        conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 605)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 612)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 612)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 613)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 613)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
+        conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 614)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
+        conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 614)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 621)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 621)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 622)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 622)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
+        conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 623)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
+        conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 623)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 630)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 630)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 631)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 631)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
+        conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 632)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
+        conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 632)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 639)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 69)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 639)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 141)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 640)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 70)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 640)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 142)]));
+        conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 641)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 71)]));
+        conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[((((int)threadIdx.x) % 7) + 641)] * kernel_shared[(((((int)threadIdx.x) / 7) * 144) + 143)]));
+      }
+      for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
+        for (int i2_inner = 0; i2_inner < 7; ++i2_inner) {
+          compute[(((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + (i2_inner * 7)) + (((int)threadIdx.x) % 7))] = max((conv2d_nchw[((i1_inner * 7) + i2_inner)] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
+        }
       }
-      compute[((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 7)) + (((int)blockIdx.x) % 7))] = max((conv2d_nchw[0] + bias[(((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
-      compute[(((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 7)) + (((int)blockIdx.x) % 7)) + 784)] = max((conv2d_nchw[1] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) / 7)) + 16)]), 0.000000e+00f);
-      compute[(((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 7)) + (((int)blockIdx.x) % 7)) + 1568)] = max((conv2d_nchw[2] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) / 7)) + 32)]), 0.000000e+00f);
-      compute[(((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 7)) + (((int)blockIdx.x) % 7)) + 2352)] = max((conv2d_nchw[3] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) / 7)) + 48)]), 0.000000e+00f);
-      compute[(((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 7)) + (((int)blockIdx.x) % 7)) + 3136)] = max((conv2d_nchw[4] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) / 7)) + 64)]), 0.000000e+00f);
-      compute[(((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 7)) + (((int)blockIdx.x) % 7)) + 3920)] = max((conv2d_nchw[5] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) / 7)) + 80)]), 0.000000e+00f);
-      compute[(((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 7)) + (((int)blockIdx.x) % 7)) + 4704)] = max((conv2d_nchw[6] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) / 7)) + 96)]), 0.000000e+00f);
-      compute[(((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 7)) + (((int)blockIdx.x) % 7)) + 5488)] = max((conv2d_nchw[7] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) / 7)) + 112)]), 0.000000e+00f);
     }
 
 
@@ -1945,7 +2675,7 @@ In the example below we resume the status and do more 5 trials.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 5 minutes  49.007 seconds)
+   **Total running time of the script:** ( 5 minutes  35.959 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
index e0ba93827c..6cab07272e 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
@@ -647,7 +647,7 @@ so we can read the log file and load the best schedules.
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-       7.8547       7.8550       7.8556       7.8535       0.0009   
+       7.9172       7.9167       7.9232       7.9116       0.0047   
                
 
 
@@ -675,7 +675,7 @@ Other Tips
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  7.441 seconds)
+   **Total running time of the script:** ( 1 minutes  5.335 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
index ca98d6f182..a18cdf6d1a 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
@@ -666,7 +666,7 @@ so we can read the log file and load the best schedules.
     Evaluate inference time cost...
     Execution time summary:
      mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
-      762.1014     762.2812     763.9098     760.1134      1.5551   
+      746.7948     746.4840     748.6214     745.2790      1.3822   
                
 
 
@@ -694,7 +694,7 @@ Other Tips
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  42.967 seconds)
+   **Total running time of the script:** ( 1 minutes  38.441 seconds)
 
 
 .. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_x86.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
index 378c6bc1f3..591e08d638 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
@@ -389,12 +389,12 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
         @T.prim_func
         def main(placeholder: T.Buffer((128, 256), "float32"), placeholder_1: T.Buffer((4916, 16, 1), "float32"), placeholder_2: T.Buffer((4916,), "int32"), placeholder_3: T.Buffer((33,), "int32"), placeholder_4: T.Buffer((128, 512), "float32"), compute: T.Buffer((128, 512), "float32")):
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-            for i0_outer_i1_outer_fused in T.parallel(32):
-                compute_1 = T.allocate([2048], "float32", "global")
-                compute_2 = T.Buffer((2048,), data=compute_1)
-                for nb_j_inner in range(2):
-                    for i_inner_init in range(64):
-                        cse_var_1: T.int32 = i_inner_init * 32 + nb_j_inner * 16
+            for i0_outer_i1_outer_fused in T.parallel(16):
+                compute_1 = T.allocate([4096], "float32", "global")
+                compute_2 = T.Buffer((4096,), data=compute_1)
+                for i_outer_inner, nb_j_inner in T.grid(8, 2):
+                    for i_inner_init in range(16):
+                        cse_var_1: T.int32 = i_outer_inner * 512 + i_inner_init * 32 + nb_j_inner * 16
                         compute_2[cse_var_1] = T.float32(0)
                         compute_2[cse_var_1 + 1] = T.float32(0)
                         compute_2[cse_var_1 + 2] = T.float32(0)
@@ -411,49 +411,49 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
                         compute_2[cse_var_1 + 13] = T.float32(0)
                         compute_2[cse_var_1 + 14] = T.float32(0)
                         compute_2[cse_var_1 + 15] = T.float32(0)
-                    for elem_idx, i_inner in T.grid(T.let(cse_var_2, i0_outer_i1_outer_fused % 16 * 2 + nb_j_inner, placeholder_5[cse_var_2 + 1] - placeholder_5[cse_var_2]), 64):
+                    for elem_idx, i_inner in T.grid(T.let(cse_var_2, i0_outer_i1_outer_fused * 2 + nb_j_inner, placeholder_5[cse_var_2 + 1] - placeholder_5[cse_var_2]), 16):
                         cse_var_2 = T.var("int32")
                         placeholder_5 = T.Buffer((33,), "int32", data=placeholder_3.data)
                         cse_var_21: T.int32 = elem_idx * 16
-                        cse_var_20: T.int32 = i_inner * 32 + nb_j_inner * 16
-                        cse_var_19: T.int32 = i0_outer_i1_outer_fused % 16 * 2 + nb_j_inner
-                        cse_var_18: T.int32 = i0_outer_i1_outer_fused // 16 * 16384 + i_inner * 256
-                        cse_var_17: T.int32 = cse_var_20 + 9
-                        cse_var_16: T.int32 = cse_var_20 + 8
-                        cse_var_15: T.int32 = cse_var_20 + 7
-                        cse_var_14: T.int32 = cse_var_20 + 6
-                        cse_var_13: T.int32 = cse_var_20 + 5
-                        cse_var_12: T.int32 = cse_var_20 + 4
-                        cse_var_11: T.int32 = cse_var_20 + 3
-                        cse_var_10: T.int32 = cse_var_20 + 2
-                        cse_var_9: T.int32 = cse_var_20 + 15
-                        cse_var_8: T.int32 = cse_var_20 + 14
-                        cse_var_7: T.int32 = cse_var_20 + 13
-                        cse_var_6: T.int32 = cse_var_20 + 12
-                        cse_var_5: T.int32 = cse_var_20 + 11
-                        cse_var_4: T.int32 = cse_var_20 + 10
-                        cse_var_3: T.int32 = cse_var_20 + 1
+                        cse_var_20: T.int32 = i0_outer_i1_outer_fused * 2 + nb_j_inner
+                        cse_var_19: T.int32 = i_outer_inner * 4096 + i_inner * 256
+                        cse_var_18: T.int32 = i_outer_inner * 512 + i_inner * 32 + nb_j_inner * 16
+                        cse_var_17: T.int32 = cse_var_18 + 9
+                        cse_var_16: T.int32 = cse_var_18 + 8
+                        cse_var_15: T.int32 = cse_var_18 + 7
+                        cse_var_14: T.int32 = cse_var_18 + 6
+                        cse_var_13: T.int32 = cse_var_18 + 5
+                        cse_var_12: T.int32 = cse_var_18 + 4
+                        cse_var_11: T.int32 = cse_var_18 + 3
+                        cse_var_10: T.int32 = cse_var_18 + 2
+                        cse_var_9: T.int32 = cse_var_18 + 15
+                        cse_var_8: T.int32 = cse_var_18 + 14
+                        cse_var_7: T.int32 = cse_var_18 + 13
+                        cse_var_6: T.int32 = cse_var_18 + 12
+                        cse_var_5: T.int32 = cse_var_18 + 11
+                        cse_var_4: T.int32 = cse_var_18 + 10
+                        cse_var_3: T.int32 = cse_var_18 + 1
                         placeholder_6 = T.Buffer((78656,), data=placeholder_1.data)
                         placeholder_7 = T.Buffer((32768,), data=placeholder.data)
                         placeholder_8 = T.Buffer((4916,), "int32", data=placeholder_2.data)
-                        compute_2[cse_var_20] = compute_2[cse_var_20] + placeholder_6[placeholder_5[cse_var_19] * 16 + cse_var_21] * T.max(placeholder_7[cse_var_18 + placeholder_8[placeholder_5[cse_var_19] + elem_idx]], T.float32(0))
-                        compute_2[cse_var_3] = compute_2[cse_var_3] + placeholder_6[placeholder_5[cse_var_19] * 16 + cse_var_21 + 1] * T.max(placeholder_7[cse_var_18 + placeholder_8[placeholder_5[cse_var_19] + elem_idx]], T.float32(0))
-                        compute_2[cse_var_10] = compute_2[cse_var_10] + placeholder_6[placeholder_5[cse_var_19] * 16 + cse_var_21 + 2] * T.max(placeholder_7[cse_var_18 + placeholder_8[placeholder_5[cse_var_19] + elem_idx]], T.float32(0))
-                        compute_2[cse_var_11] = compute_2[cse_var_11] + placeholder_6[placeholder_5[cse_var_19] * 16 + cse_var_21 + 3] * T.max(placeholder_7[cse_var_18 + placeholder_8[placeholder_5[cse_var_19] + elem_idx]], T.float32(0))
-                        compute_2[cse_var_12] = compute_2[cse_var_12] + placeholder_6[placeholder_5[cse_var_19] * 16 + cse_var_21 + 4] * T.max(placeholder_7[cse_var_18 + placeholder_8[placeholder_5[cse_var_19] + elem_idx]], T.float32(0))
-                        compute_2[cse_var_13] = compute_2[cse_var_13] + placeholder_6[placeholder_5[cse_var_19] * 16 + cse_var_21 + 5] * T.max(placeholder_7[cse_var_18 + placeholder_8[placeholder_5[cse_var_19] + elem_idx]], T.float32(0))
-                        compute_2[cse_var_14] = compute_2[cse_var_14] + placeholder_6[placeholder_5[cse_var_19] * 16 + cse_var_21 + 6] * T.max(placeholder_7[cse_var_18 + placeholder_8[placeholder_5[cse_var_19] + elem_idx]], T.float32(0))
-                        compute_2[cse_var_15] = compute_2[cse_var_15] + placeholder_6[placeholder_5[cse_var_19] * 16 + cse_var_21 + 7] * T.max(placeholder_7[cse_var_18 + placeholder_8[placeholder_5[cse_var_19] + elem_idx]], T.float32(0))
-                        compute_2[cse_var_16] = compute_2[cse_var_16] + placeholder_6[placeholder_5[cse_var_19] * 16 + cse_var_21 + 8] * T.max(placeholder_7[cse_var_18 + placeholder_8[placeholder_5[cse_var_19] + elem_idx]], T.float32(0))
-                        compute_2[cse_var_17] = compute_2[cse_var_17] + placeholder_6[placeholder_5[cse_var_19] * 16 + cse_var_21 + 9] * T.max(placeholder_7[cse_var_18 + placeholder_8[placeholder_5[cse_var_19] + elem_idx]], T.float32(0))
-                        compute_2[cse_var_4] = compute_2[cse_var_4] + placeholder_6[placeholder_5[cse_var_19] * 16 + cse_var_21 + 10] * T.max(placeholder_7[cse_var_18 + placeholder_8[placeholder_5[cse_var_19] + elem_idx]], T.float32(0))
-                        compute_2[cse_var_5] = compute_2[cse_var_5] + placeholder_6[placeholder_5[cse_var_19] * 16 + cse_var_21 + 11] * T.max(placeholder_7[cse_var_18 + placeholder_8[placeholder_5[cse_var_19] + elem_idx]], T.float32(0))
-                        compute_2[cse_var_6] = compute_2[cse_var_6] + placeholder_6[placeholder_5[cse_var_19] * 16 + cse_var_21 + 12] * T.max(placeholder_7[cse_var_18 + placeholder_8[placeholder_5[cse_var_19] + elem_idx]], T.float32(0))
-                        compute_2[cse_var_7] = compute_2[cse_var_7] + placeholder_6[placeholder_5[cse_var_19] * 16 + cse_var_21 + 13] * T.max(placeholder_7[cse_var_18 + placeholder_8[placeholder_5[cse_var_19] + elem_idx]], T.float32(0))
-                        compute_2[cse_var_8] = compute_2[cse_var_8] + placeholder_6[placeholder_5[cse_var_19] * 16 + cse_var_21 + 14] * T.max(placeholder_7[cse_var_18 + placeholder_8[placeholder_5[cse_var_19] + elem_idx]], T.float32(0))
-                        compute_2[cse_var_9] = compute_2[cse_var_9] + placeholder_6[placeholder_5[cse_var_19] * 16 + cse_var_21 + 15] * T.max(placeholder_7[cse_var_18 + placeholder_8[placeholder_5[cse_var_19] + elem_idx]], T.float32(0))
-                for i0_inner in range(64):
-                    cse_var_22: T.int32 = i0_outer_i1_outer_fused // 16 * 32768 + i0_inner * 512 + i0_outer_i1_outer_fused % 16 * 32
+                        compute_2[cse_var_18] = compute_2[cse_var_18] + placeholder_6[placeholder_5[cse_var_20] * 16 + cse_var_21] * T.max(placeholder_7[cse_var_19 + placeholder_8[placeholder_5[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_2[cse_var_3] = compute_2[cse_var_3] + placeholder_6[placeholder_5[cse_var_20] * 16 + cse_var_21 + 1] * T.max(placeholder_7[cse_var_19 + placeholder_8[placeholder_5[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_2[cse_var_10] = compute_2[cse_var_10] + placeholder_6[placeholder_5[cse_var_20] * 16 + cse_var_21 + 2] * T.max(placeholder_7[cse_var_19 + placeholder_8[placeholder_5[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_2[cse_var_11] = compute_2[cse_var_11] + placeholder_6[placeholder_5[cse_var_20] * 16 + cse_var_21 + 3] * T.max(placeholder_7[cse_var_19 + placeholder_8[placeholder_5[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_2[cse_var_12] = compute_2[cse_var_12] + placeholder_6[placeholder_5[cse_var_20] * 16 + cse_var_21 + 4] * T.max(placeholder_7[cse_var_19 + placeholder_8[placeholder_5[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_2[cse_var_13] = compute_2[cse_var_13] + placeholder_6[placeholder_5[cse_var_20] * 16 + cse_var_21 + 5] * T.max(placeholder_7[cse_var_19 + placeholder_8[placeholder_5[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_2[cse_var_14] = compute_2[cse_var_14] + placeholder_6[placeholder_5[cse_var_20] * 16 + cse_var_21 + 6] * T.max(placeholder_7[cse_var_19 + placeholder_8[placeholder_5[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_2[cse_var_15] = compute_2[cse_var_15] + placeholder_6[placeholder_5[cse_var_20] * 16 + cse_var_21 + 7] * T.max(placeholder_7[cse_var_19 + placeholder_8[placeholder_5[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_2[cse_var_16] = compute_2[cse_var_16] + placeholder_6[placeholder_5[cse_var_20] * 16 + cse_var_21 + 8] * T.max(placeholder_7[cse_var_19 + placeholder_8[placeholder_5[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_2[cse_var_17] = compute_2[cse_var_17] + placeholder_6[placeholder_5[cse_var_20] * 16 + cse_var_21 + 9] * T.max(placeholder_7[cse_var_19 + placeholder_8[placeholder_5[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_2[cse_var_4] = compute_2[cse_var_4] + placeholder_6[placeholder_5[cse_var_20] * 16 + cse_var_21 + 10] * T.max(placeholder_7[cse_var_19 + placeholder_8[placeholder_5[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_2[cse_var_5] = compute_2[cse_var_5] + placeholder_6[placeholder_5[cse_var_20] * 16 + cse_var_21 + 11] * T.max(placeholder_7[cse_var_19 + placeholder_8[placeholder_5[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_2[cse_var_6] = compute_2[cse_var_6] + placeholder_6[placeholder_5[cse_var_20] * 16 + cse_var_21 + 12] * T.max(placeholder_7[cse_var_19 + placeholder_8[placeholder_5[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_2[cse_var_7] = compute_2[cse_var_7] + placeholder_6[placeholder_5[cse_var_20] * 16 + cse_var_21 + 13] * T.max(placeholder_7[cse_var_19 + placeholder_8[placeholder_5[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_2[cse_var_8] = compute_2[cse_var_8] + placeholder_6[placeholder_5[cse_var_20] * 16 + cse_var_21 + 14] * T.max(placeholder_7[cse_var_19 + placeholder_8[placeholder_5[cse_var_20] + elem_idx]], T.float32(0))
+                        compute_2[cse_var_9] = compute_2[cse_var_9] + placeholder_6[placeholder_5[cse_var_20] * 16 + cse_var_21 + 15] * T.max(placeholder_7[cse_var_19 + placeholder_8[placeholder_5[cse_var_20] + elem_idx]], T.float32(0))
+                for i0_inner in range(128):
+                    cse_var_22: T.int32 = i0_inner * 512 + i0_outer_i1_outer_fused * 32
                     compute_3 = T.Buffer((65536,), data=compute.data)
                     placeholder_5 = T.Buffer((65536,), data=placeholder_4.data)
                     compute_3[cse_var_22:cse_var_22 + 32] = T.max(compute_2[i0_inner * 32:i0_inner * 32 + 32] + placeholder_5[cse_var_22:cse_var_22 + 32], T.Broadcast(T.float32(0), 32))
@@ -506,7 +506,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 1.859 ms
+    Execution time of this operator: 1.714 ms
 
 
 
diff --git a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
index 55a63199c0..a1ba32f444 100644
--- a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
@@ -5,16 +5,16 @@
 
 Computation times
 =================
-**00:40.644** total execution time for **how_to_tune_with_autotvm** files:
+**00:36.302** total execution time for **how_to_tune_with_autotvm** files:
 
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)           | 00:40.609 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``)           | 00:36.266 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)               | 00:00.020 | 0.0 MB |
-+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_mobile_gpu.py` (``tune_relay_mobile_gpu.py``) | 00:00.005 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``)               | 00:00.022 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``)             | 00:00.005 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_arm.py` (``tune_relay_arm.py``)               | 00:00.005 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_arm.py` (``tune_relay_arm.py``)               | 00:00.004 | 0.0 MB |
++--------------------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_mobile_gpu.py` (``tune_relay_mobile_gpu.py``) | 00:00.004 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
index 2584aabb91..51d8b51781 100644
--- a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
@@ -268,8 +268,8 @@ for this template
     waiting for device...
     device available
     Get devices for measurement successfully!
-    No: 1   GFLOPS: 152.81/152.81   result: MeasureResult(costs=(0.0015149482718446602,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.9889278411865234, timestamp=1674849751.468465)       [('tile_f', [-1, 4, 32, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9942552
-    No: 2   GFLOPS: 0.00/152.81     result: Traceback (most recent call last):
+    No: 1   GFLOPS: 377.72/377.72   result: MeasureResult(costs=(0.0006128981807909605,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.122851848602295, timestamp=1674863351.3234568)       [('tile_f', [-1, 1, 2, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4574744
+    No: 2   GFLOPS: 0.00/377.72     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -391,15 +391,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 1, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1525865
-    No: 3   GFLOPS: 13.30/152.81    result: MeasureResult(costs=(0.017408682499999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=9.569729328155518, timestamp=1674849754.323083) [('tile_f', [-1, 2, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 32]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10024961
-    No: 4   GFLOPS: 239.69/239.69   result: MeasureResult(costs=(0.0009658364,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.049877405166626, timestamp=1674849755.3298645)        [('tile_f', [-1, 1, 8, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3326741
-    No: 5   GFLOPS: 158.40/239.69   result: MeasureResult(costs=(0.0014615356375,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.2541542053222656, timestamp=1674849757.940971)     [('tile_f', [-1, 2, 32, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 128, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3510361
-    No: 6   GFLOPS: 63.33/239.69    result: MeasureResult(costs=(0.003655361418604651,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.3859949111938477, timestamp=1674849758.9519112)       [('tile_f', [-1, 1, 2, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 32, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7763248
-    No: 7   GFLOPS: 34.88/239.69    result: MeasureResult(costs=(0.006637296764705883,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8624403476715088, timestamp=1674849760.6983192)       [('tile_f', [-1, 1, 2, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 16, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,1791788
-    No: 8   GFLOPS: 6.77/239.69     result: MeasureResult(costs=(0.034191425250000004,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.1332976818084717, timestamp=1674849761.5729275)       [('tile_f', [-1, 1, 16, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1356482
-    No: 9   GFLOPS: 17.51/239.69    result: MeasureResult(costs=(0.013222627624999999,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.2726881504058838, timestamp=1674849763.015446)        [('tile_f', [-1, 2, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,8719041
-    No: 10  GFLOPS: 0.00/239.69     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 8, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 64]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,741922
+    No: 3   GFLOPS: 0.00/377.72     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -521,8 +514,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 32, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 32, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4396884
-    No: 11  GFLOPS: 0.00/239.69     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 2, 64]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,296104
+    No: 4   GFLOPS: 0.00/377.72     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -644,8 +637,10 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 16, 2, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 2, 32]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,725927
-    No: 12  GFLOPS: 0.00/239.69     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 1, 64]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 128, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3771221
+    No: 5   GFLOPS: 37.82/377.72    result: MeasureResult(costs=(0.006121584789473685,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.6088848114013672, timestamp=1674863355.1972575)       [('tile_f', [-1, 2, 1, 8]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1071537
+    No: 6   GFLOPS: 21.40/377.72    result: MeasureResult(costs=(0.0108183555,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.6242423057556152, timestamp=1674863355.9957132)       [('tile_f', [-1, 2, 1, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1457501
+    No: 7   GFLOPS: 0.00/377.72     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -767,9 +762,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 2, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,182271
-    No: 13  GFLOPS: 24.65/239.69    result: MeasureResult(costs=(0.00939338190909091,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.377659559249878, timestamp=1674849764.608265)  [('tile_f', [-1, 1, 1, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,208880
-    No: 14  GFLOPS: 0.00/239.69     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 4, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 32, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8465530
+    No: 8   GFLOPS: 0.00/377.72     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -891,8 +885,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 64, 4, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 64, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,89838
-    No: 15  GFLOPS: 0.00/239.69     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 16, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 512, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4487247
+    No: 9   GFLOPS: 0.00/377.72     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1014,8 +1008,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 2, 8]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6132203
-    No: 16  GFLOPS: 0.00/239.69     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 32, 2, 8]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 256, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3937708
+    No: 10  GFLOPS: 0.00/377.72     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1137,8 +1131,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 16, 2, 8]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 128, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8223967
-    No: 17  GFLOPS: 0.00/239.69     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 1, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 64, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3603260
+    No: 11  GFLOPS: 0.00/377.72     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1260,8 +1254,8 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 32, 8, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9825452
-    No: 18  GFLOPS: 0.00/239.69     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 1, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 32, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,5054688
+    No: 12  GFLOPS: 0.00/377.72     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1383,8 +1377,9 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 1, 32]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 64]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1708707
-    No: 19  GFLOPS: 0.00/239.69     result: Traceback (most recent call last):
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 16, 4, 2]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 64]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1323376
+    No: 13  GFLOPS: 78.03/377.72    result: MeasureResult(costs=(0.0029666868867924533,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.4817659854888916, timestamp=1674863368.350646)       [('tile_f', [-1, 4, 1, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7366702
+    No: 14  GFLOPS: 0.00/377.72     result: Traceback (most recent call last):
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
         func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1506,8 +1501,518 @@ for this template
       File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
       File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
         raise InstantiationError("Skipped because of invalid gpu kernel")
-    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 1, 2, 128]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7269013
-    No: 20  GFLOPS: 316.12/316.12   result: MeasureResult(costs=(0.0007323275755395683,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.7681198120117188, timestamp=1674849766.643176)       [('tile_f', [-1, 2, 8, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8364428
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 2, 4, 8]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 32, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3816050
+    No: 15  GFLOPS: 0.00/377.72     result: Traceback (most recent call last):
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+        func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+        func = build(s, args, target_host=task.target_host, runtime=runtime)
+      File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+        input_mod = lower(inputs, args, name=name, binds=binds)
+      File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+        return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+      File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+    tvm._ffi.base.TVMError: Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+    Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 16, 16, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 8, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3920218
+    No: 16  GFLOPS: 0.00/377.72     result: Traceback (most recent call last):
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 142, in build
+        res = future.result()
+      File "/usr/lib/python3.7/concurrent/futures/_base.py", line 435, in result
+        return self.__get_result()
+      File "/usr/lib/python3.7/concurrent/futures/_base.py", line 384, in __get_result
+        raise self._exception
+      File "/usr/lib/python3.7/concurrent/futures/thread.py", line 57, in run
+        result = self.fn(*self.args, **self.kwargs)
+      File "/workspace/python/tvm/contrib/popen_pool.py", line 432, in <lambda>
+        worker = lambda *args: self._worker_run(*args)
+      File "/workspace/python/tvm/contrib/popen_pool.py", line 401, in _worker_run
+        return proc.recv()
+      File "/workspace/python/tvm/contrib/popen_pool.py", line 309, in recv
+        raise TimeoutError()
+    TimeoutError
+
+            [('tile_f', [-1, 2, 2, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9242089
+    No: 17  GFLOPS: 0.00/377.72     result: Traceback (most recent call last):
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+        func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+        func = build(s, args, target_host=task.target_host, runtime=runtime)
+      File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+        input_mod = lower(inputs, args, name=name, binds=binds)
+      File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+        return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+      File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+    tvm._ffi.base.TVMError: Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+    Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 8, 16, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,729429
+    No: 18  GFLOPS: 0.00/377.72     result: Traceback (most recent call last):
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+        func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+        func = build(s, args, target_host=task.target_host, runtime=runtime)
+      File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+        input_mod = lower(inputs, args, name=name, binds=binds)
+      File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+        return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+      File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+    tvm._ffi.base.TVMError: Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+    Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 32, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 32, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5533702
+    No: 19  GFLOPS: 4.15/377.72     result: MeasureResult(costs=(0.05577265525,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8923165798187256, timestamp=1674863371.0967312)      [('tile_f', [-1, 2, 4, 32]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5348395
+    No: 20  GFLOPS: 0.00/377.72     result: Traceback (most recent call last):
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
+        func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
+        func = build(s, args, target_host=task.target_host, runtime=runtime)
+      File "/workspace/python/tvm/driver/build_module.py", line 227, in build
+        input_mod = lower(inputs, args, name=name, binds=binds)
+      File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
+        return ffi.lower_schedule(inp, args, name, binds, simple_mode)
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
+      File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
+    tvm._ffi.base.TVMError: Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
+
+    Traceback (most recent call last):
+      24: TVMFuncCall
+            at ../src/runtime/c_runtime_api.cc:477
+      23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      22: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      21: operator()
+            at ../include/tvm/runtime/packed_func.h:1730
+      20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
+            at ../include/tvm/runtime/packed_func.h:1670
+      19: run<>
+            at ../include/tvm/runtime/packed_func.h:1630
+      18: run<tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1630
+      14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
+            at ../include/tvm/runtime/packed_func.h:1645
+      13: operator()
+            at ../src/driver/driver_api.cc:395
+      12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
+            at ../src/driver/driver_api.cc:381
+      11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
+            at ../src/driver/driver_api.cc:276
+      10: tvm::transform::Pass::operator()(tvm::IRModule) const
+            at ../src/ir/transform.cc:258
+      9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:451
+      7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/ir/transform.cc:274
+      6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
+            at ../src/tir/ir/transform.cc:100
+      5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
+            at ../include/tvm/runtime/packed_func.h:1749
+      4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
+            at ../include/tvm/runtime/packed_func.h:1693
+      3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
+            at ../include/tvm/runtime/packed_func.h:1617
+      2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
+            at ../include/tvm/runtime/packed_func.h:1217
+      1: Call
+            at ../include/tvm/runtime/packed_func.h:1213
+      0: operator()
+            at ../src/runtime/c_runtime_api.cc:534
+      File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
+      File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
+        raise InstantiationError("Skipped because of invalid gpu kernel")
+    tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 32, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 256]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9482262
 
 
 
@@ -1562,9 +2067,9 @@ and measure running time.
     Finish loading 20 records
 
     Best config:
-    [('tile_f', [-1, 2, 8, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8364428
+    [('tile_f', [-1, 1, 2, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4574744
     Finish loading 20 records
-    Time cost of this operator: 0.001120
+    Time cost of this operator: 0.000988
 
 
 
diff --git a/docs/_sources/how_to/work_with_microtvm/index.rst.txt b/docs/_sources/how_to/work_with_microtvm/index.rst.txt
index d02b385ef8..7bd444d076 100644
--- a/docs/_sources/how_to/work_with_microtvm/index.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/index.rst.txt
@@ -17,69 +17,52 @@ demonstrate how to tune and deploy models with microTVM.
 
 .. raw:: html
 
-    <div class="sphx-glr-thumbcontainer" tooltip="This tutorial is showcasing microTVM host-driven AoT compilation with a TFLite model. AoTExecut...">
-
-.. only:: html
-
-  .. image:: /how_to/work_with_microtvm/images/thumb/sphx_glr_micro_aot_thumb.png
-    :alt: microTVM Host-Driven AoT
-
-  :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py`
-
-.. raw:: html
-
-      <div class="sphx-glr-thumbnail-title">microTVM Host-Driven AoT</div>
-    </div>
-
-
-.. raw:: html
-
-    <div class="sphx-glr-thumbcontainer" tooltip="This tutorial explains how to autotune a model using the C runtime.">
+    <div class="sphx-glr-thumbcontainer" tooltip="This tutorial explains how to compile a tiny model for a micro device, build a program on Zephy...">
 
 .. only:: html
 
-  .. image:: /how_to/work_with_microtvm/images/thumb/sphx_glr_micro_autotune_thumb.png
-    :alt: Autotuning with microTVM
+  .. image:: /how_to/work_with_microtvm/images/thumb/sphx_glr_micro_tvmc_thumb.png
+    :alt: 1. microTVM CLI Tool
 
-  :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py`
+  :ref:`sphx_glr_how_to_work_with_microtvm_micro_tvmc.py`
 
 .. raw:: html
 
-      <div class="sphx-glr-thumbnail-title">Autotuning with microTVM</div>
+      <div class="sphx-glr-thumbnail-title">1. microTVM CLI Tool</div>
     </div>
 
 
 .. raw:: html
 
-    <div class="sphx-glr-thumbcontainer" tooltip="This section contains an example of how to use TVM to run a model on an Arm(R) Cortex(R)-M55 CP...">
+    <div class="sphx-glr-thumbcontainer" tooltip="This tutorial is an introduction to working with microTVM and a TFLite model with Relay.">
 
 .. only:: html
 
-  .. image:: /how_to/work_with_microtvm/images/thumb/sphx_glr_micro_ethosu_thumb.png
-    :alt: Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN
+  .. image:: /how_to/work_with_microtvm/images/thumb/sphx_glr_micro_tflite_thumb.png
+    :alt: 2. microTVM TFLite Tutorial
 
-  :ref:`sphx_glr_how_to_work_with_microtvm_micro_ethosu.py`
+  :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py`
 
 .. raw:: html
 
-      <div class="sphx-glr-thumbnail-title">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</div>
+      <div class="sphx-glr-thumbnail-title">2. microTVM TFLite Tutorial</div>
     </div>
 
 
 .. raw:: html
 
-    <div class="sphx-glr-thumbcontainer" tooltip="This tutorial is showcasing building an MLPerfTiny submission using microTVM. This tutorial sho...">
+    <div class="sphx-glr-thumbcontainer" tooltip="This tutorial is showcasing microTVM host-driven AoT compilation with a TFLite model. AoTExecut...">
 
 .. only:: html
 
-  .. image:: /how_to/work_with_microtvm/images/thumb/sphx_glr_micro_mlperftiny_thumb.png
-    :alt: Creating Your MLPerfTiny Submission with microTVM
+  .. image:: /how_to/work_with_microtvm/images/thumb/sphx_glr_micro_aot_thumb.png
+    :alt: 3. microTVM Ahead-of-Time (AOT) Compilation
 
-  :ref:`sphx_glr_how_to_work_with_microtvm_micro_mlperftiny.py`
+  :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py`
 
 .. raw:: html
 
-      <div class="sphx-glr-thumbnail-title">Creating Your MLPerfTiny Submission with microTVM</div>
+      <div class="sphx-glr-thumbnail-title">3. microTVM Ahead-of-Time (AOT) Compilation</div>
     </div>
 
 
@@ -90,81 +73,81 @@ demonstrate how to tune and deploy models with microTVM.
 .. only:: html
 
   .. image:: /how_to/work_with_microtvm/images/thumb/sphx_glr_micro_pytorch_thumb.png
-    :alt: microTVM PyTorch Tutorial
+    :alt: 4. microTVM PyTorch Tutorial
 
   :ref:`sphx_glr_how_to_work_with_microtvm_micro_pytorch.py`
 
 .. raw:: html
 
-      <div class="sphx-glr-thumbnail-title">microTVM PyTorch Tutorial</div>
+      <div class="sphx-glr-thumbnail-title">4. microTVM PyTorch Tutorial</div>
     </div>
 
 
 .. raw:: html
 
-    <div class="sphx-glr-thumbcontainer" tooltip="This tutorial explains how to launch microTVM Reference Virtual Machines. You can use these to ...">
+    <div class="sphx-glr-thumbcontainer" tooltip="This tutorial shows how MobileNetV1 models can be trained to fit on embedded devices, and how t...">
 
 .. only:: html
 
-  .. image:: /how_to/work_with_microtvm/images/thumb/sphx_glr_micro_reference_vm_thumb.png
-    :alt: microTVM Reference Virtual Machines
+  .. image:: /how_to/work_with_microtvm/images/thumb/sphx_glr_micro_train_thumb.png
+    :alt: 5. Training Vision Models for microTVM on Arduino
 
-  :ref:`sphx_glr_how_to_work_with_microtvm_micro_reference_vm.py`
+  :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py`
 
 .. raw:: html
 
-      <div class="sphx-glr-thumbnail-title">microTVM Reference Virtual Machines</div>
+      <div class="sphx-glr-thumbnail-title">5. Training Vision Models for microTVM on Arduino</div>
     </div>
 
 
 .. raw:: html
 
-    <div class="sphx-glr-thumbcontainer" tooltip="This tutorial is an introduction to working with microTVM and a TFLite model with Relay.">
+    <div class="sphx-glr-thumbcontainer" tooltip="This tutorial explains how to autotune a model using the C runtime.">
 
 .. only:: html
 
-  .. image:: /how_to/work_with_microtvm/images/thumb/sphx_glr_micro_tflite_thumb.png
-    :alt: microTVM with TFLite Models
+  .. image:: /how_to/work_with_microtvm/images/thumb/sphx_glr_micro_autotune_thumb.png
+    :alt: 6. Model Tuning with microTVM
 
-  :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py`
+  :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py`
 
 .. raw:: html
 
-      <div class="sphx-glr-thumbnail-title">microTVM with TFLite Models</div>
+      <div class="sphx-glr-thumbnail-title">6. Model Tuning with microTVM</div>
     </div>
 
 
 .. raw:: html
 
-    <div class="sphx-glr-thumbcontainer" tooltip="This tutorial shows how MobileNetV1 models can be trained to fit on embedded devices, and how t...">
+    <div class="sphx-glr-thumbcontainer" tooltip="This section contains an example of how to use TVM to run a model on an Arm(R) Cortex(R)-M55 CP...">
 
 .. only:: html
 
-  .. image:: /how_to/work_with_microtvm/images/thumb/sphx_glr_micro_train_thumb.png
-    :alt: Training Vision Models for microTVM on Arduino
+  .. image:: /how_to/work_with_microtvm/images/thumb/sphx_glr_micro_ethosu_thumb.png
+    :alt: 7. Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN
 
-  :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py`
+  :ref:`sphx_glr_how_to_work_with_microtvm_micro_ethosu.py`
 
 .. raw:: html
 
-      <div class="sphx-glr-thumbnail-title">Training Vision Models for microTVM on Arduino</div>
+      <div class="sphx-glr-thumbnail-title">7. Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</div>
     </div>
 
 
 .. raw:: html
 
-    <div class="sphx-glr-thumbcontainer" tooltip="This tutorial explains how to compile a tiny model for a micro device, build a program on Zephy...">
+    <div class="sphx-glr-thumbcontainer" tooltip="This tutorial is showcasing building an MLPerfTiny submission using microTVM. This tutorial sho...">
 
 .. only:: html
 
-  .. image:: /how_to/work_with_microtvm/images/thumb/sphx_glr_micro_tvmc_thumb.png
-    :alt: Executing a Tiny Model with TVMC Micro
+  .. image:: /how_to/work_with_microtvm/images/thumb/sphx_glr_micro_mlperftiny_thumb.png
+    :alt: 8. Creating Your MLPerfTiny Submission with microTVM
 
-  :ref:`sphx_glr_how_to_work_with_microtvm_micro_tvmc.py`
+  :ref:`sphx_glr_how_to_work_with_microtvm_micro_mlperftiny.py`
 
 .. raw:: html
 
-      <div class="sphx-glr-thumbnail-title">Executing a Tiny Model with TVMC Micro</div>
+      <div class="sphx-glr-thumbnail-title">8. Creating Your MLPerfTiny Submission with microTVM</div>
     </div>
 
 
@@ -176,15 +159,14 @@ demonstrate how to tune and deploy models with microTVM.
 .. toctree::
    :hidden:
 
+   /how_to/work_with_microtvm/micro_tvmc
+   /how_to/work_with_microtvm/micro_tflite
    /how_to/work_with_microtvm/micro_aot
+   /how_to/work_with_microtvm/micro_pytorch
+   /how_to/work_with_microtvm/micro_train
    /how_to/work_with_microtvm/micro_autotune
    /how_to/work_with_microtvm/micro_ethosu
    /how_to/work_with_microtvm/micro_mlperftiny
-   /how_to/work_with_microtvm/micro_pytorch
-   /how_to/work_with_microtvm/micro_reference_vm
-   /how_to/work_with_microtvm/micro_tflite
-   /how_to/work_with_microtvm/micro_train
-   /how_to/work_with_microtvm/micro_tvmc
 
 
 
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_aot.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_aot.rst.txt
index f6cff8f0dc..83a8da934e 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_aot.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_aot.rst.txt
@@ -22,10 +22,10 @@
 .. _sphx_glr_how_to_work_with_microtvm_micro_aot.py:
 
 
-.. _tutorial-micro-AoT:
+.. _tutorial-micro-aot:
 
-microTVM Host-Driven AoT
-===========================
+3. microTVM Ahead-of-Time (AOT) Compilation
+===========================================
 **Authors**:
 `Mehrdad Hessar <https://github.com/mehrdadh>`_,
 `Alan MacDonald <https://github.com/alanmacd>`_
@@ -73,7 +73,7 @@ Import Python dependencies
 -------------------------------
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 56-65
+.. GENERATED FROM PYTHON SOURCE LINES 56-66
 
 .. code-block:: default
 
@@ -83,6 +83,7 @@ Import Python dependencies
 
     import tvm
     from tvm import relay
+    import tvm.micro.testing
     from tvm.relay.backend import Executor, Runtime
     from tvm.contrib.download import download_testdata
 
@@ -93,7 +94,7 @@ Import Python dependencies
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 66-76
+.. GENERATED FROM PYTHON SOURCE LINES 67-77
 
 Import a TFLite model
 ---------------------
@@ -106,7 +107,7 @@ To test this model, we use samples from `KWS dataset provided by Google <https:/
 you need to export `TVM_MICRO_USE_HW` environment variable.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 76-97
+.. GENERATED FROM PYTHON SOURCE LINES 77-98
 
 .. code-block:: default
 
@@ -138,7 +139,7 @@ you need to export `TVM_MICRO_USE_HW` environment variable.
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 98-108
+.. GENERATED FROM PYTHON SOURCE LINES 99-108
 
 Defining the target
 -------------------
@@ -147,11 +148,10 @@ Now we need to define the target, runtime and executor. In this tutorial, we foc
 using AOT host driven executor. We use the host micro target which is for running a model
 on x86 CPU using CRT runtime or running a model with Zephyr platform on qemu_x86 simulator
 board. In the case of a physical microcontroller, we get the target model for the physical
-board (E.g. nucleo_l4r5zi) and pass it to `tvm.target.target.micro` to create a full
-micro target.
+board (E.g. nucleo_l4r5zi) and change `BOARD` to supported Zephyr board.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 108-127
+.. GENERATED FROM PYTHON SOURCE LINES 108-124
 
 .. code-block:: default
 
@@ -161,18 +161,15 @@ micro target.
 
     # Simulate a microcontroller on the host machine. Uses the main() from `src/runtime/crt/host/main.cc`.
     # To use physical hardware, replace "host" with something matching your hardware.
-    TARGET = tvm.target.target.micro("host")
+    TARGET = tvm.micro.testing.get_target("crt")
 
     # Use the AOT executor rather than graph or vm executors. Don't use unpacked API or C calling style.
     EXECUTOR = Executor("aot")
 
     if use_physical_hw:
-        boards_file = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) / "boards.json"
-        with open(boards_file) as f:
-            boards = json.load(f)
         BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_l4r5zi")
         SERIAL = os.getenv("TVM_MICRO_SERIAL", default=None)
-        TARGET = tvm.target.target.micro(boards[BOARD]["model"])
+        TARGET = tvm.micro.testing.get_target("zephyr", BOARD)
 
 
 
@@ -181,7 +178,7 @@ micro target.
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 128-133
+.. GENERATED FROM PYTHON SOURCE LINES 125-130
 
 Compile the model
 -----------------
@@ -189,7 +186,7 @@ Compile the model
 Now, we compile the model for the target:
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 133-138
+.. GENERATED FROM PYTHON SOURCE LINES 130-135
 
 .. code-block:: default
 
@@ -205,7 +202,7 @@ Now, we compile the model for the target:
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 139-147
+.. GENERATED FROM PYTHON SOURCE LINES 136-144
 
 Create a microTVM project
 -------------------------
@@ -216,7 +213,7 @@ CRT and Zephyr microTVM template projects which are used for x86 CPU and Zephyr
 respectively.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 147-166
+.. GENERATED FROM PYTHON SOURCE LINES 144-163
 
 .. code-block:: default
 
@@ -246,7 +243,7 @@ respectively.
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 167-175
+.. GENERATED FROM PYTHON SOURCE LINES 164-172
 
 Build, flash and execute the model
 ----------------------------------
@@ -257,7 +254,7 @@ Next, we define the labels for the model output and execute the model with a
 sample with expected value of 6 (label: left).
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 175-199
+.. GENERATED FROM PYTHON SOURCE LINES 172-196
 
 .. code-block:: default
 
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
index 36f0fd3bb3..0d08e4bc28 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
@@ -24,8 +24,8 @@
 
 .. _tutorial-micro-autotune:
 
-Autotuning with microTVM
-=========================
+6. Model Tuning with microTVM
+=============================
 **Authors**:
 `Andrew Reusch <https://github.com/areusch>`_,
 `Mehrdad Hessar <https://github.com/mehrdadh>`_
@@ -67,7 +67,7 @@ Import Python dependencies
 -------------------------------
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 52-59
+.. GENERATED FROM PYTHON SOURCE LINES 52-60
 
 .. code-block:: default
 
@@ -77,6 +77,7 @@ Import Python dependencies
 
     import tvm
     from tvm.relay.backend import Runtime
+    import tvm.micro.testing
 
 
 
@@ -85,7 +86,7 @@ Import Python dependencies
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 60-66
+.. GENERATED FROM PYTHON SOURCE LINES 61-67
 
 Defining the model
 ###################
@@ -94,7 +95,7 @@ Defining the model
  fill parameters with random numbers.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 66-91
+.. GENERATED FROM PYTHON SOURCE LINES 67-92
 
 .. code-block:: default
 
@@ -130,7 +131,7 @@ Defining the model
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 92-103
+.. GENERATED FROM PYTHON SOURCE LINES 93-104
 
 Defining the target
 ######################
@@ -144,26 +145,22 @@ Defining the target
  this tutorial.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 103-121
+.. GENERATED FROM PYTHON SOURCE LINES 104-118
 
 .. code-block:: default
 
 
     RUNTIME = Runtime("crt", {"system-lib": True})
-    TARGET = tvm.target.target.micro("host")
+    TARGET = tvm.micro.testing.get_target("crt")
 
     # Compiling for physical hardware
     # --------------------------------------------------------------------------
     #  When running on physical hardware, choose a TARGET and a BOARD that describe the hardware. The
     #  STM32L4R5ZI Nucleo target and board is chosen in the example below.
     if use_physical_hw:
-        boards_file = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) / "boards.json"
-        with open(boards_file) as f:
-            boards = json.load(f)
-
         BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_l4r5zi")
         SERIAL = os.getenv("TVM_MICRO_SERIAL", default=None)
-        TARGET = tvm.target.target.micro(boards[BOARD]["model"])
+        TARGET = tvm.micro.testing.get_target("zephyr", BOARD)
 
 
 
@@ -173,7 +170,7 @@ Defining the target
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 122-131
+.. GENERATED FROM PYTHON SOURCE LINES 119-128
 
 Extracting tuning tasks
 ########################
@@ -185,7 +182,7 @@ Extracting tuning tasks
  transformation passes; we'll apply the same configuration later on during autotuning.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 131-137
+.. GENERATED FROM PYTHON SOURCE LINES 128-134
 
 .. code-block:: default
 
@@ -202,7 +199,7 @@ Extracting tuning tasks
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 138-148
+.. GENERATED FROM PYTHON SOURCE LINES 135-145
 
 Configuring microTVM
 #####################
@@ -215,7 +212,7 @@ Configuring microTVM
  choose other options by choosing from `PLATFORM` list.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 148-186
+.. GENERATED FROM PYTHON SOURCE LINES 145-183
 
 .. code-block:: default
 
@@ -264,14 +261,14 @@ Configuring microTVM
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 187-191
+.. GENERATED FROM PYTHON SOURCE LINES 184-188
 
 Run Autotuning
 #########################
  Now we can run autotuning separately on each extracted task on microTVM device.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 191-209
+.. GENERATED FROM PYTHON SOURCE LINES 188-206
 
 .. code-block:: default
 
@@ -300,7 +297,7 @@ Run Autotuning
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 210-216
+.. GENERATED FROM PYTHON SOURCE LINES 207-213
 
 Timing the untuned program
 ###########################
@@ -309,7 +306,7 @@ Timing the untuned program
  the tuned operator.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 216-255
+.. GENERATED FROM PYTHON SOURCE LINES 213-252
 
 .. code-block:: default
 
@@ -363,21 +360,21 @@ Timing the untuned program
     ########## Build without Autotuning ##########
     Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  Measurements(us)  
     ---------                                     ---                                           --------  -------  -----              ------  -------  ----------------  
-    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  311.6     98.67    (1, 2, 10, 10, 3)  2       1        [311.6]           
-    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.215     1.018    (1, 6, 10, 10)     1       1        [3.215]           
-    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.984     0.312    (1, 1, 10, 10, 3)  1       1        [0.984]           
-    Total_time                                    -                                             315.799   -        -                  -       -        -                 
+    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  311.6     98.595   (1, 2, 10, 10, 3)  2       1        [311.6]           
+    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       3.111     0.984    (1, 6, 10, 10)     1       1        [3.111]           
+    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         1.328     0.42     (1, 1, 10, 10, 3)  1       1        [1.328]           
+    Total_time                                    -                                             316.04    -        -                  -       -        -                 
 
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 256-259
+.. GENERATED FROM PYTHON SOURCE LINES 253-256
 
 Timing the tuned program
 #########################
  Once autotuning completes, you can time execution of the entire program using the Debug Runtime:
 
-.. GENERATED FROM PYTHON SOURCE LINES 259-298
+.. GENERATED FROM PYTHON SOURCE LINES 256-295
 
 .. code-block:: default
 
@@ -431,10 +428,10 @@ Timing the tuned program
     ########## Build with Autotuning ##########
     Node Name                                     Ops                                           Time(us)  Time(%)  Shape              Inputs  Outputs  Measurements(us)  
     ---------                                     ---                                           --------  -------  -----              ------  -------  ----------------  
-    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  102.7     97.439   (1, 6, 10, 10, 1)  2       1        [102.7]           
-    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.823     1.73     (1, 6, 10, 10)     1       1        [1.823]           
-    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.876     0.831    (1, 3, 10, 10, 1)  1       1        [0.876]           
-    Total_time                                    -                                             105.4     -        -                  -       -        -                 
+    tvmgen_default_fused_nn_contrib_conv2d_NCHWc  tvmgen_default_fused_nn_contrib_conv2d_NCHWc  101.3     97.386   (1, 6, 10, 10, 1)  2       1        [101.3]           
+    tvmgen_default_fused_layout_transform_1       tvmgen_default_fused_layout_transform_1       1.765     1.697    (1, 6, 10, 10)     1       1        [1.765]           
+    tvmgen_default_fused_layout_transform         tvmgen_default_fused_layout_transform         0.954     0.917    (1, 1, 10, 10, 3)  1       1        [0.954]           
+    Total_time                                    -                                             104.019   -        -                  -       -        -                 
 
 
 
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_ethosu.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_ethosu.rst.txt
index 3ec24b6e59..e9b3f07a42 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_ethosu.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_ethosu.rst.txt
@@ -22,8 +22,10 @@
 .. _sphx_glr_how_to_work_with_microtvm_micro_ethosu.py:
 
 
-Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN
-======================================================================================
+.. _tutorial-micro-ethosu:
+
+7. Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN
+=========================================================================================
 **Author**:
 `Grant Watson <https://github.com/grant-arm>`_
 
@@ -43,7 +45,7 @@ It provides a programmer's view that is suitable for software development.
 In this tutorial, we will be compiling a MobileNet v1 model and instructing
 TVM to offload operators to the Ethos(TM)-U55 where possible.
 
-.. GENERATED FROM PYTHON SOURCE LINES 42-70
+.. GENERATED FROM PYTHON SOURCE LINES 44-72
 
 Obtaining TVM
 -------------
@@ -74,7 +76,7 @@ Typing ``tvmc`` on the command line should display the following:
     TVMC - TVM driver command-line interface
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 72-102
+.. GENERATED FROM PYTHON SOURCE LINES 74-104
 
 Installing additional python dependencies
 -----------------------------------------
@@ -107,7 +109,7 @@ These packages can be installed by running the following from the command line:
   pip install -r requirements.txt
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 104-124
+.. GENERATED FROM PYTHON SOURCE LINES 106-126
 
 Obtaining the Model
 -------------------
@@ -130,7 +132,7 @@ For this tutorial we will be using the model in Tflite format.
   tar xvf mobilenet_v1_1.0_224_quant.tar
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 126-152
+.. GENERATED FROM PYTHON SOURCE LINES 128-154
 
 Compiling the model for Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN
 ------------------------------------------------------------------------------------
@@ -159,7 +161,7 @@ on our target device using the TVM runtime.
                --output-format=mlf
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 154-182
+.. GENERATED FROM PYTHON SOURCE LINES 156-184
 
 .. note:: Explanation of tvmc compile arguments:
 
@@ -190,7 +192,7 @@ on our target device using the TVM runtime.
   * ``--output-format=mlf`` : Output should be generated in the Model Library Format.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 184-191
+.. GENERATED FROM PYTHON SOURCE LINES 186-193
 
 .. note:: If you don't want to make use of the microNPU and want to offload
    operators to CMSIS-NN only:
@@ -200,7 +202,7 @@ on our target device using the TVM runtime.
   * Remove the microNPU config parameter ``--target-ethos-u-accelerator_config=ethos-u55-256``
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 193-200
+.. GENERATED FROM PYTHON SOURCE LINES 195-202
 
 Extracting the generated code into the current directory
 --------------------------------------------------------
@@ -210,7 +212,7 @@ Extracting the generated code into the current directory
   tar xvf module.tar
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 202-216
+.. GENERATED FROM PYTHON SOURCE LINES 204-218
 
 Getting ImageNet labels
 -----------------------
@@ -227,7 +229,7 @@ to include them in our C application later.
   -o ./labels_mobilenet_quant_v1_224.txt
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 218-236
+.. GENERATED FROM PYTHON SOURCE LINES 220-238
 
 Getting the input image
 -----------------------
@@ -248,7 +250,7 @@ in the next step to convert the image into an array of bytes in a C header file.
   curl -sS https://s3.amazonaws.com/model-server/inputs/kitten.jpg -o ./kitten.jpg
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 238-315
+.. GENERATED FROM PYTHON SOURCE LINES 240-317
 
 Pre-processing the image
 ------------------------
@@ -328,7 +330,7 @@ Run the script from the command line:
 
   python convert_image.py ./kitten.jpg
 
-.. GENERATED FROM PYTHON SOURCE LINES 317-362
+.. GENERATED FROM PYTHON SOURCE LINES 319-364
 
 Pre-processing the labels
 -------------------------
@@ -376,7 +378,7 @@ Run the script from the command line:
 
   python convert_labels.py
 
-.. GENERATED FROM PYTHON SOURCE LINES 364-436
+.. GENERATED FROM PYTHON SOURCE LINES 366-438
 
 Writing the demo application
 ----------------------------
@@ -451,14 +453,14 @@ In addition, you will need these header files from github in your ``./include``
 
 `include files <https://github.com/apache/tvm/tree/main/apps/microtvm/ethosu/include>`_
 
-.. GENERATED FROM PYTHON SOURCE LINES 438-442
+.. GENERATED FROM PYTHON SOURCE LINES 440-444
 
 .. note::
 
   If you'd like to use FreeRTOS for task scheduling and queues, a sample application can be found here
  `demo_freertos.c <https://github.com/apache/tvm/blob/main/apps/microtvm/ethosu/src/demo_freertos.c>`_
 
-.. GENERATED FROM PYTHON SOURCE LINES 444-454
+.. GENERATED FROM PYTHON SOURCE LINES 446-456
 
 Creating the linker script
 --------------------------
@@ -471,7 +473,7 @@ placed in your working directory.
 An example linker script for the FVP can be found here
 `corstone300.ld <https://github.com/apache/tvm/blob/main/apps/microtvm/ethosu/corstone300.ld>`_
 
-.. GENERATED FROM PYTHON SOURCE LINES 456-463
+.. GENERATED FROM PYTHON SOURCE LINES 458-465
 
 .. note::
 
@@ -481,7 +483,7 @@ An example linker script for the FVP can be found here
   fit into the limited SRAM available. For this reason it's important that the
   linker script places the ``ethosu_scratch`` section into DRAM (DDR).
 
-.. GENERATED FROM PYTHON SOURCE LINES 465-474
+.. GENERATED FROM PYTHON SOURCE LINES 467-476
 
 .. note::
 
@@ -493,7 +495,7 @@ An example linker script for the FVP can be found here
   ``export PATH=/opt/arm/FVP_Corstone_SSE-300_Ethos-U55/models/Linux64_GCC-6.4:/opt/arm/cmake/bin:$PATH``
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 476-484
+.. GENERATED FROM PYTHON SOURCE LINES 478-486
 
 Building the demo application using make
 ----------------------------------------
@@ -504,7 +506,7 @@ in your working directory before running ``make`` on the command line:
 An example Makefile can be found here:
 `Makefile <https://github.com/apache/tvm/blob/main/apps/microtvm/ethosu/Makefile>`_
 
-.. GENERATED FROM PYTHON SOURCE LINES 486-491
+.. GENERATED FROM PYTHON SOURCE LINES 488-493
 
 .. note::
 
@@ -512,7 +514,7 @@ An example Makefile can be found here:
     ``make FREERTOS_PATH=<FreeRTOS directory>``
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 493-573
+.. GENERATED FROM PYTHON SOURCE LINES 495-575
 
 Running the demo application
 ----------------------------
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_mlperftiny.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_mlperftiny.rst.txt
index a94af8f3bf..3a08be140d 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_mlperftiny.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_mlperftiny.rst.txt
@@ -22,10 +22,10 @@
 .. _sphx_glr_how_to_work_with_microtvm_micro_mlperftiny.py:
 
 
-.. _tutorial-micro-MLPerfTiny:
+.. _tutorial-micro-mlperftiny:
 
-Creating Your MLPerfTiny Submission with microTVM
-=================================================
+8. Creating Your MLPerfTiny Submission with microTVM
+====================================================
 **Authors**:
 `Mehrdad Hessar <https://github.com/mehrdadh>`_
 
@@ -73,7 +73,7 @@ Import Python dependencies
 -------------------------------
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 63-77
+.. GENERATED FROM PYTHON SOURCE LINES 63-78
 
 .. code-block:: default
 
@@ -86,13 +86,14 @@ Import Python dependencies
     from tvm.contrib.download import download_testdata
     from tvm.micro import export_model_library_format
     from tvm.micro.model_library_format import generate_c_interface_header
+    import tvm.micro.testing
     from tvm.micro.testing.utils import (
         create_header_file,
         mlf_extract_workspace_size_bytes,
     )
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 78-100
+.. GENERATED FROM PYTHON SOURCE LINES 79-101
 
 Import Visual Wake Word Model
 --------------------------------------------------------------------
@@ -117,7 +118,7 @@ If you would like to build the submission with CMSIS-NN, modify USE_CMSIS enviro
     export USE_CMSIS=1
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 100-144
+.. GENERATED FROM PYTHON SOURCE LINES 101-145
 
 .. code-block:: default
 
@@ -166,7 +167,7 @@ If you would like to build the submission with CMSIS-NN, modify USE_CMSIS enviro
     )
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 145-153
+.. GENERATED FROM PYTHON SOURCE LINES 146-154
 
 Defining Target, Runtime and Executor
 --------------------------------------------------------------------
@@ -177,7 +178,7 @@ than using AoT with host-driven mode where the target would communicate with hos
 AoT executor to run inference.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 153-172
+.. GENERATED FROM PYTHON SOURCE LINES 154-173
 
 .. code-block:: default
 
@@ -201,7 +202,7 @@ AoT executor to run inference.
     TARGET = tvm.micro.testing.get_target("zephyr", BOARD)
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 173-181
+.. GENERATED FROM PYTHON SOURCE LINES 174-182
 
 Compile the model and export model library format
 --------------------------------------------------------------------
@@ -212,7 +213,7 @@ workspace size that is required for the compiled model.
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 181-199
+.. GENERATED FROM PYTHON SOURCE LINES 182-200
 
 .. code-block:: default
 
@@ -235,7 +236,7 @@ workspace size that is required for the compiled model.
     workspace_size = mlf_extract_workspace_size_bytes(model_tar_path)
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 200-209
+.. GENERATED FROM PYTHON SOURCE LINES 201-210
 
 Generate input/output header files
 --------------------------------------------------------------------
@@ -247,7 +248,7 @@ standalone project. For this specific submission, we only need to generate
 output header file since the input API call is handled differently.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 209-232
+.. GENERATED FROM PYTHON SOURCE LINES 210-233
 
 .. code-block:: default
 
@@ -275,7 +276,7 @@ output header file since the input API call is handled differently.
         )
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 233-243
+.. GENERATED FROM PYTHON SOURCE LINES 234-244
 
 Create the project, build and prepare the project tar file
 --------------------------------------------------------------------
@@ -288,7 +289,7 @@ current working directory which could be downloaded and used on
 your development kit.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 243-292
+.. GENERATED FROM PYTHON SOURCE LINES 244-293
 
 .. code-block:: default
 
@@ -342,7 +343,7 @@ your development kit.
     print(f"The generated project is located here: {project_tar_path}")
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 293-313
+.. GENERATED FROM PYTHON SOURCE LINES 294-314
 
 Use this project with your board
 --------------------------------------------------------------------
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
index f20259bcd9..b5d306fa29 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
@@ -22,10 +22,10 @@
 .. _sphx_glr_how_to_work_with_microtvm_micro_pytorch.py:
 
 
-.. _tutorial-micro-Pytorch:
+.. _tutorial-micro-pytorch:
 
-microTVM PyTorch Tutorial
-===========================
+4. microTVM PyTorch Tutorial
+============================
 **Authors**:
 `Mehrdad Hessar <https://github.com/mehrdadh>`_
 
@@ -40,7 +40,7 @@ since the model would not fit on our current supported Zephyr boards.
 .. include:: ../../../../gallery/how_to/work_with_microtvm/install_dependencies.rst
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 36-50
+.. GENERATED FROM PYTHON SOURCE LINES 36-51
 
 .. code-block:: default
 
@@ -57,6 +57,7 @@ since the model would not fit on our current supported Zephyr boards.
     from tvm import relay
     from tvm.contrib.download import download_testdata
     from tvm.relay.backend import Executor
+    import tvm.micro.testing
 
 
 
@@ -65,7 +66,7 @@ since the model would not fit on our current supported Zephyr boards.
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 51-57
+.. GENERATED FROM PYTHON SOURCE LINES 52-58
 
 Load a pre-trained PyTorch model
 --------------------------------
@@ -74,7 +75,7 @@ To begin with, load pre-trained MobileNetV2 from torchvision. Then,
 download a cat image and preprocess it to use as the model input.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 57-85
+.. GENERATED FROM PYTHON SOURCE LINES 58-86
 
 .. code-block:: default
 
@@ -117,7 +118,7 @@ download a cat image and preprocess it to use as the model input.
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/ao/quantization/utils.py:281: UserWarning: must run observer before calling calculate_qparams. Returning default values.
       "must run observer before calling calculate_qparams. " +
     Downloading: "https://download.pytorch.org/models/quantized/mobilenet_v2_qnnpack_37f702c5.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2_qnnpack_37f702c5.pth
-
      0%|          | 0.00/3.42M [00:00<?, ?B/s]
    100%|##########| 3.42M/3.42M [00:00<00:00, 40.0MB/s]
+
      0%|          | 0.00/3.42M [00:00<?, ?B/s]
     61%|######    | 2.09M/3.42M [00:00<00:00, 20.9MB/s]
    100%|##########| 3.42M/3.42M [00:00<00:00, 32.9MB/s]
     /workspace/python/tvm/relay/frontend/pytorch_utils.py:47: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
       return LooseVersion(torch_ver) > ver
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/setuptools/_distutils/version.py:346: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
@@ -126,7 +127,7 @@ download a cat image and preprocess it to use as the model input.
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 86-95
+.. GENERATED FROM PYTHON SOURCE LINES 87-101
 
 Define Target, Runtime and Executor
 -----------------------------------
@@ -136,18 +137,19 @@ for an emulated embedded environment on an x86 machine we use C runtime (CRT)
 and we use `host` micro target. Using this setup, TVM compiles the model
 for C runtime which can run on a x86 CPU machine with the same flow that
 would run on a physical microcontroller.
+CRT uses the main() from `src/runtime/crt/host/main.cc`
+To use physical hardware, replace `board` with another physical micro target, e.g. `nrf5340dk_nrf5340_cpuapp`
+or `mps2_an521` and change the platform type to Zephyr.
+See more target examples in :ref:`Training Vision Models for microTVM on Arduino <tutorial-micro-train-arduino>`
+and :ref:`microTVM TFLite Tutorial<tutorial_micro_tflite>`.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 95-108
+.. GENERATED FROM PYTHON SOURCE LINES 101-110
 
 .. code-block:: default
 
 
-
-    # Simulate a microcontroller on the host machine. Uses the main() from `src/runtime/crt/host/main.cc`
-    # To use physical hardware, replace "host" with another physical micro target, e.g. `nrf52840`
-    # or `mps2_an521`. See more more target examples in micro_train.py and micro_tflite.py tutorials.
-    target = tvm.target.target.micro("host")
+    target = tvm.micro.testing.get_target(platform="crt", board=None)
 
     # Use the C runtime (crt) and enable static linking by setting system-lib to True
     runtime = tvm.relay.backend.Runtime("crt", {"system-lib": True})
@@ -162,7 +164,7 @@ would run on a physical microcontroller.
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 109-114
+.. GENERATED FROM PYTHON SOURCE LINES 111-116
 
 Compile the model
 ------------------
@@ -170,7 +172,7 @@ Compile the model
 Now, we compile the model for the target:
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 114-123
+.. GENERATED FROM PYTHON SOURCE LINES 116-125
 
 .. code-block:: default
 
@@ -190,7 +192,7 @@ Now, we compile the model for the target:
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 124-130
+.. GENERATED FROM PYTHON SOURCE LINES 126-132
 
 Create a microTVM project
 -------------------------
@@ -199,7 +201,7 @@ Now that we have the compiled model as an IRModule, we need to create a firmware
 to use the compiled model with microTVM. To do this, we use Project API.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 130-142
+.. GENERATED FROM PYTHON SOURCE LINES 132-144
 
 .. code-block:: default
 
@@ -222,7 +224,7 @@ to use the compiled model with microTVM. To do this, we use Project API.
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 143-149
+.. GENERATED FROM PYTHON SOURCE LINES 145-151
 
 Build, flash and execute the model
 ----------------------------------
@@ -231,7 +233,7 @@ physical microcontroller and it is skipped if it is simulating a microcontroller
 via the host ``main.cc`` or if a Zephyr emulated board is selected as the target.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 149-160
+.. GENERATED FROM PYTHON SOURCE LINES 151-162
 
 .. code-block:: default
 
@@ -253,14 +255,14 @@ via the host `main.cc`` or if a Zephyr emulated board is selected as the target.
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 161-165
+.. GENERATED FROM PYTHON SOURCE LINES 163-167
 
 Look up synset name
 -------------------
 Look up prediction top 1 index in 1000 class synset.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 165-206
+.. GENERATED FROM PYTHON SOURCE LINES 167-208
 
 .. code-block:: default
 
@@ -322,7 +324,7 @@ Look up prediction top 1 index in 1000 class synset.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  13.915 seconds)
+   **Total running time of the script:** ( 1 minutes  8.660 seconds)
 
 
 .. _sphx_glr_download_how_to_work_with_microtvm_micro_pytorch.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_reference_vm.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_reference_vm.rst.txt
deleted file mode 100644
index 9bb7c91e15..0000000000
--- a/docs/_sources/how_to/work_with_microtvm/micro_reference_vm.rst.txt
+++ /dev/null
@@ -1,185 +0,0 @@
-
-.. DO NOT EDIT. THIS FILE WAS AUTOMATICALLY GENERATED BY
-.. TVM'S MONKEY-PATCHED VERSION OF SPHINX-GALLERY. TO MAKE
-.. CHANGES, EDIT THE SOURCE PYTHON FILE:
-.. "how_to/work_with_microtvm/micro_reference_vm.py"
-
-.. only:: html
-
-    .. note::
-        :class: sphx-glr-download-link-note
-
-        This tutorial can be used interactively with Google Colab! You can also click
-        :ref:`here <sphx_glr_download_how_to_work_with_microtvm_micro_reference_vm.py>` to run the Jupyter notebook locally.
-
-        .. image:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/utilities/colab_button.svg
-            :align: center
-            :target: https://colab.research.google.com/github/apache/tvm-site/blob/asf-site/docs/_downloads/7ef06253b3d2676eb50e20a5f81ef8f9/micro_reference_vm.ipynb
-            :width: 300px
-
-.. rst-class:: sphx-glr-example-title
-
-.. _sphx_glr_how_to_work_with_microtvm_micro_reference_vm.py:
-
-
-.. _tutorial-micro-reference-vm:
-
-===================================
-microTVM Reference Virtual Machines
-===================================
-**Author**: `Andrew Reusch <ar...@octoml.ai>`_
-
-This tutorial explains how to launch microTVM Reference Virtual Machines. You can use these to
-develop on real physical hardware without needing to individually install the microTVM
-dependencies. These are also particularly useful when trying to reproduce behavior with
-microTVM, such as when filing bug reports.
-
-microTVM is the effort to allow TVM to build and execute models on bare-metal microcontrollers.
-microTVM aims to be compatible with a wide variety of SoCs and runtime environments (i.e. bare metal,
-RTOS, etc). However, some stable software environment is needed to allow developers to share and
-reproduce bugs and results. The microTVM Reference Virtual Machines are intended to provide that
-environment.
-
-How it works
-============
-
-No Virtual Machines are stored in the TVM repository--instead, the files stored in
-``apps/microtvm/reference-vm`` describe how to build VMs to the Vagrant_ VM builder tool.
-
-The Reference VMs are split into two parts:
-
-1. A Vagrant Base Box, which contains all of the stable dependencies for that platform. Build
-   scripts are stored in ``apps/microtvm/reference-vm/<platform>/base-box``. TVM committers run
-   these when a platform's "stable" dependencies change, and the generated base boxes are stored in
-   `Vagrant Cloud`_.
-2. A per-workspace VM, which users normally build using the Base Box as a starting point. Build
-   scripts are stored in ``apps/microtvm/reference-vm/<platform>`` (everything except ``base-box``).
-
-.. _Vagrant: https://vagrantup.com
-.. _Vagrant Cloud: https://app.vagrantup.com/tlcpack
-
-Setting up the VM
-=================
-
-Installing prerequisites
-------------------------
-
-A minimal set of prerequisites are needed:
-
-1. `Vagrant <https://vagrantup.com>`__
-2. A supported Virtual Machine hypervisor (**VirtualBox**, **Parallels**, or **VMWare Fusion/Workstation**).
-   `VirtualBox <https://www.virtualbox.org>`__ is a suggested free hypervisor, but please note
-   that the `VirtualBox Extension Pack`_ is required for proper USB forwarding. If using VirtualBox,
-   also consider installing the `vbguest <https://github.com/dotless-de/vagrant-vbguest>`_ plugin.
-
-.. _VirtualBox Extension Pack: https://www.virtualbox.org/wiki/Downloads#VirtualBox6.1.16OracleVMVirtualBoxExtensionPack
-
-3. If required for your hypervisor, the
-   `Vagrant provider plugin <https://github.com/hashicorp/vagrant/wiki/Available-Vagrant-Plugins#providers>`__ (or see `here <https://www.vagrantup.com/vmware>`__ for VMWare).
-
-First boot
-----------
-
-The first time you use a reference VM, you need to create the box locally and then provision it.
-
-.. code-block:: bash
-
-    # Replace zephyr with the name of a different platform, if you are not using Zephyr.
-    ~/.../tvm $ cd apps/microtvm/reference-vm/zephyr
-    # Replace <provider_name> with the name of the hypervisor you wish to use (i.e. virtualbox, parallels, vmware_desktop).
-    ~/.../tvm/apps/microtvm/reference-vm/zephyr $ vagrant up --provider=<provider_name>
-
-
-This command will take a couple of minutes to run and will require 4 to 5GB of storage on your
-machine. It does the following:
-
-1. Downloads the `microTVM base box`_ and clones it to form a new VM specific to this TVM directory.
-2. Mounts your TVM directory (and, if using ``git-subtree``, the original ``.git`` repo) into the
-   VM.
-3. Builds TVM and installs a Python virtualenv with the dependencies corresponding with your TVM
-   build.
-
-.. _microTVM base box: https://app.vagrantup.com/tlcpack/boxes/microtvm
-
-Connect Hardware to the VM
---------------------------
-
-Next, you need to configure USB passthrough to attach your physical development board to the virtual
-machine (rather than directly to your laptop's host OS).
-
-It's suggested you setup a device filter, rather than doing a one-time forward, because often the
-device may reboot during the programming process and you may, at that time, need to enable
-forwarding again. It may not be obvious to the end user when this occurs. Instructions to do that:
-
- * `VirtualBox <https://www.virtualbox.org/manual/ch03.html#usb-support>`__
- * `Parallels <https://kb.parallels.com/122993>`__
- * `VMWare Workstation <https://docs.vmware.com/en/VMware-Workstation-Pro/15.0/com.vmware.ws.using.doc/GUID-E003456F-EB94-4B53-9082-293D9617CB5A.html>`__
-
-Rebuilding TVM inside the Reference VM
---------------------------------------
-
-After the first boot, you'll need to ensure you keep the build, in ``$TVM_HOME/build-microtvm-zephyr``,
-up-to-date when you modify the C++ runtime or checkout a different revision. You can either
-re-provision the machine (``vagrant provision`` in the same directory you ran ``vagrant up`` before)
-or manually rebuild TVM yourself.
-
-Remember: the TVM ``.so`` built inside the VM is different from the one you may use on your host
-machine. This is why it's built inside the special directory ``build-microtvm-zephyr``.
-
-Logging in to the VM
---------------------
-
-The VM should be available to your host only with the hostname ``microtvm``. You can SSH to the VM
-as follows:
-
-.. code-block:: bash
-
-    $ vagrant ssh
-
-Then ``cd`` to the same path used on your host machine for TVM. For example, on Mac:
-
-.. code-block:: bash
-
-    $ cd /Users/yourusername/path/to/tvm
-
-Running tests
-=============
-
-Once the VM has been provisioned, tests can be executed using ``poetry``:
-
-.. code-block:: bash
-
-    $ cd apps/microtvm/reference-vm/zephyr
-    $ poetry run python3 ../../../../tests/micro/zephyr/test_zephyr.py --board=stm32f746g_disco
-
-If you do not have physical hardware attached, but wish to run the tests using the
-local QEMU emulator running within the VM, run the following commands instead:
-
-.. code-block:: bash
-
-    $ cd /Users/yourusername/path/to/tvm
-    $ cd apps/microtvm/reference-vm/zephyr/
-    $ poetry run pytest ../../../../tests/micro/zephyr/test_zephyr.py --board=qemu_x86
-
-
-.. _sphx_glr_download_how_to_work_with_microtvm_micro_reference_vm.py:
-
-.. only:: html
-
-  .. container:: sphx-glr-footer sphx-glr-footer-example
-
-
-    .. container:: sphx-glr-download sphx-glr-download-python
-
-      :download:`Download Python source code: micro_reference_vm.py <micro_reference_vm.py>`
-
-    .. container:: sphx-glr-download sphx-glr-download-jupyter
-
-      :download:`Download Jupyter notebook: micro_reference_vm.ipynb <micro_reference_vm.ipynb>`
-
-
-.. only:: html
-
- .. rst-class:: sphx-glr-signature
-
-    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_tflite.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_tflite.rst.txt
index 168db4ac45..f18528d02e 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_tflite.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_tflite.rst.txt
@@ -22,9 +22,9 @@
 .. _sphx_glr_how_to_work_with_microtvm_micro_tflite.py:
 
 
-.. _microTVM-with-TFLite:
+.. _tutorial_micro_tflite:
 
-microTVM with TFLite Models
+2. microTVM TFLite Tutorial
 ===========================
 **Author**: `Tom Gall <https://github.com/tom-gall>`_
 
@@ -68,7 +68,7 @@ Import Python dependencies
 -------------------------------
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 51-68
+.. GENERATED FROM PYTHON SOURCE LINES 51-73
 
 .. code-block:: default
 
@@ -79,11 +79,16 @@ Import Python dependencies
     import numpy as np
 
     import tvm
+    import tvm.micro
+    import tvm.micro.testing
     from tvm import relay
     import tvm.contrib.utils
+    from tvm.micro import export_model_library_format
     from tvm.contrib.download import download_testdata
 
-    model_url = "https://people.linaro.org/~tom.gall/sine_model.tflite"
+    model_url = (
+        "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/model/sine_model.tflite"
+    )
     model_file = "sine_model.tflite"
     model_path = download_testdata(model_url, model_file, module="data")
 
@@ -96,11 +101,11 @@ Import Python dependencies
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 69-70
+.. GENERATED FROM PYTHON SOURCE LINES 74-75
 
 Using the buffer, transform into a tflite model python object
 
-.. GENERATED FROM PYTHON SOURCE LINES 70-79
+.. GENERATED FROM PYTHON SOURCE LINES 75-84
 
 .. code-block:: default
 
@@ -120,11 +125,11 @@ Using the buffer, transform into a tflite model python object
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 80-81
+.. GENERATED FROM PYTHON SOURCE LINES 85-86
 
 Print out the version of the model
 
-.. GENERATED FROM PYTHON SOURCE LINES 81-84
+.. GENERATED FROM PYTHON SOURCE LINES 86-89
 
 .. code-block:: default
 
@@ -144,7 +149,7 @@ Print out the version of the model
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 85-93
+.. GENERATED FROM PYTHON SOURCE LINES 90-98
 
 Parse the python model object to convert it into a relay module
 and weights.
@@ -155,7 +160,7 @@ If you are unsure what that might be, this can be discovered by using
 the ``visualize.py`` script within the Tensorflow project.
 See `How do I inspect a .tflite file? <https://www.tensorflow.org/lite/guide/faq>`_
 
-.. GENERATED FROM PYTHON SOURCE LINES 93-102
+.. GENERATED FROM PYTHON SOURCE LINES 98-107
 
 .. code-block:: default
 
@@ -175,51 +180,39 @@ See `How do I inspect a .tflite file? <https://www.tensorflow.org/lite/guide/faq
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 103-112
+.. GENERATED FROM PYTHON SOURCE LINES 108-117
 
 Defining the target
 -------------------
 
 Now we create a build config for relay, turning off two options and then calling relay.build which
 will result in a C source file for the selected TARGET. When running on a simulated target of the
-same architecture as the host (where this Python script is executed) choose "host" below for the
+same architecture as the host (where this Python script is executed) choose "crt" below for the
 TARGET, the C Runtime as the RUNTIME and a proper board/VM to run it (Zephyr will create the right
 QEMU VM based on BOARD. In the example below the x86 arch is selected and a x86 VM is picked up accordingly:
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 112-142
+.. GENERATED FROM PYTHON SOURCE LINES 117-135
 
 .. code-block:: default
 
     RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": True})
-    TARGET = tvm.target.target.micro("host")
+    TARGET = tvm.micro.testing.get_target("crt")
 
-    #
-    # Compiling for physical hardware
-    #  When running on physical hardware, choose a TARGET and a BOARD that describe the hardware. The
-    #  STM32F746 Nucleo target and board is chosen in the example below. Another option would be to
-    #  choose the STM32F746 Discovery board instead. Since that board has the same MCU as the Nucleo
-    #  board but a couple of wirings and configs differ, it's necessary to select the "stm32f746g_disco"
-    #  board to generated the right firmware image.
-    #
+    # When running on physical hardware, choose a TARGET and a BOARD that describe the hardware. The
+    # STM32L4R5ZI Nucleo target and board is chosen in the example below. You could change the testing
+    # board by simply exporting `TVM_MICRO_BOARD` variable with a different Zephyr supported board.
 
     if use_physical_hw:
-        boards_file = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) / "boards.json"
-        with open(boards_file) as f:
-            boards = json.load(f)
         BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_l4r5zi")
         SERIAL = os.getenv("TVM_MICRO_SERIAL", default=None)
-        TARGET = tvm.target.target.micro(boards[BOARD]["model"])
+        TARGET = tvm.micro.testing.get_target("zephyr", BOARD)
 
+    # For some boards, Zephyr runs them emulated by default, using QEMU. For example, below is the
+    # TARGET and BOARD used to build a microTVM firmware for the mps2-an521 board.
     #
-    #  For some boards, Zephyr runs them emulated by default, using QEMU. For example, below is the
-    #  TARGET and BOARD used to build a microTVM firmware for the mps2-an521 board. Since that board
-    #  runs emulated by default on Zephyr the suffix "-qemu" is added to the board name to inform
-    #  microTVM that the QEMU transporter must be used to communicate with the board. If the board name
-    #  already has the prefix "qemu_", like "qemu_x86", then it's not necessary to add that suffix.
-    #
-    #  TARGET = tvm.target.target.micro("mps2_an521")
-    #  BOARD = "mps2_an521-qemu"
+    # `mps2_an521 = "mps2_an521"`
+    # `TARGET = tvm.micro.testing.get_target("zephyr", BOARD)`
 
 
 
@@ -228,27 +221,41 @@ QEMU VM based on BOARD. In the example below the x86 arch is selected and a x86
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 143-144
+.. GENERATED FROM PYTHON SOURCE LINES 136-138
 
-Now, compile the model for the target:
+Now, compile the model for the target. If you do not specify Executor,
+by default it uses GraphExecutor.
 
-.. GENERATED FROM PYTHON SOURCE LINES 144-233
+.. GENERATED FROM PYTHON SOURCE LINES 138-143
 
 .. code-block:: default
 
 
-    with tvm.transform.PassContext(
-        opt_level=3, config={"tir.disable_vectorize": True}, disabled_pass=["AlterOpLayout"]
-    ):
+    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
         module = relay.build(mod, target=TARGET, runtime=RUNTIME, params=params)
 
 
-    # Inspecting the compilation output
-    # ---------------------------------
-    #
-    # The compilation process has produced some C code implementing the operators in this graph. We
-    # can inspect it by printing the CSourceModule contents (for the purposes of this tutorial, let's
-    # just print the first 10 lines):
+
+
+
+
+
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 144-151
+
+Inspecting the compilation output
+---------------------------------
+
+The compilation process has produced some C code implementing the operators in this graph. We
+can inspect it by printing the CSourceModule contents (for the purposes of this tutorial, let's
+just print the first 10 lines):
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 151-163
+
+.. code-block:: default
+
 
     c_source_module = module.get_lib().imported_modules[0]
     assert c_source_module.type_key == "c", "tutorial is broken"
@@ -261,26 +268,50 @@ Now, compile the model for the target:
     print("\n".join(first_few_lines))
 
 
-    # Compiling the generated code
-    # ----------------------------
-    #
-    # Now we need to incorporate the generated C code into a project that allows us to run inference on the
-    # device. The simplest way to do this is to integrate it yourself, using microTVM's standard output format
-    # (:doc:`Model Library Format` </dev/model_library_format>`). This is a tarball with a standard layout:
 
-    # Get a temporary path where we can store the tarball (since this is running as a tutorial).
 
-    fd, model_library_format_tar_path = tempfile.mkstemp()
-    os.close(fd)
-    os.unlink(model_library_format_tar_path)
-    tvm.micro.export_model_library_format(module, model_library_format_tar_path)
 
-    with tarfile.open(model_library_format_tar_path, "r:*") as tar_f:
-        print("\n".join(f" - {m.name}" for m in tar_f.getmembers()))
 
-    # Cleanup for tutorial:
-    os.unlink(model_library_format_tar_path)
+.. rst-class:: sphx-glr-script-out
 
+ .. code-block:: none
+
+    // tvm target: c -keys=cpu -model=host
+    #define TVM_EXPORTS
+    #include "tvm/runtime/c_runtime_api.h"
+    #include "tvm/runtime/c_backend_api.h"
+    #include <math.h>
+    #include <stdbool.h>
+    #ifdef __cplusplus
+    extern "C"
+    #endif
+    TVM_DLL int32_t tvmgen_default_fused_nn_contrib_dense_pack_add(void* args, int32_t* arg_type_ids, int32_t num_args, void* out_ret_value, int32_t* out_ret_tcode, void* resource_handle) {
+
+
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 164-170
+
+Compiling the generated code
+----------------------------
+
+Now we need to incorporate the generated C code into a project that allows us to run inference on the
+device. The simplest way to do this is to integrate it yourself, using microTVM's standard output format
+model library format. This is a tarball with a standard layout.
+
+.. GENERATED FROM PYTHON SOURCE LINES 170-219
+
+.. code-block:: default
+
+
+    # Get a temporary path where we can store the tarball (since this is running as a tutorial).
+
+    temp_dir = tvm.contrib.utils.tempdir()
+    model_tar_path = temp_dir / "model.tar"
+    export_model_library_format(module, model_tar_path)
+
+    with tarfile.open(model_tar_path, "r:*") as tar_f:
+        print("\n".join(f" - {m.name}" for m in tar_f.getmembers()))
 
     # TVM also provides a standard way for embedded platforms to automatically generate a standalone
     # project, compile and flash it to a target, and communicate with it using the standard TVM RPC
@@ -296,11 +327,8 @@ Now, compile the model for the target:
     template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("crt"))
     project_options = {}  # You can use options to provide platform-specific options through TVM.
 
-    # Compiling for physical hardware (or an emulated board, like the mps_an521)
-    # --------------------------------------------------------------------------
     #  For physical hardware, you can try out the Zephyr platform by using a different template project
     #  and options:
-    #
 
     if use_physical_hw:
         template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr"))
@@ -313,7 +341,6 @@ Now, compile the model for the target:
         }
 
     # Create a temporary directory
-
     temp_dir = tvm.contrib.utils.tempdir()
     generated_project_dir = temp_dir / "generated-project"
     generated_project = tvm.micro.generate_project(
@@ -333,16 +360,6 @@ Now, compile the model for the target:
 
  .. code-block:: none
 
-    // tvm target: c -keys=cpu -model=host
-    #define TVM_EXPORTS
-    #include "tvm/runtime/c_runtime_api.h"
-    #include "tvm/runtime/c_backend_api.h"
-    #include <math.h>
-    #include <stdbool.h>
-    #ifdef __cplusplus
-    extern "C"
-    #endif
-    TVM_DLL int32_t tvmgen_default_fused_nn_dense_add(void* args, int32_t* arg_type_ids, int32_t num_args, void* out_ret_value, int32_t* out_ret_tcode, void* resource_handle) {
      - .
      - ./codegen
      - ./codegen/host
@@ -361,14 +378,14 @@ Now, compile the model for the target:
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 234-238
+.. GENERATED FROM PYTHON SOURCE LINES 220-224
 
 Next, establish a session with the simulated device and run the
 computation. The `with session` line would typically flash an attached
 microcontroller, but in this tutorial, it simply launches a subprocess
 to stand in for an attached microcontroller.
 
-.. GENERATED FROM PYTHON SOURCE LINES 238-255
+.. GENERATED FROM PYTHON SOURCE LINES 224-241
 
 .. code-block:: default
 
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
index 672f3afa01..1d82a6c586 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
@@ -22,10 +22,10 @@
 .. _sphx_glr_how_to_work_with_microtvm_micro_train.py:
 
 
-.. _microtvm-train-arduino:
+.. _tutorial-micro-train-arduino:
 
-Training Vision Models for microTVM on Arduino
-==============================================
+5. Training Vision Models for microTVM on Arduino
+=================================================
 **Author**: `Gavin Uberti <https://github.com/guberti>`_
 
 This tutorial shows how MobileNetV1 models can be trained
@@ -218,7 +218,7 @@ take about **2 minutes** to download the Stanford Cars, while COCO 2017 validati
  .. code-block:: none
 
 
-    '/tmp/tmp3fikx3rq/images/random'
+    '/tmp/tmpityrxtx9/images/random'
 
 
 
@@ -309,7 +309,7 @@ objects to other stuff? We can display some examples from our datasets using ``m
 
 
 .. image-sg:: /how_to/work_with_microtvm/images/sphx_glr_micro_train_001.png
-   :alt: [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]
+   :alt: [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]
    :srcset: /how_to/work_with_microtvm/images/sphx_glr_micro_train_001.png
    :class: sphx-glr-single-img
 
@@ -318,8 +318,8 @@ objects to other stuff? We can display some examples from our datasets using ``m
 
  .. code-block:: none
 
-    /tmp/tmp3fikx3rq/images/target contains 8144 images
-    /tmp/tmp3fikx3rq/images/random contains 5000 images
+    /tmp/tmpityrxtx9/images/target contains 8144 images
+    /tmp/tmpityrxtx9/images/random contains 5000 images
 
 
 
@@ -494,13 +494,13 @@ the time on our validation set).
  .. code-block:: none
 
     Epoch 1/3
-    328/328 - 48s - loss: 0.2346 - accuracy: 0.9202 - val_loss: 0.1551 - val_accuracy: 0.9437 - 48s/epoch - 146ms/step
+    328/328 - 47s - loss: 0.2257 - accuracy: 0.9240 - val_loss: 0.1305 - val_accuracy: 0.9554 - 47s/epoch - 142ms/step
     Epoch 2/3
-    328/328 - 44s - loss: 0.1037 - accuracy: 0.9630 - val_loss: 0.0930 - val_accuracy: 0.9611 - 44s/epoch - 134ms/step
+    328/328 - 43s - loss: 0.0952 - accuracy: 0.9624 - val_loss: 0.1304 - val_accuracy: 0.9520 - 43s/epoch - 131ms/step
     Epoch 3/3
-    328/328 - 44s - loss: 0.0693 - accuracy: 0.9751 - val_loss: 0.1008 - val_accuracy: 0.9645 - 44s/epoch - 134ms/step
+    328/328 - 43s - loss: 0.0688 - accuracy: 0.9732 - val_loss: 0.1035 - val_accuracy: 0.9698 - 43s/epoch - 131ms/step
 
-    <keras.callbacks.History object at 0x7fac2e03e690>
+    <keras.callbacks.History object at 0x7fde264f99d0>
 
 
 
@@ -633,7 +633,7 @@ Once we have set these configuration parameters, we will call ``tvm.relay.build`
 Relay model into the MLF intermediate representation. From here, we just need to call
 ``tvm.micro.generate_project`` and pass in the Arduino template project to finish compilation.
 
-.. GENERATED FROM PYTHON SOURCE LINES 440-476
+.. GENERATED FROM PYTHON SOURCE LINES 440-477
 
 .. code-block:: default
 
@@ -641,6 +641,7 @@ Relay model into the MLF intermediate representation. From here, we just need to
     import shutil
     import tflite
     import tvm
+    import tvm.micro.testing
 
     # Method to load model is different in TFLite 1 vs 2
     try:  # TFLite 2.1 and above
@@ -652,7 +653,7 @@ Relay model into the MLF intermediate representation. From here, we just need to
     mod, params = tvm.relay.frontend.from_tflite(tflite_model)
 
     # Set configuration flags to improve performance
-    target = tvm.target.target.micro("nrf52840")
+    target = tvm.micro.testing.get_target("zephyr", "nrf5340dk_nrf5340_cpuapp")
     runtime = tvm.relay.backend.Runtime("crt")
     executor = tvm.relay.backend.Executor("aot", {"unpacked-api": True})
 
@@ -680,7 +681,7 @@ Relay model into the MLF intermediate representation. From here, we just need to
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 477-518
+.. GENERATED FROM PYTHON SOURCE LINES 478-519
 
 Testing our Arduino Project
 ---------------------------
@@ -724,7 +725,7 @@ We can do both of these things with a few lines of Bash code:
       stream ~/tests/catan_64.png ~/tests/catan.raw
       bin2c -c -st ~/tests/catan.raw --name CATAN_IMAGE > ~/models/project/catan.c
 
-.. GENERATED FROM PYTHON SOURCE LINES 520-560
+.. GENERATED FROM PYTHON SOURCE LINES 521-561
 
 Writing our Arduino Script
 --------------------------
@@ -767,7 +768,7 @@ compile and flash commands underneath. We could also begin autotuning our model,
 subject for a different tutorial. To finish up, we'll verify no compiler errors are thrown
 by our project:
 
-.. GENERATED FROM PYTHON SOURCE LINES 560-565
+.. GENERATED FROM PYTHON SOURCE LINES 561-566
 
 .. code-block:: default
 
@@ -789,7 +790,7 @@ by our project:
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 571-578
+.. GENERATED FROM PYTHON SOURCE LINES 572-579
 
 Uploading to Our Device
 -----------------------
@@ -799,7 +800,7 @@ simple enough to do - we'll just turn our project into a `.zip` archive, and cal
 If you're running on Google Colab, you'll have to uncomment the last two lines to download the file
 after writing it.
 
-.. GENERATED FROM PYTHON SOURCE LINES 578-585
+.. GENERATED FROM PYTHON SOURCE LINES 579-586
 
 .. code-block:: default
 
@@ -817,7 +818,7 @@ after writing it.
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 606-640
+.. GENERATED FROM PYTHON SOURCE LINES 607-641
 
 From here, we'll need to open it in the Arduino IDE. You'll have to download the IDE as well as
 the SDK for whichever board you are using. For certain boards like the Sony SPRESENSE, you may
@@ -857,7 +858,7 @@ Arduino tutorial for how to do that `on GitHub <https://github.com/guberti/tvm-a
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 4 minutes  39.782 seconds)
+   **Total running time of the script:** ( 4 minutes  43.679 seconds)
 
 
 .. _sphx_glr_download_how_to_work_with_microtvm_micro_train.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_tvmc.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_tvmc.rst.txt
index dada1d652a..e04d151510 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_tvmc.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_tvmc.rst.txt
@@ -22,33 +22,28 @@
 .. _sphx_glr_how_to_work_with_microtvm_micro_tvmc.py:
 
 
-.. _tutorial-micro-tvmc:
+.. _tutorial-micro-cli-tool:
 
-Executing a Tiny Model with TVMC Micro
-======================================
+1. microTVM CLI Tool
+====================
 **Author**: `Mehrdad Hessar <https://github.com/mehrdadh>`_
 
 This tutorial explains how to compile a tiny model for a micro device,
 build a program on Zephyr platform to execute this model, flash the program
 and run the model all using `tvmc micro` command.
+You need to install python and Zephyr dependencies before processing with this tutorial.
 
-.. GENERATED FROM PYTHON SOURCE LINES 31-44
+.. GENERATED FROM PYTHON SOURCE LINES 32-34
 
-.. note::
-    This tutorial is explaining using TVMC Mirco on Zephyr platform. You need
-    to install Zephyr dependencies before processing with this tutorial. Alternatively,
-    you can run this tutorial in one of the following ways which has Zephyr depencencies already installed.
+.. include:: ../../../../gallery/how_to/work_with_microtvm/install_dependencies.rst
 
-    * Use `microTVM Reference Virtual Machines <https://tvm.apache.org/docs/how_to/work_with_microtvm/micro_reference_vm.html#sphx-glr-how-to-work-with-microtvm-micro-reference-vm-py>`_.
-    * Use QEMU docker image provided by TVM. Following these you will download and login to the docker image:
 
-    .. code-block:: bash
+.. GENERATED FROM PYTHON SOURCE LINES 37-39
 
-      cd tvm
-      ./docker/bash.sh tlcpack/ci-qemu
+.. include:: ../../../../gallery/how_to/work_with_microtvm/install_zephyr.rst
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 47-74
+.. GENERATED FROM PYTHON SOURCE LINES 43-70
 
 Using TVMC Micro
 ###########################################################
@@ -78,7 +73,7 @@ Using TVMC Micro
  ``tvmc micro <subcommand> --help``. We will use each subcommand in this tutorial.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 76-88
+.. GENERATED FROM PYTHON SOURCE LINES 72-84
 
 Obtain a Tiny Model
 ###########################################################
@@ -93,14 +88,14 @@ Obtain a Tiny Model
 	  wget https://github.com/tensorflow/tflite-micro/raw/main/tensorflow/lite/micro/examples/magic_wand/magic_wand.tflite
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 90-116
+.. GENERATED FROM PYTHON SOURCE LINES 86-112
 
 Compiling a TFLite model to a Model Library Format
 ###########################################################
 
  Model Library Format (MLF) is an output format that TVM provides for micro targets. MLF is a tarball
  containing a file for each piece of the TVM compiler output which can be used on micro targets outside
- TVM environment. Read more about `Model Library Format <https://tvm.apache.org/docs//arch/model_library_format.html>`_.
+ TVM environment. Read more about :ref:`Model Library Format <model_library_format>`.
 
  Here, we generate a MLF file for ``qemu_x86`` Zephyr board. To generate MLF output for the ``magic_wand`` tflite model:
 
@@ -122,7 +117,7 @@ Compiling a TFLite model to a Model Library Format
  the target is ``--target='c -keys=cpu -model=nrf5340dk'``.
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 119-146
+.. GENERATED FROM PYTHON SOURCE LINES 115-142
 
 Create a Zephyr Project Using Model Library Format
 ###########################################################
@@ -152,7 +147,7 @@ Create a Zephyr Project Using Model Library Format
      tvmc micro create --help
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 148-171
+.. GENERATED FROM PYTHON SOURCE LINES 144-167
 
 Build and Flash Zephyr Project Using TVMC Micro
 ###########################################################
@@ -178,7 +173,7 @@ Build and Flash Zephyr Project Using TVMC Micro
 	      zephyr
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 173-199
+.. GENERATED FROM PYTHON SOURCE LINES 169-186
 
 Run Tiny Model on Micro Target
 ###########################################################
@@ -197,15 +192,21 @@ Run Tiny Model on Micro Target
 	      --fill-mode ones \
 	      --print-top 4
 
+
+.. GENERATED FROM PYTHON SOURCE LINES 188-200
+
+Specifically, this command sets the input of the model
+to all ones and shows the four values of the output with their indices.
+
+.. code-block:: bash
+
      # Output:
-     #
      # INFO:__main__:b'[100%] [QEMU] CPU: qemu32,+nx,+pae\n'
      # remote: microTVM Zephyr runtime - running
      # INFO:__main__:b'[100%] Built target run\n'
      # [[3.         1.         2.         0.        ]
      # [0.47213247 0.41364592 0.07525456 0.03896701]]
 
- Specifically, this command sets the input of the model to all ones and shows the four values of the output with their indices.
 
 
 .. _sphx_glr_download_how_to_work_with_microtvm_micro_tvmc.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
index bbf852eee5..c10da8214d 100644
--- a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
@@ -5,24 +5,22 @@
 
 Computation times
 =================
-**07:01.984** total execution time for **how_to_work_with_microtvm** files:
+**06:56.797** total execution time for **how_to_work_with_microtvm** files:
 
-+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``)               | 04:39.782 | 0.0 MB |
-+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_pytorch.py` (``micro_pytorch.py``)           | 01:13.915 | 0.0 MB |
-+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``)         | 00:54.775 | 0.0 MB |
-+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``)                   | 00:09.501 | 0.0 MB |
-+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``)             | 00:04.010 | 0.0 MB |
-+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_ethosu.py` (``micro_ethosu.py``)             | 00:00.000 | 0.0 MB |
-+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tvmc.py` (``micro_tvmc.py``)                 | 00:00.000 | 0.0 MB |
-+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_reference_vm.py` (``micro_reference_vm.py``) | 00:00.000 | 0.0 MB |
-+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_mlperftiny.py` (``micro_mlperftiny.py``)     | 00:00.000 | 0.0 MB |
-+---------------------------------------------------------------------------------------------+-----------+--------+
++-----------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``)           | 04:43.679 | 0.0 MB |
++-----------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_pytorch.py` (``micro_pytorch.py``)       | 01:08.660 | 0.0 MB |
++-----------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``)     | 00:51.628 | 0.0 MB |
++-----------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``)               | 00:07.687 | 0.0 MB |
++-----------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``)         | 00:05.142 | 0.0 MB |
++-----------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_ethosu.py` (``micro_ethosu.py``)         | 00:00.000 | 0.0 MB |
++-----------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tvmc.py` (``micro_tvmc.py``)             | 00:00.000 | 0.0 MB |
++-----------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_mlperftiny.py` (``micro_mlperftiny.py``) | 00:00.000 | 0.0 MB |
++-----------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
index fdaaa3b7e7..85513ba21d 100644
--- a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
 
 Computation times
 =================
-**00:46.425** total execution time for **how_to_work_with_relay** files:
+**00:44.351** total execution time for **how_to_work_with_relay** files:
 
 +----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:34.195 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:32.340 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``)           | 00:10.607 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``)           | 00:10.443 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``)                             | 00:01.616 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``)                             | 00:01.562 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_how_to_work_with_relay_using_relay_viz.py` (``using_relay_viz.py``)                 | 00:00.006 | 0.0 MB |
 +----------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
index 46dbaf5548..6b1eaa15fe 100644
--- a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
@@ -264,7 +264,7 @@ The following example customizes CUDA lowering rule for :code:`exp`.
  .. code-block:: none
 
 
-    <function my_cuda_math_rule at 0x7faa1d56b3b0>
+    <function my_cuda_math_rule at 0x7fdcc1943950>
 
 
 
diff --git a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
index 5a04ef92d9..9b0d7b3292 100644
--- a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
@@ -5,22 +5,22 @@
 
 Computation times
 =================
-**00:08.154** total execution time for **how_to_work_with_schedules** files:
+**00:06.021** total execution time for **how_to_work_with_schedules** files:
 
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``)                 | 00:05.566 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``)                 | 00:03.536 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``)                     | 00:01.184 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``)                     | 00:01.128 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``)                     | 00:00.594 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``)                     | 00:00.575 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``)                               | 00:00.577 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``)                               | 00:00.555 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``)                     | 00:00.119 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``)                     | 00:00.120 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``) | 00:00.054 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``) | 00:00.051 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``)                               | 00:00.035 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``)                               | 00:00.032 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``)               | 00:00.025 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``)               | 00:00.024 | 0.0 MB |
 +------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
index 7440639f4e..6d804d265a 100644
--- a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
@@ -340,7 +340,7 @@ The importing needs to happen before the tensorized GEMV being executed.
         def main(A: T.Buffer((1024, 64), "float32"), B: T.Buffer((512, 64), "float32"), C: T.Buffer((1024, 512), "float32")):
             T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
             i = T.var("int32")
-            T.attr(T.iter_var(i, None, "DataPar", ""), "pragma_import_llvm", "; ModuleID = '/tmp/tmp33il914m/input0.cc'\nsource_filename = \"/tmp/tmp33il914m/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = alloca float*, align 8\n  %8 = alloca float*, align 8\n  %9 = alloca float*, [...]
+            T.attr(T.iter_var(i, None, "DataPar", ""), "pragma_import_llvm", "; ModuleID = '/tmp/tmpw4ds75gv/input0.cc'\nsource_filename = \"/tmp/tmpw4ds75gv/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n  %7 = alloca float*, align 8\n  %8 = alloca float*, align 8\n  %9 = alloca float*, [...]
             for i, j_outer in T.grid(1024, 32):
                 T.call_extern("int32", "gemv_update", T.tvm_access_ptr(T.type_annotation("float32"), C.data, i * 512 + j_outer * 16, 16, 2), T.tvm_access_ptr(T.type_annotation("float32"), A.data, i * 64, 64, 1), T.tvm_access_ptr(T.type_annotation("float32"), B.data, j_outer * 1024, 1024, 1), 16, 64, 64)
 
diff --git a/docs/_sources/topic/microtvm/index.rst.txt b/docs/_sources/topic/microtvm/index.rst.txt
index ebcadb3442..4dd4ab5d51 100644
--- a/docs/_sources/topic/microtvm/index.rst.txt
+++ b/docs/_sources/topic/microtvm/index.rst.txt
@@ -50,13 +50,12 @@ Getting Started with microTVM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Before working with microTVM, we recommend you have a supported development board. Then, follow these
-tutorials to get started with microTVM:
+tutorials to get started with microTVM. Tutorials are in the order that could help developers to learn
+more as they follow through them. Here is a list of tutorials that you can start with:
 
-1. :ref:`Start the microTVM Reference VM <tutorial-micro-reference-vm>`. The microTVM tutorials
-   depend on Zephyr and on a compiler toolchain for your hardware. The reference VM is a convenient
-   way to install those dependencies.
-2. Try the :ref:`microTVM with TFLite Tutorial <microTVM-with-TFLite>`.
-3. Try running a more complex `CIFAR10-CNN model <https://github.com/areusch/microtvm-blogpost-eval>`_.
+1. Try :ref:`microTVM CLI Tool <tutorial-micro-cli-tool>`.
+2. Try the :ref:`microTVM TFLite Tutorial <tutorial_micro_tflite>`.
+3. Try running a more complex tutorial: :ref:`Creating Your MLPerfTiny Submission with microTVM <tutorial-micro-mlperftiny>`.
 
 
 How microTVM Works
diff --git a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
index 6f9e882565..a3b1f6e391 100644
--- a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:32.272** total execution time for **topic_vta_tutorials_autotvm** files:
+**00:29.826** total execution time for **topic_vta_tutorials_autotvm** files:
 
 +---------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:32.265 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:29.820 | 0.0 MB |
 +---------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``)     | 00:00.007 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``)     | 00:00.006 | 0.0 MB |
 +---------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
index 97decbc7af..cdaa3d3fae 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
@@ -293,7 +293,7 @@ The compilation steps are:
       DeprecationWarning,
     /workspace/vta/tutorials/frontend/deploy_classification.py:213: DeprecationWarning: legacy graph executor behavior of producing json / lib / params will be removed in the next release. Please see documents of tvm.contrib.graph_executor.GraphModule for the  new recommended usage.
       relay_prog, target=tvm.target.Target(target, host=env.target_host), params=params
-    resnet18_v1 inference graph built in 34.73s!
+    resnet18_v1 inference graph built in 31.69s!
 
 
 
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
index 79210f388e..63c69133c3 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
@@ -337,7 +337,7 @@ The compilation steps are:
 
     /workspace/python/tvm/relay/build_module.py:348: DeprecationWarning: Please use input parameter mod (tvm.IRModule) instead of deprecated parameter mod (tvm.relay.function.Function)
       DeprecationWarning,
-    yolov3-tiny inference graph built in 23.52s!
+    yolov3-tiny inference graph built in 21.73s!
 
 
 
diff --git a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
index 1fd7aee932..a91ec7d9e6 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**01:42.436** total execution time for **topic_vta_tutorials_frontend** files:
+**01:37.597** total execution time for **topic_vta_tutorials_frontend** files:
 
 +------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:51.718 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``)           | 00:49.094 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``)           | 00:50.718 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:48.503 | 0.0 MB |
 +------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
index d6ce5bb8e3..018ee15362 100644
--- a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:03.286** total execution time for **topic_vta_tutorials_optimize** files:
+**00:03.150** total execution time for **topic_vta_tutorials_optimize** files:
 
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``)         | 00:02.796 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``)         | 00:02.685 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.490 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.465 | 0.0 MB |
 +--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
index a2d9b61f48..16429d3abc 100644
--- a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
 
 Computation times
 =================
-**00:00.862** total execution time for **topic_vta_tutorials** files:
+**00:00.779** total execution time for **topic_vta_tutorials** files:
 
 +---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.460 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.402 | 0.0 MB |
 +---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.402 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.377 | 0.0 MB |
 +---------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
index ec629117be..5f11df23f7 100644
--- a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
@@ -207,6 +207,13 @@ trials, we can load the best schedule from the log file and apply it.
 
 
 
+.. rst-class:: sphx-glr-script-out
+
+ .. code-block:: none
+
+    *E
+
+
 
 
 
@@ -318,7 +325,7 @@ We build the binary and check its correctness and performance.
 
  .. code-block:: none
 
-    Execution time of this operator: 97.638 ms
+    Execution time of this operator: 94.307 ms
 
 
 
@@ -418,7 +425,7 @@ resume the status and do more 5 trials.
     Resume search:
     /venv/apache-tvm-py3.7/lib/python3.7/site-packages/xgboost/training.py:17: UserWarning: Old style callback is deprecated.  See: https://xgboost.readthedocs.io/en/latest/python/callbacks.html
       warnings.warn(f'Old style callback is deprecated.  See: {link}', UserWarning)
-
+    *E
 
 
 
@@ -436,7 +443,7 @@ operations.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  34.012 seconds)
+   **Total running time of the script:** ( 1 minutes  54.264 seconds)
 
 
 .. _sphx_glr_download_tutorial_auto_scheduler_matmul_x86.py:
diff --git a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
index 97278386d3..c481165c02 100644
--- a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
@@ -454,16 +454,16 @@ reduce variance, we take 5 measurements and average them.
     waiting for device...
     device available
     Get devices for measurement successfully!
-    No: 1   GFLOPS: 1.74/1.74       result: MeasureResult(costs=(0.154085187,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.704826593399048, timestamp=1674848126.1297917) [('tile_y', [-1, 512]), ('tile_x', [-1, 4])],None,29
-    No: 2   GFLOPS: 3.27/3.27       result: MeasureResult(costs=(0.0820791094,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.5852487087249756, timestamp=1674848127.7140095)       [('tile_y', [-1, 64]), ('tile_x', [-1, 8])],None,36
-    No: 3   GFLOPS: 1.59/3.27       result: MeasureResult(costs=(0.1685287386,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.945835828781128, timestamp=1674848131.4821894)        [('tile_y', [-1, 32]), ('tile_x', [-1, 4])],None,25
-    No: 4   GFLOPS: 10.93/10.93     result: MeasureResult(costs=(0.024559538,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.661290168762207, timestamp=1674848132.9526696) [('tile_y', [-1, 4]), ('tile_x', [-1, 128])],None,72
-    No: 5   GFLOPS: 0.94/10.93      result: MeasureResult(costs=(0.286682917,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.836069822311401, timestamp=1674848137.9160361) [('tile_y', [-1, 32]), ('tile_x', [-1, 2])],None,15
-    No: 6   GFLOPS: 12.67/12.67     result: MeasureResult(costs=(0.02118379,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5966391563415527, timestamp=1674848139.3248727) [('tile_y', [-1, 8]), ('tile_x', [-1, 512])],None,93
-    No: 7   GFLOPS: 1.75/12.67      result: MeasureResult(costs=(0.1529695782,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.678224563598633, timestamp=1674848142.027855) [('tile_y', [-1, 4]), ('tile_x', [-1, 1])],None,2
-    No: 8   GFLOPS: 0.51/12.67      result: MeasureResult(costs=(0.5278115134,), error_no=MeasureErrorNo.NO_ERROR, all_cost=8.686321258544922, timestamp=1674848150.7280724)        [('tile_y', [-1, 128]), ('tile_x', [-1, 1])],None,7
-    No: 9   GFLOPS: 3.00/12.67      result: MeasureResult(costs=(0.089538356,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.6537797451019287, timestamp=1674848152.5550673)        [('tile_y', [-1, 2]), ('tile_x', [-1, 16])],None,41
-    No: 10  GFLOPS: 11.46/12.67     result: MeasureResult(costs=(0.023432478,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6591908931732178, timestamp=1674848153.1913335)        [('tile_y', [-1, 64]), ('tile_x', [-1, 32])],None,56
+    No: 1   GFLOPS: 1.74/1.74       result: MeasureResult(costs=(0.1543798046,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.699293375015259, timestamp=1674861808.6912024)        [('tile_y', [-1, 4]), ('tile_x', [-1, 1])],None,2
+    No: 2   GFLOPS: 10.61/10.61     result: MeasureResult(costs=(0.025300943799999996,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6451883316040039, timestamp=1674861810.1005502)       [('tile_y', [-1, 512]), ('tile_x', [-1, 256])],None,89
+    No: 3   GFLOPS: 12.71/12.71     result: MeasureResult(costs=(0.021127269799999997,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5675485134124756, timestamp=1674861811.4439735)       [('tile_y', [-1, 32]), ('tile_x', [-1, 512])],None,95
+    No: 4   GFLOPS: 10.68/12.71     result: MeasureResult(costs=(0.0251274182,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6238992214202881, timestamp=1674861812.100912)        [('tile_y', [-1, 1]), ('tile_x', [-1, 512])],None,90
+    No: 5   GFLOPS: 10.56/12.71     result: MeasureResult(costs=(0.0254106022,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6388294696807861, timestamp=1674861812.9238553)       [('tile_y', [-1, 512]), ('tile_x', [-1, 512])],None,99
+    No: 6   GFLOPS: 12.79/12.79     result: MeasureResult(costs=(0.020995783,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5954904556274414, timestamp=1674861813.5185819)        [('tile_y', [-1, 128]), ('tile_x', [-1, 256])],None,87
+    No: 7   GFLOPS: 2.17/12.79      result: MeasureResult(costs=(0.12367873780000001,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.2109549045562744, timestamp=1674861816.4984136)        [('tile_y', [-1, 4]), ('tile_x', [-1, 2])],None,12
+    No: 8   GFLOPS: 9.17/12.79      result: MeasureResult(costs=(0.0292765938,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7544360160827637, timestamp=1674861817.2192068)       [('tile_y', [-1, 16]), ('tile_x', [-1, 32])],None,54
+    No: 9   GFLOPS: 10.42/12.79     result: MeasureResult(costs=(0.025759962600000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.698195219039917, timestamp=1674861818.029947) [('tile_y', [-1, 1]), ('tile_x', [-1, 64])],None,60
+    No: 10  GFLOPS: 1.60/12.79      result: MeasureResult(costs=(0.16801652620000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.8946549892425537, timestamp=1674861820.9771154)        [('tile_y', [-1, 2]), ('tile_x', [-1, 2])],None,11
 
 
 
diff --git a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
index bb83b79ec5..f997373dd7 100644
--- a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
@@ -311,7 +311,7 @@ standard deviation.
 
  .. code-block:: none
 
-    {'mean': 519.8183306999988, 'median': 519.4112837500029, 'std': 2.049956864285323}
+    {'mean': 513.3865644599973, 'median': 513.2042234999972, 'std': 1.6686009692921546}
 
 
 
@@ -545,30 +545,29 @@ the tuning data to.
 
  .. code-block:: none
 
-
    [Task  1/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  1/25]  Current/Best:   10.95/  18.19 GFLOPS | Progress: (4/20) | 7.15 s
    [Task  1/25]  Current/Best:    6.92/  18.19 GFLOPS | Progress: (8/20) | 12.74 s
    [Task  1/25]  Current/Best:   18.42/  18.42 GFLOPS | Progress: (12/20) | 14.97 s
    [Task  1/25]  Current/Best:    3.20/  18.42 GFLOPS | Progress: (16/20) | 18.35 s
    [Task  1/25]  Current/Best:   17.82/  22.28 GFLOPS | Progress: (20/20) | 20.59 s Done.
-
    [Task  2/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  2/25]  Current/Best:    8.23/  17.58 GFLOPS | Progress: (4/20) | 3.61 s
    [Task  2/25]  Current/Best:   12.77/  17.58 GFLOPS | Progress: (8/20) | 5.30 s
    [Task  2/25]  Current/Best:   16.03/  17.58 GFLOPS | Progress: (12/20) | 7.55 s
    [Task  2/25]  Current/Best:   13.94/  17.58 GFLOPS | Progress: (16/20) | 10.00 s
    [Task  2/25]  Current/Best:   12.98/  17.58 GFLOPS | Progress: (20/20) | 11.57 s Done.
-
    [Task  3/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  3/25]  Current/Best:   15.21/  18.92 GFLOPS | Progress: (4/20) | 3.93 s
    [Task  3/25]  Current/Best:   15.54/  18.92 GFLOPS | Progress: (8/20) | 6.33 s
    [Task  3/25]  Current/Best:   21.68/  21.68 GFLOPS | Progress: (12/20) | 8.58 s
    [Task  3/25]  Current/Best:   10.53/  21.68 GFLOPS | Progress: (16/20) | 10.79 s
    [Task  3/25]  Current/Best:   13.29/  23.87 GFLOPS | Progress: (20/20) | 14.87 s Done.
-
    [Task  4/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  4/25]  Current/Best:   12.17/  21.36 GFLOPS | Progress: (4/20) | 9.31 s
    [Task  4/25]  Current/Best:   16.52/  21.36 GFLOPS | Progress: (8/20) | 14.60 s
    [Task  4/25]  Current/Best:   13.29/  21.36 GFLOPS | Progress: (12/20) | 17.56 s
    [Task  4/25]  Current/Best:   12.41/  21.36 GFLOPS | Progress: (16/20) | 20.99 s
    [Task  4/25]  Current/Best:   13.66/  21.36 GFLOPS | Progress: (20/20) | 28.48 s Done.
-
    [Task  5/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  5/25]  Current/Best:   17.86/  20.12 GFLOPS | Progress: (4/20) | 3.53 s
    [Task  5/25]  Current/Best:   10.24/  20.12 GFLOPS | Progress: (8/20) | 6.19 s
    [Task  5/25]  Current/Best:   13.07/  20.12 GFLOPS | Progress: (12/20) | 8.16 s
    [Task  5/25]  Current/Best:   13.64/  20.12 GFLOPS | Progress: (16/20) | 10.05 s
    [Task  5/25]  Current/Best:   11.14/  20.12 GFLOPS | Progress: (20/20) | 12.56 s Done.
-
    [Task  6/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  6/25]  Current/Best:    4.20/  17.77 GFLOPS | Progress: (4/20) | 4.71 s
    [Task  6/25]  Current/Best:    5.69/  17.87 GFLOPS | Progress: (8/20) | 7.25 s
    [Task  6/25]  Current/Best:   12.22/  17.87 GFLOPS | Progress: (12/20) | 10.04 s
    [Task  6/25]  Current/Best:    5.95/  17.87 GFLOPS | Progress: (16/20) | 12.47 s
    [Task  6/25]  Current/Best:   10.43/  18.05 GFLOPS | Progress: (20/20) | 16.67 s Done.
-
    [Task  7/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  7/25]  Current/Best:   13.49/  14.31 GFLOPS | Progress: (4/20) | 4.44 s
    [Task  7/25]  Current/Best:   13.18/  14.92 GFLOPS | Progress: (8/20) | 6.99 s
    [Task  7/25]  Current/Best:    8.19/  14.92 GFLOPS | Progress: (12/20) | 10.67 s
    [Task  7/25]  Current/Best:   12.48/  20.51 GFLOPS | Progress: (16/20) | 12.75 s
    [Task  7/25]  Current/Best:   18.86/  20.51 GFLOPS | Progress: (20/20) | 15.58 s Done.
-
    [Task  8/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  8/25]  Current/Best:   15.25/  21.37 GFLOPS | Progress: (4/20) | 6.82 s
    [Task  8/25]  Current/Best:    5.49/  21.37 GFLOPS | Progress: (8/20) | 9.81 s
    [Task  8/25]  Current/Best:    8.51/  21.37 GFLOPS | Progress: (12/20) | 21.34 s
    [Task  8/25]  Current/Best:    7.92/  21.37 GFLOPS | Progress: (16/20) | 27.01 s
    [Task  8/25]  Current/Best:   17.95/  21.37 GFLOPS | Progress: (20/20) | 29.18 s
    [Task  9/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  9/25]  Current/Best:   11.81/  18.03 GFLOPS | Progress: (4/20) | 7.85 s
    [Task  9/25]  Current/Best:   17.84/  22.90 GFLOPS | Progress: (8/20) | 10.81 s
    [Task  9/25]  Current/Best:   17.77/  22.90 GFLOPS | Progress: (12/20) | 12.42 s
    [Task  9/25]  Current/Best:    9.09/  22.90 GFLOPS | Progress: (16/20) | 15.16 s
    [Task  9/25]  Current/Best:   10.86/  22.90 GFLOPS | Progress: (20/2
 0) | 18.20 s Done.
-
    [Task 10/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 10/25]  Current/Best:   19.08/  19.08 GFLOPS | Progress: (4/20) | 4.14 s
    [Task 10/25]  Current/Best:   11.03/  19.08 GFLOPS | Progress: (8/20) | 6.85 s
    [Task 10/25]  Current/Best:    9.60/  20.37 GFLOPS | Progress: (12/20) | 11.43 s
    [Task 10/25]  Current/Best:   11.51/  20.37 GFLOPS | Progress: (16/20) | 14.11 s
    [Task 10/25]  Current/Best:   11.49/  20.37 GFLOPS | Progress: (20/20) | 16.40 s Done.
-
    [Task 11/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 11/25]  Current/Best:   19.69/  20.80 GFLOPS | Progress: (4/20) | 3.85 s
    [Task 11/25]  Current/Best:   13.21/  20.80 GFLOPS | Progress: (8/20) | 6.30 s
    [Task 11/25]  Current/Best:   15.75/  22.36 GFLOPS | Progress: (12/20) | 9.03 s
    [Task 11/25]  Current/Best:    8.14/  22.36 GFLOPS | Progress: (16/20) | 11.52 s
    [Task 11/25]  Current/Best:    9.44/  22.36 GFLOPS | Progress: (20/20) | 13.91 s Done.
-
    [Task 12/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 12/25]  Current/Best:   12.58/  13.93 GFLOPS | Progress: (4/20) | 4.57 s
    [Task 12/25]  Current/Best:   17.53/  21.44 GFLOPS | Progress: (8/20) | 7.19 s
    [Task 12/25]  Current/Best:    3.53/  21.44 GFLOPS | Progress: (12/20) | 10.63 s
    [Task 12/25]  Current/Best:    2.72/  21.44 GFLOPS | Progress: (16/20) | 13.84 s
    [Task 12/25]  Current/Best:   20.54/  21.44 GFLOPS | Progress: (20/20) | 16.02 s Done.
-
    [Task 13/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 13/25]  Current/Best:   11.74/  11.74 GFLOPS | Progress: (4/20) | 5.99 s
    [Task 13/25]  Current/Best:    3.10/  16.24 GFLOPS | Progress: (8/20) | 9.55 s
    [Task 13/25]  Current/Best:    9.09/  16.24 GFLOPS | Progress: (12/20) | 13.85 s
    [Task 13/25]  Current/Best:    6.33/  18.36 GFLOPS | Progress: (16/20) | 16.20 s
    [Task 13/25]  Current/Best:    7.55/  18.36 GFLOPS | Progress: (20/20) | 20.83 s Done.
-
    [Task 14/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 14/25]  Current/Best:    7.75/  14.83 GFLOPS | Progress: (4/20) | 8.40 s
    [Task 14/25]  Current/Best:    7.28/  14.83 GFLOPS | Progress: (8/20) | 10.56 s
    [Task 14/25]  Current/Best:    5.11/  14.83 GFLOPS | Progress: (12/20) | 13.16 s
    [Task 14/25]  Current/Best:    9.49/  21.27 GFLOPS | Progress: (16/20) | 15.61 s Done.
-
    [Task 14/25]  Current/Best:    8.56/  21.27 GFLOPS | Progress: (20/20) | 18.15 s
    [Task 15/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 15/25]  Current/Best:   18.10/  18.10 GFLOPS | Progress: (4/20) | 4.29 s
    [Task 15/25]  Current/Best:    6.03/  22.31 GFLOPS | Progress: (8/20) | 7.69 s
    [Task 15/25]  Current/Best:   17.97/  22.31 GFLOPS | Progress: (12/20) | 9.44 s
    [Task 15/25]  Current/Best:   19.29/  22.31 GFLOPS | Progress: (16/20) | 11.06 s
    [Task 15/25]  Current/Best:    9.79/  22.31 GFLOPS | Progress: (20/20) | 14.78 s
    [Task 16/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 16/25]  Current/Best:   18.52/  18.66 GFLOPS | Progress: (4/20) | 3.55 s
    [Task 16/25]  Current/Best:   13.27/  18.66 GFLOPS | Progress: (8/20) | 5.71 s
    [Task 16/25]  Current/Best:   19.28/  19.28 GFLOPS | Progress: (12/20) | 7.91 s
    [Task 16/25]  Current/Best:   16.03/  20.62 GFLOPS | Progress: (16/20) 
 | 9.49 s
    [Task 16/25]  Current/Best:   13.09/  20.62 GFLOPS | Progress: (20/20) | 11.13 s Done.
-
    [Task 17/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 17/25]  Current/Best:   18.71/  18.71 GFLOPS | Progress: (4/20) | 5.29 s
    [Task 17/25]  Current/Best:   22.07/  22.07 GFLOPS | Progress: (8/20) | 8.24 s
    [Task 17/25]  Current/Best:   22.75/  23.16 GFLOPS | Progress: (12/20) | 11.01 s
    [Task 17/25]  Current/Best:   12.18/  23.16 GFLOPS | Progress: (16/20) | 14.18 s
    [Task 17/25]  Current/Best:   13.37/  23.16 GFLOPS | Progress: (20/20) | 16.89 s Done.
-
    [Task 18/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 18/25]  Current/Best:   10.14/  14.74 GFLOPS | Progress: (4/20) | 6.50 s
    [Task 18/25]  Current/Best:   16.35/  16.97 GFLOPS | Progress: (8/20) | 10.94 s
    [Task 18/25]  Current/Best:   16.59/  18.85 GFLOPS | Progress: (12/20) | 13.31 s
    [Task 18/25]  Current/Best:   19.63/  19.63 GFLOPS | Progress: (16/20) | 17.52 s
    [Task 18/25]  Current/Best:   13.04/  19.63 GFLOPS | Progress: (20/20) | 21.32 s Done.
-
    [Task 19/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 19/25]  Current/Best:    4.91/  16.98 GFLOPS | Progress: (4/20) | 5.36 s
    [Task 19/25]  Current/Best:   21.95/  21.95 GFLOPS | Progress: (8/20) | 9.52 s
    [Task 19/25]  Current/Best:    7.22/  21.95 GFLOPS | Progress: (12/20) | 13.31 s
    [Task 19/25]  Current/Best:   15.80/  21.95 GFLOPS | Progress: (16/20) | 16.41 s
    [Task 19/25]  Current/Best:   19.23/  21.95 GFLOPS | Progress: (20/20) | 19.18 s Done.
-
    [Task 20/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 20/25]  Current/Best:   11.09/  19.16 GFLOPS | Progress: (4/20) | 3.74 s
    [Task 20/25]  Current/Best:    8.80/  19.16 GFLOPS | Progress: (8/20) | 8.55 s
    [Task 20/25]  Current/Best:   13.06/  20.21 GFLOPS | Progress: (12/20) | 11.64 s
    [Task 20/25]  Current/Best:   13.39/  20.21 GFLOPS | Progress: (16/20) | 14.93 s
    [Task 20/25]  Current/Best:   15.39/  20.21 GFLOPS | Progress: (20/20) | 18.19 s
    [Task 21/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s Done.
+
    [Task  1/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  1/25]  Current/Best:    9.84/  19.18 GFLOPS | Progress: (4/20) | 7.58 s
    [Task  1/25]  Current/Best:   17.74/  19.18 GFLOPS | Progress: (8/20) | 10.67 s
    [Task  1/25]  Current/Best:   12.43/  19.18 GFLOPS | Progress: (12/20) | 13.44 s
    [Task  1/25]  Current/Best:   21.33/  21.33 GFLOPS | Progress: (16/20) | 16.71 s
    [Task  1/25]  Current/Best:    9.61/  21.33 GFLOPS | Progress: (20/20) | 20.69 s Done.
+
    [Task  2/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  2/25]  Current/Best:    9.56/  15.12 GFLOPS | Progress: (4/20) | 3.26 s
    [Task  2/25]  Current/Best:   10.00/  15.19 GFLOPS | Progress: (8/20) | 5.38 s
    [Task  2/25]  Current/Best:   13.79/  20.23 GFLOPS | Progress: (12/20) | 7.79 s
    [Task  2/25]  Current/Best:   12.59/  20.23 GFLOPS | Progress: (16/20) | 10.88 s
    [Task  2/25]  Current/Best:    6.81/  20.23 GFLOPS | Progress: (20/20) | 12.38 s Done.
+
    [Task  3/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  3/25]  Current/Best:   22.86/  22.86 GFLOPS | Progress: (4/20) | 3.83 s
    [Task  3/25]  Current/Best:   10.21/  22.86 GFLOPS | Progress: (8/20) | 6.89 s
    [Task  3/25]  Current/Best:   19.23/  22.86 GFLOPS | Progress: (12/20) | 9.16 s
    [Task  3/25]  Current/Best:   18.91/  22.86 GFLOPS | Progress: (16/20) | 11.30 s
    [Task  3/25]  Current/Best:   14.48/  22.86 GFLOPS | Progress: (20/20) | 13.40 s Done.
+
    [Task  4/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  4/25]  Current/Best:   17.84/  17.84 GFLOPS | Progress: (4/20) | 3.40 s
    [Task  4/25]  Current/Best:   17.74/  17.84 GFLOPS | Progress: (8/20) | 5.67 s
    [Task  4/25]  Current/Best:   16.09/  18.75 GFLOPS | Progress: (12/20) | 16.70 s
    [Task  4/25]  Current/Best:    6.77/  18.75 GFLOPS | Progress: (16/20) | 21.63 s
    [Task  4/25]  Current/Best:    5.72/  18.75 GFLOPS | Progress: (20/20) | 24.79 s Done.
+
    [Task  5/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  5/25]  Current/Best:   20.55/  20.55 GFLOPS | Progress: (4/20) | 3.54 s
    [Task  5/25]  Current/Best:    4.55/  20.55 GFLOPS | Progress: (8/20) | 5.91 s
    [Task  5/25]  Current/Best:   13.91/  20.55 GFLOPS | Progress: (12/20) | 7.77 s
    [Task  5/25]  Current/Best:    5.43/  20.55 GFLOPS | Progress: (16/20) | 9.54 s
    [Task  5/25]  Current/Best:   15.40/  20.55 GFLOPS | Progress: (20/20) | 11.50 s Done.
+
    [Task  6/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  6/25]  Current/Best:   10.52/  15.47 GFLOPS | Progress: (4/20) | 8.76 s
    [Task  6/25]  Current/Best:   11.25/  20.98 GFLOPS | Progress: (8/20) | 11.50 s
    [Task  6/25]  Current/Best:   14.46/  20.98 GFLOPS | Progress: (12/20) | 13.76 s
    [Task  6/25]  Current/Best:   19.02/  20.98 GFLOPS | Progress: (16/20) | 17.44 s
    [Task  6/25]  Current/Best:   17.14/  20.98 GFLOPS | Progress: (20/20) | 20.74 s Done.
+
    [Task  7/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  7/25]  Current/Best:    9.02/  15.03 GFLOPS | Progress: (4/20) | 4.08 s
    [Task  7/25]  Current/Best:   11.50/  16.37 GFLOPS | Progress: (8/20) | 7.83 s
    [Task  7/25]  Current/Best:    6.30/  18.67 GFLOPS | Progress: (12/20) | 10.46 s
    [Task  7/25]  Current/Best:    6.08/  19.01 GFLOPS | Progress: (16/20) | 12.64 s
    [Task  7/25]  Current/Best:   12.66/  19.01 GFLOPS | Progress: (20/20) | 15.66 s Done.
+
    [Task  8/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  8/25]  Current/Best:    4.09/   7.55 GFLOPS | Progress: (4/20) | 14.25 s
    [Task  8/25]  Current/Best:    8.37/  15.64 GFLOPS | Progress: (8/20) | 17.27 s
    [Task  8/25]  Current/Best:   13.05/  15.64 GFLOPS | Progress: (12/20) | 20.61 s
    [Task  8/25]  Current/Best:    7.27/  15.64 GFLOPS | Progress: (16/20) | 25.36 s
    [Task  8/25]  Current/Best:    9.99/  15.83 GFLOPS | Progress: (20/20) | 29.82 s
    [Task  9/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task  9/25]  Current/Best:    4.76/  13.47 GFLOPS | Progress: (4/20) | 13.02 s
    [Task  9/25]  Current/Best:   13.14/  13.47 GFLOPS | Progress: (8/20) | 16.87 s
    [Task  9/25]  Current/Best:   21.84/  21.84 GFLOPS | Progress: (12/20) | 27.83 s
    [Task  9/25]  Current/Best:   13.27/  21.84 GFLOPS | Progress: (16/20) | 30.89 s
    [Task  9/25]  Current/Best:    6.66/  21.84 GFLOPS | Progress: (2
 0/20) | 41.99 s
    [Task 10/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s Done.
      Done.
-
    [Task 21/25]  Current/Best:   16.55/  16.55 GFLOPS | Progress: (4/20) | 4.32 s
    [Task 21/25]  Current/Best:    9.29/  19.00 GFLOPS | Progress: (8/20) | 6.33 s
    [Task 21/25]  Current/Best:   16.83/  19.00 GFLOPS | Progress: (12/20) | 8.44 s
    [Task 21/25]  Current/Best:    9.16/  19.00 GFLOPS | Progress: (16/20) | 10.98 s
    [Task 21/25]  Current/Best:   16.11/  19.00 GFLOPS | Progress: (20/20) | 13.47 s
    [Task 22/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 22/25]  Current/Best:   11.29/  11.29 GFLOPS | Progress: (4/20) | 4.28 s
    [Task 22/25]  Current/Best:    9.01/  11.88 GFLOPS | Progress: (8/20) | 6.64 s
    [Task 22/25]  Current/Best:   12.18/  19.60 GFLOPS | Progress: (12/20) | 9.99 s
    [Task 22/25]  Current/Best:    9.83/  19.60 GFLOPS | Progress: (16/20) | 11.87 s
    [Task 22/25]  Current/Best:    8.17/  19.60 GFLOPS | Progress: (20/20) | 14.69 s Done.
-
    [Task 23/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 23/25]  Current/Best:   12.54/  18.36 GFLOPS | Progress: (4/20) | 6.09 s
    [Task 23/25]  Current/Best:   22.42/  22.42 GFLOPS | Progress: (8/20) | 8.24 s
    [Task 23/25]  Current/Best:   10.21/  22.42 GFLOPS | Progress: (12/20) | 12.54 s
    [Task 23/25]  Current/Best:   16.97/  22.42 GFLOPS | Progress: (16/20) | 14.92 s
    [Task 23/25]  Current/Best:   11.51/  22.42 GFLOPS | Progress: (20/20) | 17.72 s Done.
-
    [Task 24/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 24/25]  Current/Best:    5.74/   9.20 GFLOPS | Progress: (4/20) | 3.63 s
    [Task 24/25]  Current/Best:    8.10/   9.20 GFLOPS | Progress: (8/20) | 14.29 s
    [Task 24/25]  Current/Best:    8.19/   9.20 GFLOPS | Progress: (12/20) | 22.22 s
    [Task 24/25]  Current/Best:    1.81/   9.20 GFLOPS | Progress: (16/20) | 33.21 s
    [Task 24/25]  Current/Best:    5.57/   9.20 GFLOPS | Progress: (20/20) | 44.40 s
    [Task 25/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s Done.
-     Done.
-
    [Task 25/25]  Current/Best:    3.85/   7.69 GFLOPS | Progress: (4/20) | 14.11 s
    [Task 25/25]  Current/Best:    1.54/   9.09 GFLOPS | Progress: (8/20) | 26.16 s
    [Task 25/25]  Current/Best:    8.55/   9.09 GFLOPS | Progress: (12/20) | 37.13 s
    [Task 25/25]  Current/Best:    2.63/   9.42 GFLOPS | Progress: (16/20) | 41.21 s
    [Task 25/25]  Current/Best:    5.76/   9.42 GFLOPS | Progress: (20/20) | 45.86 s
+
    [Task 10/25]  Current/Best:    6.16/  15.10 GFLOPS | Progress: (4/20) | 4.31 s
    [Task 10/25]  Current/Best:   17.50/  17.50 GFLOPS | Progress: (8/20) | 6.20 s
    [Task 10/25]  Current/Best:   17.78/  17.78 GFLOPS | Progress: (12/20) | 8.16 s
    [Task 10/25]  Current/Best:   13.52/  17.78 GFLOPS | Progress: (16/20) | 9.83 s
    [Task 10/25]  Current/Best:   12.86/  20.80 GFLOPS | Progress: (20/20) | 12.93 s Done.
+
    [Task 11/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 11/25]  Current/Best:   13.34/  13.34 GFLOPS | Progress: (4/20) | 4.29 s
    [Task 11/25]  Current/Best:   23.89/  23.89 GFLOPS | Progress: (8/20) | 8.21 s
    [Task 11/25]  Current/Best:   21.38/  23.89 GFLOPS | Progress: (12/20) | 10.93 s
    [Task 11/25]  Current/Best:   18.58/  23.89 GFLOPS | Progress: (16/20) | 13.28 s
    [Task 11/25]  Current/Best:   19.27/  23.89 GFLOPS | Progress: (20/20) | 15.30 s Done.
+
    [Task 12/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 12/25]  Current/Best:    9.43/   9.43 GFLOPS | Progress: (4/20) | 4.13 s
    [Task 12/25]  Current/Best:    4.49/  18.64 GFLOPS | Progress: (8/20) | 6.58 s
    [Task 12/25]  Current/Best:   13.86/  18.64 GFLOPS | Progress: (12/20) | 11.44 s
    [Task 12/25]  Current/Best:    6.63/  18.64 GFLOPS | Progress: (16/20) | 16.00 s
    [Task 12/25]  Current/Best:   11.30/  20.30 GFLOPS | Progress: (20/20) | 18.28 s Done.
+
    [Task 13/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 13/25]  Current/Best:    7.05/  17.50 GFLOPS | Progress: (4/20) | 5.02 s
    [Task 13/25]  Current/Best:   10.73/  17.50 GFLOPS | Progress: (8/20) | 9.43 s
    [Task 13/25]  Current/Best:   11.53/  17.50 GFLOPS | Progress: (12/20) | 12.64 s
    [Task 13/25]  Current/Best:   19.20/  19.69 GFLOPS | Progress: (16/20) | 15.65 s
    [Task 13/25]  Current/Best:   14.95/  19.69 GFLOPS | Progress: (20/20) | 18.77 s Done.
+
    [Task 14/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 14/25]  Current/Best:   15.89/  15.89 GFLOPS | Progress: (4/20) | 3.92 s
    [Task 14/25]  Current/Best:    2.98/  21.42 GFLOPS | Progress: (8/20) | 7.31 s
    [Task 14/25]  Current/Best:    3.60/  21.42 GFLOPS | Progress: (12/20) | 10.64 s
    [Task 14/25]  Current/Best:   17.30/  21.42 GFLOPS | Progress: (16/20) | 13.56 s
    [Task 14/25]  Current/Best:    8.49/  21.42 GFLOPS | Progress: (20/20) | 16.94 s
    [Task 15/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 15/25]  Current/Best:    9.06/   9.21 GFLOPS | Progress: (4/20) | 7.09 s
    [Task 15/25]  Current/Best:    6.62/  18.59 GFLOPS | Progress: (8/20) | 9.04 s
    [Task 15/25]  Current/Best:   10.67/  18.59 GFLOPS | Progress: (12/20) | 12.88 s Done.
+
    [Task 15/25]  Current/Best:   18.39/  18.59 GFLOPS | Progress: (16/20) | 14.75 s
    [Task 15/25]  Current/Best:   15.62/  18.59 GFLOPS | Progress: (20/20) | 17.92 s
    [Task 16/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 16/25]  Current/Best:   14.47/  14.47 GFLOPS | Progress: (4/20) | 5.64 s
    [Task 16/25]  Current/Best:   17.76/  18.56 GFLOPS | Progress: (8/20) | 7.15 s
    [Task 16/25]  Current/Best:    7.19/  18.56 GFLOPS | Progress: (12/20) | 8.89 s
    [Task 16/25]  Current/Best:   18.97/  18.97 GFLOPS | Progress: (16/20) | 10.64 s
    [Task 16/25]  Current/Best:   12.73/  19.52 GFLOPS | Progress: (20/20) | 12.21 s Done.
+
    [Task 17/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 17/25]  Current/Best:    9.37/  15.67 GFLOPS | Progress: (4/20) | 4.38 s
    [Task 17/25]  Current/Best:   11.58/  15.67 GFLOPS | Progress: (8/20) | 7.71 s
    [Task 17/25]  Current/Best:   12.25/  18.84 GFLOPS | Progress: (12/20) | 10.82 s
    [Task 17/25]  Current/Best:   12.30/  21.87 GFLOPS | Progress: (16/20) | 13.29 s
    [Task 17/25]  Current/Best:    5.35/  21.87 GFLOPS | Progress: (20/20) | 15.58 s Done.
+
    [Task 18/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 18/25]  Current/Best:    7.65/  17.56 GFLOPS | Progress: (4/20) | 4.38 s
    [Task 18/25]  Current/Best:   11.95/  17.56 GFLOPS | Progress: (8/20) | 7.44 s
    [Task 18/25]  Current/Best:   16.53/  17.56 GFLOPS | Progress: (12/20) | 10.53 s
    [Task 18/25]  Current/Best:   13.54/  18.41 GFLOPS | Progress: (16/20) | 18.58 s
    [Task 18/25]  Current/Best:   19.64/  19.64 GFLOPS | Progress: (20/20) | 20.50 s Done.
+
    [Task 19/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 19/25]  Current/Best:   11.70/  11.70 GFLOPS | Progress: (4/20) | 5.92 s
    [Task 19/25]  Current/Best:   18.82/  21.81 GFLOPS | Progress: (8/20) | 9.28 s
    [Task 19/25]  Current/Best:    4.75/  21.81 GFLOPS | Progress: (12/20) | 12.06 s
    [Task 19/25]  Current/Best:   18.34/  21.81 GFLOPS | Progress: (16/20) | 16.95 s
    [Task 19/25]  Current/Best:   12.93/  21.81 GFLOPS | Progress: (20/20) | 19.34 s Done.
+
    [Task 20/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 20/25]  Current/Best:   13.94/  14.10 GFLOPS | Progress: (4/20) | 5.87 s
    [Task 20/25]  Current/Best:    7.27/  16.16 GFLOPS | Progress: (8/20) | 11.42 s
    [Task 20/25]  Current/Best:    8.80/  16.16 GFLOPS | Progress: (12/20) | 14.17 s
    [Task 20/25]  Current/Best:    3.11/  16.16 GFLOPS | Progress: (16/20) | 18.37 s
    [Task 20/25]  Current/Best:   18.67/  18.67 GFLOPS | Progress: (20/20) | 20.64 s
    [Task 21/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 21/25]  Current/Best:    8.94/  14.20 GFLOPS | Progress: (4/20) | 5.30 s Done.
+
    [Task 21/25]  Current/Best:    9.35/  18.41 GFLOPS | Progress: (8/20) | 7.30 s
    [Task 21/25]  Current/Best:    5.28/  20.84 GFLOPS | Progress: (12/20) | 11.09 s
    [Task 21/25]  Current/Best:   20.64/  20.84 GFLOPS | Progress: (16/20) | 13.20 s
    [Task 21/25]  Current/Best:   16.39/  20.84 GFLOPS | Progress: (20/20) | 15.59 s
    [Task 22/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 22/25]  Current/Best:   18.08/  18.08 GFLOPS | Progress: (4/20) | 4.27 s
    [Task 22/25]  Current/Best:   16.42/  18.08 GFLOPS | Progress: (8/20) | 6.10 s
    [Task 22/25]  Current/Best:   23.08/  23.08 GFLOPS | Progress: (12/20) | 7.64 s
    [Task 22/25]  Current/Best:   19.59/  23.08 GFLOPS | Progress: (16/20) | 10.06 s
    [Task 22/25]  Current/Best:   20.46/  23.08 GFLOPS | Progress: (20/20) | 11.95 s Done.
+
    [Task 23/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 23/25]  Current/Best:   10.37/  11.29 GFLOPS | Progress: (4/20) | 6.43 s
    [Task 23/25]  Current/Best:   20.54/  20.54 GFLOPS | Progress: (8/20) | 11.59 s
    [Task 23/25]  Current/Best:    1.55/  20.54 GFLOPS | Progress: (12/20) | 15.29 s
    [Task 23/25]  Current/Best:   18.69/  20.54 GFLOPS | Progress: (16/20) | 17.98 s
    [Task 23/25]  Current/Best:   12.37/  20.54 GFLOPS | Progress: (20/20) | 21.66 s Done.
+
    [Task 24/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 24/25]  Current/Best:    2.32/   7.78 GFLOPS | Progress: (4/20) | 12.44 s
    [Task 24/25]  Current/Best:    1.87/   7.78 GFLOPS | Progress: (8/20) | 24.23 s
    [Task 24/25]  Current/Best:    2.51/   7.78 GFLOPS | Progress: (12/20) | 35.16 s
    [Task 24/25]  Current/Best:    3.49/  10.65 GFLOPS | Progress: (16/20) | 46.09 s
    [Task 24/25]  Current/Best:    5.80/  10.65 GFLOPS | Progress: (20/20) | 57.08 s
    [Task 25/25]  Current/Best:    0.00/   0.00 GFLOPS | Progress: (0/20) | 0.00 s
    [Task 25/25]  Current/Best:    6.38/   6.38 GFLOPS | Progress: (4/20) | 12.71 s
    [Task 25/25]  Current/Best:    3.29/   6.77 GFLOPS | Progress: (8/20) | 23.66 s
    [Task 25/25]  Current/Best:    8.13/   8.13 GFLOPS | Progress: (12/20) | 34.59 s
    [Task 25/25]  Current/Best:    9.42/   9.42 GFLOPS | Progress: (16/20) | 46.27 s
    [Task 25/25]  Current/Best:    5.41/   9.42 GFLOPS | Progress: (2
 0/20) | 48.45 s Done.
+
 
 
 
@@ -722,8 +721,8 @@ improvement in comparing the optimized model to the unoptimized model.
 
  .. code-block:: none
 
-    optimized: {'mean': 403.3998722799993, 'median': 401.76060245000826, 'std': 3.2980608059631606}
-    unoptimized: {'mean': 519.8183306999988, 'median': 519.4112837500029, 'std': 2.049956864285323}
+    optimized: {'mean': 423.93186388999766, 'median': 422.54658939998535, 'std': 3.325839859534913}
+    unoptimized: {'mean': 513.3865644599973, 'median': 513.2042234999972, 'std': 1.6686009692921546}
 
 
 
@@ -746,7 +745,7 @@ profiling/benchmarking.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 12 minutes  3.277 seconds)
+   **Total running time of the script:** ( 12 minutes  38.249 seconds)
 
 
 .. _sphx_glr_download_tutorial_autotvm_relay_x86.py:
diff --git a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
index 103cf8ccee..9221f3b18a 100644
--- a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
+++ b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
@@ -274,7 +274,7 @@ device and returns the measured cost. Network overhead is excluded.
 
  .. code-block:: none
 
-    1.233e-07 secs/op
+    1.327e-07 secs/op
 
 
 
diff --git a/docs/_sources/tutorial/intro_topi.rst.txt b/docs/_sources/tutorial/intro_topi.rst.txt
index 8906b0b312..1181364418 100644
--- a/docs/_sources/tutorial/intro_topi.rst.txt
+++ b/docs/_sources/tutorial/intro_topi.rst.txt
@@ -277,7 +277,7 @@ As you can see, scheduled stages of computation have been accumulated and we can
 
  .. code-block:: none
 
-    [stage(a, placeholder(a, 0x4c1e6b0)), stage(b, placeholder(b, 0x20938d60)), stage(T_add, compute(T_add, body=[a[ax0, ax1, ax2] + b[ax1, ax2]], axis=[T.iter_var(ax0, T.Range(0, 100), "DataPar", ""), T.iter_var(ax1, T.Range(0, 10), "DataPar", ""), T.iter_var(ax2, T.Range(0, 10), "DataPar", "")], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[a[ax0, ax1, ax2] * b[ax1, ax2]], axis=[T.iter_var(ax0, T.Range(0, 100), "DataPar", ""), T.iter_var(ax1, T. [...]
+    [stage(a, placeholder(a, 0x103435e0)), stage(b, placeholder(b, 0x11a3b500)), stage(T_add, compute(T_add, body=[a[ax0, ax1, ax2] + b[ax1, ax2]], axis=[T.iter_var(ax0, T.Range(0, 100), "DataPar", ""), T.iter_var(ax1, T.Range(0, 10), "DataPar", ""), T.iter_var(ax2, T.Range(0, 10), "DataPar", "")], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[a[ax0, ax1, ax2] * b[ax1, ax2]], axis=[T.iter_var(ax0, T.Range(0, 100), "DataPar", ""), T.iter_var(ax1, T [...]
 
 
 
diff --git a/docs/_sources/tutorial/sg_execution_times.rst.txt b/docs/_sources/tutorial/sg_execution_times.rst.txt
index b0057e388d..5286ec989e 100644
--- a/docs/_sources/tutorial/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorial/sg_execution_times.rst.txt
@@ -5,24 +5,24 @@
 
 Computation times
 =================
-**15:51.490** total execution time for **tutorial** files:
+**16:29.263** total execution time for **tutorial** files:
 
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``)                 | 12:03.277 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``)                 | 12:38.249 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:34.012 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:54.264 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)     | 01:00.514 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``)     | 01:00.856 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``)                 | 00:36.872 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``)                 | 00:35.163 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``)               | 00:34.299 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``)               | 00:19.165 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``)       | 00:01.465 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``)                               | 00:00.820 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``)                               | 00:00.857 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``)       | 00:00.592 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.193 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.153 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
 | :ref:`sphx_glr_tutorial_uma.py` (``uma.py``)                                             | 00:00.000 | 0.0 MB |
 +------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
index 2f37b5c5cc..1505c552b3 100644
--- a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
+++ b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
@@ -285,8 +285,8 @@ helper function to run a profile of the TVM generated code.
 
  .. code-block:: none
 
-    Numpy running time: 0.000009
-    naive: 0.000011
+    Numpy running time: 0.000007
+    naive: 0.000007
 
 
 
@@ -392,7 +392,7 @@ compile and run this new schedule with the parallel operation applied:
 
  .. code-block:: none
 
-    parallel: 0.000014
+    parallel: 0.000008
 
 
 
@@ -504,10 +504,10 @@ We can now compare the different schedules
  .. code-block:: none
 
                 Operator                  Timing             Performance
-                   numpy    9.116809999341057e-06                    1.0
-                   naive    1.1330299999999999e-05    1.2427921609443355
-                parallel    1.3904800000000001e-05    1.5251826023581723
-                  vector             2.46466e-05      2.7034236758012287
+                   numpy    7.005869997556147e-06                    1.0
+                   naive    6.710799999999999e-06     0.9578824617557739
+                parallel              7.8162e-06      1.1156644360695416
+                  vector             2.46033e-05      3.5118122386773307
 
 
 
@@ -928,7 +928,7 @@ matrix multiplication.
 
  .. code-block:: none
 
-    Numpy running time: 0.019289
+    Numpy running time: 0.017531
 
 
 
@@ -986,7 +986,7 @@ optimizations.
 
  .. code-block:: none
 
-    none: 3.290109
+    none: 3.438193
 
 
 
@@ -1086,7 +1086,7 @@ schedule.
 
  .. code-block:: none
 
-    blocking: 0.333122
+    blocking: 0.283883
 
 
 
@@ -1170,7 +1170,7 @@ already cache friendly from our previous optimizations.
 
  .. code-block:: none
 
-    vectorization: 0.354711
+    vectorization: 0.322069
     # from tvm.script import ir as I
     # from tvm.script import tir as T
 
@@ -1236,7 +1236,7 @@ more cache friendly.
 
  .. code-block:: none
 
-    loop permutation: 0.132614
+    loop permutation: 0.115929
     # from tvm.script import ir as I
     # from tvm.script import tir as T
 
@@ -1327,7 +1327,7 @@ optimized schedule.
 
  .. code-block:: none
 
-    array packing: 0.110053
+    array packing: 0.109654
     # from tvm.script import ir as I
     # from tvm.script import tir as T
 
@@ -1410,7 +1410,7 @@ to `C` when all the block results are ready.
 
  .. code-block:: none
 
-    block caching: 0.112351
+    block caching: 0.110210
     # from tvm.script import ir as I
     # from tvm.script import tir as T
 
@@ -1484,7 +1484,7 @@ of thread-level parallelization.
 
  .. code-block:: none
 
-    parallelization: 0.148527
+    parallelization: 0.145757
     # from tvm.script import ir as I
     # from tvm.script import tir as T
 
@@ -1554,13 +1554,13 @@ working, we can compare the results.
  .. code-block:: none
 
                 Operator                  Timing             Performance
-                    none            3.2901086747                     1.0
-                blocking            0.3331215811     0.10124941575991402
-           vectorization            0.3547107283     0.10781124983123647
-        loop permutation            0.1326141963    0.040306934941014394
-           array packing     0.11005252170000002     0.03344950960017601
-           block caching     0.11235119660000001     0.03414817190202523
-         parallelization     0.14852727970000001      0.0451435786428979
+                    none      3.4381934865000003                     1.0
+                blocking     0.28388305529999996     0.08256750424740819
+           vectorization     0.32206923719999997     0.09367397107364625
+        loop permutation     0.11592884019999998     0.03371795120175532
+           array packing            0.1096544594     0.03189304494658492
+           block caching     0.11021029550000001     0.03205471010655409
+         parallelization     0.14575665489999998    0.042393383464982594
 
 
 
@@ -1602,7 +1602,7 @@ the computation for specific platforms.
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  0.514 seconds)
+   **Total running time of the script:** ( 1 minutes  0.856 seconds)
 
 
 .. _sphx_glr_download_tutorial_tensor_expr_get_started.py:
diff --git a/docs/commit_hash b/docs/commit_hash
index d70c5c1c17..4e9e9b88d5 100644
--- a/docs/commit_hash
+++ b/docs/commit_hash
@@ -1 +1 @@
-95fa22308bd08e583dadb3ad429aa768ecce85c2
+c2cc01910c1b88ad2b593a138c8b984385d47db8
diff --git a/docs/how_to/compile_models/from_darknet.html b/docs/how_to/compile_models/from_darknet.html
index f2b8271cae..8ee0cebf79 100644
--- a/docs/how_to/compile_models/from_darknet.html
+++ b/docs/how_to/compile_models/from_darknet.html
@@ -585,7 +585,7 @@ class:[&#39;truck 0.9266&#39;] left:471 top:83 right:689 bottom:169
 class:[&#39;bicycle 0.9984&#39;] left:111 top:113 right:577 bottom:447
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  20.114 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  16.820 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-darknet-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7716f96385bd5abb6e822041e285be54/from_darknet.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_darknet.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/from_keras.html b/docs/how_to/compile_models/from_keras.html
index fd85b22844..c1c6b30ac2 100644
--- a/docs/how_to/compile_models/from_keras.html
+++ b/docs/how_to/compile_models/from_keras.html
@@ -506,7 +506,7 @@ Tensorflow is also required since it’s used as the default backend of keras.</
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Relay top-1 id: 285, class name: Egyptian cat
 
 1/1 [==============================] - ETA: 0s
-1/1 [==============================] - 1s 997ms/step
+1/1 [==============================] - 1s 944ms/step
 Keras top-1 id: 285, class name: Egyptian cat
 </pre></div>
 </div>
diff --git a/docs/how_to/compile_models/from_mxnet.html b/docs/how_to/compile_models/from_mxnet.html
index d99318fd06..08be0dc576 100644
--- a/docs/how_to/compile_models/from_mxnet.html
+++ b/docs/how_to/compile_models/from_mxnet.html
@@ -439,7 +439,7 @@
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;x&quot;</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#tuple" title="builtins.tuple" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">x</span><span class="o">.</span><span class="n">shape</span></a><span class="p">)</span>
 </pre></div>
 </div>
-<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipffcd1a4f-2ae9-47b0-a873-3c22a873fc38 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipe52823d6-67b8-4970-a66d-061c7030a3da from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
 x (1, 3, 224, 224)
 </pre></div>
 </div>
diff --git a/docs/how_to/compile_models/from_oneflow.html b/docs/how_to/compile_models/from_oneflow.html
index 057ee98333..7f2ebb664b 100644
--- a/docs/how_to/compile_models/from_oneflow.html
+++ b/docs/how_to/compile_models/from_oneflow.html
@@ -449,13 +449,13 @@ Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdo
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: &quot;https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip&quot; to /workspace/.oneflow/flowvision_cache/resnet18.zip
 
   0%|          | 0.00/41.5M [00:00&lt;?, ?B/s]
- 15%|#5        | 6.33M/41.5M [00:00&lt;00:00, 49.9MB/s]
- 27%|##6       | 11.1M/41.5M [00:00&lt;00:00, 39.5MB/s]
- 44%|####4     | 18.3M/41.5M [00:00&lt;00:00, 52.7MB/s]
- 58%|#####7    | 24.0M/41.5M [00:00&lt;00:00, 53.6MB/s]
- 77%|#######7  | 32.0M/41.5M [00:00&lt;00:00, 56.1MB/s]
- 96%|#########6| 40.0M/41.5M [00:00&lt;00:00, 60.5MB/s]
-100%|##########| 41.5M/41.5M [00:00&lt;00:00, 57.1MB/s]
+ 15%|#5        | 6.33M/41.5M [00:00&lt;00:00, 57.4MB/s]
+ 28%|##8       | 11.8M/41.5M [00:00&lt;00:00, 50.0MB/s]
+ 40%|####      | 16.6M/41.5M [00:00&lt;00:00, 42.4MB/s]
+ 54%|#####3    | 22.3M/41.5M [00:00&lt;00:00, 46.1MB/s]
+ 65%|######4   | 26.8M/41.5M [00:00&lt;00:00, 41.4MB/s]
+ 77%|#######7  | 32.0M/41.5M [00:00&lt;00:00, 39.2MB/s]
+100%|##########| 41.5M/41.5M [00:00&lt;00:00, 48.8MB/s]
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/compile_models/from_pytorch.html b/docs/how_to/compile_models/from_pytorch.html
index 7b5c15d872..ff1f9e4e76 100644
--- a/docs/how_to/compile_models/from_pytorch.html
+++ b/docs/how_to/compile_models/from_pytorch.html
@@ -432,11 +432,10 @@ be unstable.</p>
 Downloading: &quot;https://download.pytorch.org/models/resnet18-f37072fd.pth&quot; to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
 
   0%|          | 0.00/44.7M [00:00&lt;?, ?B/s]
- 25%|##4       | 11.0M/44.7M [00:00&lt;00:00, 116MB/s]
- 51%|#####     | 22.6M/44.7M [00:00&lt;00:00, 119MB/s]
- 76%|#######6  | 34.0M/44.7M [00:00&lt;00:00, 110MB/s]
-100%|#########9| 44.5M/44.7M [00:00&lt;00:00, 106MB/s]
-100%|##########| 44.7M/44.7M [00:00&lt;00:00, 109MB/s]
+ 18%|#8        | 8.12M/44.7M [00:00&lt;00:00, 71.4MB/s]
+ 42%|####2     | 18.9M/44.7M [00:00&lt;00:00, 94.2MB/s]
+ 72%|#######1  | 32.0M/44.7M [00:00&lt;00:00, 95.5MB/s]
+100%|##########| 44.7M/44.7M [00:00&lt;00:00, 104MB/s]
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/compile_models/from_tensorflow.html b/docs/how_to/compile_models/from_tensorflow.html
index f74ef1ae21..c07995ee55 100644
--- a/docs/how_to/compile_models/from_tensorflow.html
+++ b/docs/how_to/compile_models/from_tensorflow.html
@@ -649,7 +649,7 @@ banana (score = 0.00022)
 desk (score = 0.00019)
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  26.367 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  20.392 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-tensorflow-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7f1d3d1b878694c201c614c807cdebc8/from_tensorflow.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_tensorflow.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/sg_execution_times.html b/docs/how_to/compile_models/sg_execution_times.html
index 49780ca037..8783a01201 100644
--- a/docs/how_to/compile_models/sg_execution_times.html
+++ b/docs/how_to/compile_models/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-compile-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>06:42.659</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
+<p><strong>06:19.372</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 81%" />
@@ -349,43 +349,43 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></td>
-<td><p>01:26.367</p></td>
+<td><p>01:20.392</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></td>
-<td><p>01:20.114</p></td>
+<td><p>01:16.820</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_paddle.html#sphx-glr-how-to-compile-models-from-paddle-py"><span class="std std-ref">Compile PaddlePaddle Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_paddle.py</span></code>)</p></td>
-<td><p>00:56.607</p></td>
+<td><p>00:51.752</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_oneflow.html#sphx-glr-how-to-compile-models-from-oneflow-py"><span class="std std-ref">Compile OneFlow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_oneflow.py</span></code>)</p></td>
-<td><p>00:37.207</p></td>
+<td><p>00:35.713</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></td>
-<td><p>00:32.225</p></td>
+<td><p>00:30.081</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></td>
-<td><p>00:31.423</p></td>
+<td><p>00:29.950</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></td>
-<td><p>00:27.907</p></td>
+<td><p>00:27.477</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_pytorch.html#sphx-glr-how-to-compile-models-from-pytorch-py"><span class="std std-ref">Compile PyTorch Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_pytorch.py</span></code>)</p></td>
-<td><p>00:25.628</p></td>
+<td><p>00:24.333</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="from_keras.html#sphx-glr-how-to-compile-models-from-keras-py"><span class="std std-ref">Compile Keras Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_keras.py</span></code>)</p></td>
-<td><p>00:22.457</p></td>
+<td><p>00:20.219</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="from_onnx.html#sphx-glr-how-to-compile-models-from-onnx-py"><span class="std std-ref">Compile ONNX Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_onnx.py</span></code>)</p></td>
-<td><p>00:02.724</p></td>
+<td><p>00:02.635</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/deploy_models/deploy_model_on_adreno.html b/docs/how_to/deploy_models/deploy_model_on_adreno.html
index 7007e84b53..78402eb978 100644
--- a/docs/how_to/deploy_models/deploy_model_on_adreno.html
+++ b/docs/how_to/deploy_models/deploy_model_on_adreno.html
@@ -920,10 +920,10 @@ Top5 predictions:
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
- 3345.3108    3345.1042    3347.3534    3344.2457      0.9430
+ 3339.0634    3338.9141    3341.0578    3337.0339      1.2823
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  4.549 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  2.822 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-model-on-adreno-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/2387d8448da213eb625e6b3d916327d4/deploy_model_on_adreno.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_model_on_adreno.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_model_on_android.html b/docs/how_to/deploy_models/deploy_model_on_android.html
index 935181530a..2de454697c 100644
--- a/docs/how_to/deploy_models/deploy_model_on_android.html
+++ b/docs/how_to/deploy_models/deploy_model_on_android.html
@@ -662,7 +662,7 @@ to the remote android device.</p>
 Evaluate inference time cost...
 Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  16.3370      16.3538      16.5308      16.1540       0.1121
+  15.7339      15.7292      15.8864      15.6243       0.0913
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
index a50b976d01..52af1f16cf 100644
--- a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
+++ b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
@@ -454,24 +454,28 @@ be unstable.</p>
 Downloading: &quot;https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth&quot; to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
 
   0%|          | 0.00/170M [00:00&lt;?, ?B/s]
-  5%|5         | 9.27M/170M [00:00&lt;00:01, 97.2MB/s]
- 11%|#         | 18.5M/170M [00:00&lt;00:02, 73.6MB/s]
- 15%|#5        | 25.9M/170M [00:00&lt;00:02, 65.8MB/s]
- 20%|##        | 34.1M/170M [00:00&lt;00:02, 64.2MB/s]
- 25%|##4       | 42.1M/170M [00:00&lt;00:02, 63.9MB/s]
- 33%|###2      | 56.0M/170M [00:00&lt;00:01, 72.6MB/s]
- 41%|####1     | 69.9M/170M [00:00&lt;00:01, 90.8MB/s]
- 47%|####6     | 79.1M/170M [00:01&lt;00:01, 86.7MB/s]
- 52%|#####1    | 88.0M/170M [00:01&lt;00:00, 88.7MB/s]
- 57%|#####6    | 96.8M/170M [00:01&lt;00:01, 76.5MB/s]
- 64%|######3   | 108M/170M [00:01&lt;00:00, 86.7MB/s]
- 71%|#######   | 120M/170M [00:01&lt;00:00, 86.4MB/s]
- 77%|#######6  | 130M/170M [00:01&lt;00:00, 90.4MB/s]
- 82%|########1 | 139M/170M [00:01&lt;00:00, 69.0MB/s]
- 87%|########6 | 147M/170M [00:01&lt;00:00, 73.1MB/s]
- 93%|#########3| 158M/170M [00:02&lt;00:00, 82.6MB/s]
- 98%|#########8| 167M/170M [00:02&lt;00:00, 81.9MB/s]
-100%|##########| 170M/170M [00:02&lt;00:00, 78.5MB/s]
+  4%|3         | 6.30M/170M [00:00&lt;00:02, 66.0MB/s]
+  7%|7         | 12.6M/170M [00:00&lt;00:02, 56.4MB/s]
+ 11%|#         | 18.1M/170M [00:00&lt;00:03, 43.0MB/s]
+ 14%|#4        | 24.0M/170M [00:00&lt;00:03, 44.6MB/s]
+ 19%|#8        | 32.0M/170M [00:00&lt;00:02, 52.8MB/s]
+ 24%|##3       | 40.0M/170M [00:00&lt;00:02, 47.4MB/s]
+ 28%|##8       | 48.0M/170M [00:01&lt;00:02, 48.2MB/s]
+ 36%|###6      | 61.5M/170M [00:01&lt;00:01, 69.1MB/s]
+ 42%|####2     | 72.0M/170M [00:01&lt;00:01, 70.4MB/s]
+ 47%|####7     | 80.0M/170M [00:01&lt;00:01, 72.1MB/s]
+ 52%|#####1    | 88.0M/170M [00:01&lt;00:01, 71.8MB/s]
+ 56%|#####6    | 95.2M/170M [00:01&lt;00:01, 64.8MB/s]
+ 60%|#####9    | 102M/170M [00:01&lt;00:01, 64.8MB/s]
+ 64%|######3   | 108M/170M [00:01&lt;00:01, 50.6MB/s]
+ 68%|######8   | 116M/170M [00:02&lt;00:00, 57.4MB/s]
+ 72%|#######1  | 122M/170M [00:02&lt;00:00, 56.3MB/s]
+ 75%|#######5  | 128M/170M [00:02&lt;00:00, 50.2MB/s]
+ 80%|########  | 136M/170M [00:02&lt;00:00, 58.2MB/s]
+ 89%|########8 | 150M/170M [00:02&lt;00:00, 81.4MB/s]
+ 94%|#########3| 159M/170M [00:02&lt;00:00, 74.5MB/s]
+ 98%|#########8| 167M/170M [00:02&lt;00:00, 62.8MB/s]
+100%|##########| 170M/170M [00:02&lt;00:00, 60.7MB/s]
 /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/nn/functional.py:3897: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
   for i in range(dim)
 /venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/detection/anchor_utils.py:124: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the &#39;trunc&#39; function NOT &#39;floor&#39;). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode=&#39;trunc&#39;), or for actual floor division, use torch.div(a, b, rounding_mode=& [...]
@@ -569,7 +573,7 @@ torchvision rcnn models.</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Get 9 valid boxes
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  43.340 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  22.809 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-object-detection-pytorch-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7795da4b258c8feff986668b95ef57ad/deploy_object_detection_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_object_detection_pytorch.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized.html b/docs/how_to/deploy_models/deploy_prequantized.html
index 4b6bb33200..c3b9b08abb 100644
--- a/docs/how_to/deploy_models/deploy_prequantized.html
+++ b/docs/how_to/deploy_models/deploy_prequantized.html
@@ -495,9 +495,8 @@ training. Other models require a full post training calibration.</p>
 Downloading: &quot;https://download.pytorch.org/models/mobilenet_v2-b0353104.pth&quot; to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
 
   0%|          | 0.00/13.6M [00:00&lt;?, ?B/s]
- 59%|#####8    | 7.99M/13.6M [00:00&lt;00:00, 47.4MB/s]
- 92%|#########2| 12.5M/13.6M [00:00&lt;00:00, 45.0MB/s]
-100%|##########| 13.6M/13.6M [00:00&lt;00:00, 48.3MB/s]
+ 59%|#####8    | 7.99M/13.6M [00:00&lt;00:00, 72.2MB/s]
+100%|##########| 13.6M/13.6M [00:00&lt;00:00, 86.3MB/s]
 </pre></div>
 </div>
 </div>
@@ -588,7 +587,7 @@ output values are identical out of 1000 outputs from mobilenet v2.</p>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  90.5697      90.4384      95.3423      90.2346       0.6346
+  90.0964      89.9840      95.3425      89.8630       0.5670
 </pre></div>
 </div>
 <div class="admonition note">
@@ -627,7 +626,7 @@ This includes support for the VNNI 8 bit dot product instruction (CascadeLake or
 <div class="section" id="deploy-a-quantized-tflite-model">
 <h2>Deploy a quantized TFLite Model<a class="headerlink" href="#deploy-a-quantized-tflite-model" title="Permalink to this headline">¶</a></h2>
 <p>TODO</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  17.620 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  12.482 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/fb8217c13f4351224c6cf3aacf1a87fc/deploy_prequantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized_tflite.html b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
index 7cfb863049..fc94c34154 100644
--- a/docs/how_to/deploy_models/deploy_prequantized_tflite.html
+++ b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
@@ -580,7 +580,7 @@ TFLite Top-5 labels: [387 102 386 341 349]
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
  mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)
-  121.5602     121.5118     123.0774     120.6707      0.4584
+  118.2268     118.0605     120.5108     116.8578      0.7670
 </pre></div>
 </div>
 <div class="admonition note">
@@ -608,7 +608,7 @@ network for ARM CPU</span></a>.</p></li>
 </ul>
 </div></blockquote>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  36.672 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  36.205 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-tflite-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/56691c7a27d45da61d112276334640d3/deploy_prequantized_tflite.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized_tflite.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_quantized.html b/docs/how_to/deploy_models/deploy_quantized.html
index 32e27892e1..70d9890050 100644
--- a/docs/how_to/deploy_models/deploy_quantized.html
+++ b/docs/how_to/deploy_models/deploy_quantized.html
@@ -521,7 +521,7 @@ for calibration. But the accuracy might be impacted.</p>
   DeprecationWarning,
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  39.242 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  36.839 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-quantized-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/7810ecf51bfc05f7d5e8a400ac3e815d/deploy_quantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_quantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
index d40c10bfd3..7d93646b4c 100644
--- a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
+++ b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
@@ -463,25 +463,23 @@ to your device.</p>
 Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
 
   0%|          | 0/132723 [00:00&lt;?, ?KB/s]
-  4%|4         | 5406/132723 [00:00&lt;00:02, 54053.61KB/s]
-  9%|9         | 12568/132723 [00:00&lt;00:01, 64303.17KB/s]
- 15%|#5        | 19913/132723 [00:00&lt;00:01, 68474.63KB/s]
- 20%|##        | 27147/132723 [00:00&lt;00:01, 69997.59KB/s]
- 26%|##5       | 34473/132723 [00:00&lt;00:01, 71127.78KB/s]
- 31%|###1      | 41630/132723 [00:00&lt;00:01, 71276.62KB/s]
- 37%|###6      | 48911/132723 [00:00&lt;00:01, 71771.12KB/s]
- 42%|####2     | 56125/132723 [00:00&lt;00:01, 71885.15KB/s]
- 48%|####7     | 63353/132723 [00:00&lt;00:00, 72006.43KB/s]
- 53%|#####3    | 70562/132723 [00:01&lt;00:00, 72028.11KB/s]
- 59%|#####8    | 77765/132723 [00:01&lt;00:00, 71671.32KB/s]
- 64%|######4   | 85525/132723 [00:01&lt;00:00, 73465.63KB/s]
- 70%|#######   | 93176/132723 [00:01&lt;00:00, 74383.10KB/s]
- 76%|#######6  | 100953/132723 [00:01&lt;00:00, 75402.56KB/s]
- 82%|########1 | 108602/132723 [00:01&lt;00:00, 75728.05KB/s]
- 88%|########7 | 116261/132723 [00:01&lt;00:00, 75984.24KB/s]
- 93%|#########3| 124018/132723 [00:01&lt;00:00, 76312.12KB/s]
-100%|#########9| 132220/132723 [00:01&lt;00:00, 78022.00KB/s]
-100%|##########| 132723/132723 [00:01&lt;00:00, 73286.92KB/s]
+  5%|4         | 6432/132723 [00:00&lt;00:01, 64315.00KB/s]
+ 11%|#         | 14292/132723 [00:00&lt;00:01, 72714.86KB/s]
+ 17%|#6        | 22172/132723 [00:00&lt;00:01, 75485.50KB/s]
+ 23%|##2       | 30013/132723 [00:00&lt;00:01, 76634.87KB/s]
+ 28%|##8       | 37677/132723 [00:00&lt;00:01, 75788.39KB/s]
+ 34%|###4      | 45625/132723 [00:00&lt;00:01, 77028.58KB/s]
+ 40%|####      | 53568/132723 [00:00&lt;00:01, 77807.97KB/s]
+ 46%|####6     | 61505/132723 [00:00&lt;00:00, 78300.89KB/s]
+ 52%|#####2    | 69457/132723 [00:00&lt;00:00, 78677.13KB/s]
+ 58%|#####8    | 77480/132723 [00:01&lt;00:00, 79153.48KB/s]
+ 64%|######4   | 85408/132723 [00:01&lt;00:00, 79190.85KB/s]
+ 70%|#######   | 93328/132723 [00:01&lt;00:00, 79153.56KB/s]
+ 76%|#######6  | 101256/132723 [00:01&lt;00:00, 79190.82KB/s]
+ 82%|########2 | 109236/132723 [00:01&lt;00:00, 79371.81KB/s]
+ 88%|########8 | 117211/132723 [00:01&lt;00:00, 79483.38KB/s]
+ 94%|#########4| 125228/132723 [00:01&lt;00:00, 79688.28KB/s]
+100%|##########| 132723/132723 [00:01&lt;00:00, 78195.11KB/s]
 </pre></div>
 </div>
 <p>Create TVM runtime and do inference
@@ -520,7 +518,7 @@ Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from h
 <span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
 </pre></div>
 </div>
-<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  42.908 seconds)</p>
+<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  30.899 seconds)</p>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-ssd-gluoncv-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/cccb17d28e5e8b2e94ea8cd5ec59f6ed/deploy_ssd_gluoncv.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_ssd_gluoncv.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/sg_execution_times.html b/docs/how_to/deploy_models/sg_execution_times.html
index 53b0b1f30a..4fae741733 100644
--- a/docs/how_to/deploy_models/sg_execution_times.html
+++ b/docs/how_to/deploy_models/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-deploy-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>15:45.104</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
+<p><strong>14:56.331</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 86%" />
@@ -348,40 +348,40 @@
 <col style="width: 6%" />
 </colgroup>
 <tbody>
-<tr class="row-odd"><td><p><a class="reference internal" href="deploy_object_detection_pytorch.html#sphx-glr-how-to-deploy-models-deploy-object-detection-pytorch-py"><span class="std std-ref">Compile PyTorch Object Detection Models</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_object_detection_pytorch.py</span></code>)</p></td>
-<td><p>03:43.340</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="deploy_ssd_gluoncv.html#sphx-glr-how-to-deploy-models-deploy-ssd-gluoncv-py"><span class="std std-ref">Deploy Single Shot Multibox Detector(SSD) model</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_ssd_gluoncv.py</span></code>)</p></td>
+<td><p>03:30.899</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="deploy_ssd_gluoncv.html#sphx-glr-how-to-deploy-models-deploy-ssd-gluoncv-py"><span class="std std-ref">Deploy Single Shot Multibox Detector(SSD) model</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_ssd_gluoncv.py</span></code>)</p></td>
-<td><p>03:42.908</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="deploy_object_detection_pytorch.html#sphx-glr-how-to-deploy-models-deploy-object-detection-pytorch-py"><span class="std std-ref">Compile PyTorch Object Detection Models</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_object_detection_pytorch.py</span></code>)</p></td>
+<td><p>03:22.809</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized_tflite.html#sphx-glr-how-to-deploy-models-deploy-prequantized-tflite-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized_tflite.py</span></code>)</p></td>
-<td><p>02:36.672</p></td>
+<td><p>02:36.205</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_quantized.html#sphx-glr-how-to-deploy-models-deploy-quantized-py"><span class="std std-ref">Deploy a Quantized Model on Cuda</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_quantized.py</span></code>)</p></td>
-<td><p>01:39.242</p></td>
+<td><p>01:36.839</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized.html#sphx-glr-how-to-deploy-models-deploy-prequantized-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized.py</span></code>)</p></td>
-<td><p>01:17.620</p></td>
+<td><p>01:12.482</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_adreno.html#sphx-glr-how-to-deploy-models-deploy-model-on-adreno-py"><span class="std std-ref">Deploy the Pretrained Model on Adreno</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_adreno.py</span></code>)</p></td>
-<td><p>01:04.549</p></td>
+<td><p>01:02.822</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_model_on_android.html#sphx-glr-how-to-deploy-models-deploy-model-on-android-py"><span class="std std-ref">Deploy the Pretrained Model on Android</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_android.py</span></code>)</p></td>
-<td><p>00:43.076</p></td>
+<td><p>00:39.952</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_nano.html#sphx-glr-how-to-deploy-models-deploy-model-on-nano-py"><span class="std std-ref">Deploy the Pretrained Model on Jetson Nano</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_nano.py</span></code>)</p></td>
-<td><p>00:29.035</p></td>
+<td><p>00:27.314</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="deploy_model_on_rasp.html#sphx-glr-how-to-deploy-models-deploy-model-on-rasp-py"><span class="std std-ref">Deploy the Pretrained Model on Raspberry Pi</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_rasp.py</span></code>)</p></td>
-<td><p>00:28.657</p></td>
+<td><p>00:27.002</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="deploy_sparse.html#sphx-glr-how-to-deploy-models-deploy-sparse-py"><span class="std std-ref">Deploy a Hugging Face Pruned Model on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_sparse.py</span></code>)</p></td>
diff --git a/docs/how_to/extend_tvm/bring_your_own_datatypes.html b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
index 2d294f78c8..8560b47ee3 100644
--- a/docs/how_to/extend_tvm/bring_your_own_datatypes.html
+++ b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
@@ -619,7 +619,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
 <span class="n">module</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#dict" title="builtins.dict" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">params</span></a> <span class="o">=</span> <span class="n">get_mobilenet</span><span class="p">()</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip6daa7b75-ff03-48fa-9986-16dd6f921aee from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip4e80f2d3-e9d7-4777-a3fc-910c963a2b33 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
 </pre></div>
 </div>
 <p>It’s easy to execute MobileNet with native TVM:</p>
diff --git a/docs/how_to/extend_tvm/index.html b/docs/how_to/extend_tvm/index.html
index d6bf0da968..0b0d4d625d 100644
--- a/docs/how_to/extend_tvm/index.html
+++ b/docs/how_to/extend_tvm/index.html
@@ -49,7 +49,7 @@
     <link rel="index" title="Index" href="../../genindex.html" />
     <link rel="search" title="Search" href="../../search.html" />
     <link rel="next" title="Writing a Customized Pass" href="low_level_custom_pass.html" />
-    <link rel="prev" title="Executing a Tiny Model with TVMC Micro" href="../work_with_microtvm/micro_tvmc.html" /> 
+    <link rel="prev" title="8. Creating Your MLPerfTiny Submission with microTVM" href="../work_with_microtvm/micro_mlperftiny.html" /> 
 </head>
 
 <body class="wy-body-for-nav">
@@ -398,7 +398,7 @@ extended.</p>
         <a href="low_level_custom_pass.html" class="btn btn-neutral float-right" title="Writing a Customized Pass" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
       
       
-        <a href="../work_with_microtvm/micro_tvmc.html" class="btn btn-neutral float-left" title="Executing a Tiny Model with TVMC Micro" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
+        <a href="../work_with_microtvm/micro_mlperftiny.html" class="btn btn-neutral float-left" title="8. Creating Your MLPerfTiny Submission with microTVM" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
       
     </div>
 
diff --git a/docs/how_to/extend_tvm/sg_execution_times.html b/docs/how_to/extend_tvm/sg_execution_times.html
index 8dee3f00a4..26790a6210 100644
--- a/docs/how_to/extend_tvm/sg_execution_times.html
+++ b/docs/how_to/extend_tvm/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-extend-tvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:56.270</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
+<p><strong>00:51.788</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 84%" />
@@ -349,19 +349,19 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="bring_your_own_datatypes.html#sphx-glr-how-to-extend-tvm-bring-your-own-datatypes-py"><span class="std std-ref">Bring Your Own Datatypes to TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">bring_your_own_datatypes.py</span></code>)</p></td>
-<td><p>00:52.287</p></td>
+<td><p>00:48.088</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="use_pass_instrument.html#sphx-glr-how-to-extend-tvm-use-pass-instrument-py"><span class="std std-ref">How to Use TVM Pass Instrument</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_instrument.py</span></code>)</p></td>
-<td><p>00:02.848</p></td>
+<td><p>00:02.643</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="use_pass_infra.html#sphx-glr-how-to-extend-tvm-use-pass-infra-py"><span class="std std-ref">How to Use TVM Pass Infra</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_infra.py</span></code>)</p></td>
-<td><p>00:01.128</p></td>
+<td><p>00:01.049</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="low_level_custom_pass.html#sphx-glr-how-to-extend-tvm-low-level-custom-pass-py"><span class="std std-ref">Writing a Customized Pass</span></a> (<code class="docutils literal notranslate"><span class="pre">low_level_custom_pass.py</span></code>)</p></td>
-<td><p>00:00.007</p></td>
+<td><p>00:00.008</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/extend_tvm/use_pass_instrument.html b/docs/how_to/extend_tvm/use_pass_instrument.html
index 2aafdb5a71..9d8d8399a8 100644
--- a/docs/how_to/extend_tvm/use_pass_instrument.html
+++ b/docs/how_to/extend_tvm/use_pass_instrument.html
@@ -526,10 +526,10 @@ profile the execution time of each passes.</p>
 </pre></div>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 21703us [21703us] (47.86%; 47.86%)
-FoldScaleAxis: 23641us [10us] (52.14%; 52.14%)
-        FoldConstant: 23631us [1853us] (52.11%; 99.96%)
-                InferType: 21778us [21778us] (48.03%; 92.16%)
+InferType: 21097us [21097us] (48.81%; 48.81%)
+FoldScaleAxis: 22125us [6us] (51.19%; 51.19%)
+        FoldConstant: 22118us [1665us] (51.17%; 99.97%)
+                InferType: 20454us [20454us] (47.32%; 92.47%)
 </pre></div>
 </div>
 </div>
@@ -551,10 +551,10 @@ Refer to following sections and <a class="reference internal" href="../../refere
 </pre></div>
 </div>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 21797us [21797us] (48.65%; 48.65%)
-FoldScaleAxis: 23003us [8us] (51.35%; 51.35%)
-        FoldConstant: 22995us [1841us] (51.33%; 99.97%)
-                InferType: 21155us [21155us] (47.22%; 92.00%)
+InferType: 20549us [20549us] (48.45%; 48.45%)
+FoldScaleAxis: 21861us [5us] (51.55%; 51.55%)
+        FoldConstant: 21856us [1662us] (51.54%; 99.98%)
+                InferType: 20194us [20194us] (47.62%; 92.40%)
 </pre></div>
 </div>
 <p>Register empty list to clear existing instruments.</p>
diff --git a/docs/how_to/optimize_operators/opt_conv_cuda.html b/docs/how_to/optimize_operators/opt_conv_cuda.html
index b8da91d8c0..4dff41c2dd 100644
--- a/docs/how_to/optimize_operators/opt_conv_cuda.html
+++ b/docs/how_to/optimize_operators/opt_conv_cuda.html
@@ -575,7 +575,7 @@ latency of convolution.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Convolution: </span><span class="si">%f</span><span class="s2"> ms&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">b</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">*</span> <span cl [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 52.344928 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 49.075649 ms
 </pre></div>
 </div>
 <div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-optimize-operators-opt-conv-cuda-py">
diff --git a/docs/how_to/optimize_operators/opt_conv_tensorcore.html b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
index 4cf39333c3..eb48dfe2cb 100644
--- a/docs/how_to/optimize_operators/opt_conv_tensorcore.html
+++ b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
@@ -867,7 +867,7 @@ be able to run on our build server</p>
     <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;conv2d with tensor core: </span><span class="si">%f</span><span class="s2"> ms&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">* [...]
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 7.066883 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 10.195248 ms
 </pre></div>
 </div>
 </div>
diff --git a/docs/how_to/optimize_operators/opt_gemm.html b/docs/how_to/optimize_operators/opt_gemm.html
index 626e9b6c58..032592005a 100644
--- a/docs/how_to/optimize_operators/opt_gemm.html
+++ b/docs/how_to/optimize_operators/opt_gemm.html
@@ -472,8 +472,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Baseline: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018969
-Baseline: 3.226941
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.017991
+Baseline: 3.421825
 </pre></div>
 </div>
 <p>In TVM, we can always inspect lower level IR to debug or optimize our schedule.
@@ -532,7 +532,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt1: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.314088
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.298870
 </pre></div>
 </div>
 <p>Here is the generated IR after blocking.</p>
@@ -589,7 +589,7 @@ vastly.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt2: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.343279
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.335195
 </pre></div>
 </div>
 <p>Here is the generated IR after vectorization.</p>
@@ -644,7 +644,7 @@ the access pattern for A matrix is more cache friendly.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt3: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.132868
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.114142
 </pre></div>
 </div>
 <p>Here is the generated IR after loop permutation.</p>
@@ -721,7 +721,7 @@ flattening.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt4: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.109901
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.110151
 </pre></div>
 </div>
 <p>Here is the generated IR after array packing.</p>
@@ -799,7 +799,7 @@ write to C when all the block results are ready.</p>
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt5: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111493
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111765
 </pre></div>
 </div>
 <p>Here is the generated IR after blocking.</p>
@@ -879,7 +879,7 @@ class Module:
 <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Opt6: </span><span class="si">%f</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">opt6_time</span><span class="p">)</span>
 </pre></div>
 </div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.148603
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.146912
 </pre></div>
 </div>
 <p>Here is the generated IR after parallelization.</p>
diff --git a/docs/how_to/optimize_operators/sg_execution_times.html b/docs/how_to/optimize_operators/sg_execution_times.html
index 432dac0fbb..cf595ac57f 100644
--- a/docs/how_to/optimize_operators/sg_execution_times.html
+++ b/docs/how_to/optimize_operators/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-optimize-operators-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:35.066</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
+<p><strong>00:34.796</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 83%" />
@@ -349,7 +349,7 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="opt_gemm.html#sphx-glr-how-to-optimize-operators-opt-gemm-py"><span class="std std-ref">How to optimize GEMM on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_gemm.py</span></code>)</p></td>
-<td><p>00:32.429</p></td>
+<td><p>00:32.247</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="opt_conv_tensorcore.html#sphx-glr-how-to-optimize-operators-opt-conv-tensorcore-py"><span class="std std-ref">How to optimize convolution using TensorCores</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_tensorcore.py</span></code>)</p></td>
@@ -357,7 +357,7 @@
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="opt_conv_cuda.html#sphx-glr-how-to-optimize-operators-opt-conv-cuda-py"><span class="std std-ref">How to optimize convolution on GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_cuda.py</span></code>)</p></td>
-<td><p>00:01.153</p></td>
+<td><p>00:01.065</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
index 9d0728a25e..eb0b14acad 100644
--- a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
+++ b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
@@ -340,7 +340,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-how-to-tune-with-autoscheduler-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>09:49.275</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
+<p><strong>09:15.573</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 85%" />
@@ -349,27 +349,27 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_layer_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py"><span class="std std-ref">Auto-scheduling a Convolution Layer for GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_layer_cuda.py</span></code>)</p></td>
-<td><p>05:49.007</p></td>
+<td><p>05:35.959</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_network_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-x86-py"><span class="std std-ref">Auto-scheduling a Neural Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_x86.py</span></code>)</p></td>
-<td><p>01:42.967</p></td>
+<td><p>01:38.441</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py"><span class="std std-ref">Auto-scheduling a Neural Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_cuda.py</span></code>)</p></td>
-<td><p>01:07.441</p></td>
+<td><p>01:05.335</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_sparse_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-sparse-x86-py"><span class="std std-ref">Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_sparse_x86.py</span></code>)</p></td>
-<td><p>00:41.045</p></td>
+<td><p>00:29.087</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="tune_network_arm.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-arm-py"><span class="std std-ref">Auto-scheduling a Neural Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_arm.py</span></code>)</p></td>
-<td><p>00:15.132</p></td>
+<td><p>00:13.906</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="tune_network_mali.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-mali-py"><span class="std std-ref">Auto-scheduling a Neural Network for mali GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_mali.py</span></code>)</p></td>
-<td><p>00:13.683</p></td>
+<td><p>00:12.845</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
index 92bba6e1f5..253a4f8f31 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
@@ -506,787 +506,1119 @@ class Module:
     def main(data: T.Buffer((1, 512, 7, 7), &quot;float32&quot;), kernel: T.Buffer((512, 512, 3, 3), &quot;float32&quot;), bias: T.Buffer((1, 512, 1, 1), &quot;float32&quot;), compute: T.Buffer((1, 512, 7, 7), &quot;float32&quot;)):
         T.func_attr({&quot;from_legacy_te_schedule&quot;: True, &quot;global_symbol&quot;: &quot;main&quot;, &quot;tir.noalias&quot;: True})
         blockIdx_x = T.env_thread(&quot;blockIdx.x&quot;)
-        T.launch_thread(blockIdx_x, 28)
-        conv2d_nchw = T.allocate([8], &quot;float32&quot;, &quot;local&quot;)
-        pad_temp_shared = T.allocate([216], &quot;float32&quot;, &quot;shared&quot;)
-        kernel_shared = T.allocate([9216], &quot;float32&quot;, &quot;shared&quot;)
+        T.launch_thread(blockIdx_x, 32)
+        conv2d_nchw = T.allocate([14], &quot;float32&quot;, &quot;local&quot;)
+        pad_temp_shared = T.allocate([648], &quot;float32&quot;, &quot;shared&quot;)
... 9739 lines suppressed ...